From f7a7a1b173a1c70c415e989f979c344bced3016b Mon Sep 17 00:00:00 2001
From: IRBorisov <8611739+IRBorisov@users.noreply.github.com>
Date: Sun, 24 Sep 2023 19:08:17 +0300
Subject: [PATCH] Add backend support for text parsing

---
Review notes (free text after the separator, not part of the commit message):
- Routes func/parse-expression, func/to-ascii and func/to-math move to the
  rslang/ prefix; tests are updated to the new paths.
- New POST views inflect, generate_lexeme and parse_text are routed under
  cctext/ and call cctext.inflect, cctext.generate_lexeme and cctext.parse.
- cctext.get_all_forms is renamed to generate_lexeme; the text of each form
  is now produced by model.inflect(form.tag.grammemes) instead of form.word.
- MultiFormSerializer docstring reworded: it wraps a list of wordforms and
  is used as the generate_lexeme response, not as an inflect request.
- NOTE(review): conceptapi.py gains a commented-out parse_variants stub;
  confirm whether it should be implemented or dropped.

 rsconcept/backend/apps/rsform/serializers.py | 24 +++++++
 .../backend/apps/rsform/tests/t_views.py     | 52 +++++++++++---
 rsconcept/backend/apps/rsform/urls.py        | 12 +++-
 rsconcept/backend/apps/rsform/views.py       | 71 ++++++++++++++++++-
 rsconcept/backend/cctext/__init__.py         |  2 +-
 rsconcept/backend/cctext/conceptapi.py       | 11 ++-
 .../backend/cctext/tests/t_conceptapi.py     | 11 ++-
 rsconcept/backend/project/urls.py            |  1 -
 8 files changed, 162 insertions(+), 22 deletions(-)

diff --git a/rsconcept/backend/apps/rsform/serializers.py b/rsconcept/backend/apps/rsform/serializers.py
index 1cf8573c..72b380e6 100644
--- a/rsconcept/backend/apps/rsform/serializers.py
+++ b/rsconcept/backend/apps/rsform/serializers.py
@@ -27,6 +27,30 @@ class ExpressionSerializer(serializers.Serializer):
     expression = serializers.CharField()
 
 
+class WordFormSerializer(serializers.Serializer):
+    ''' Serializer: inflect request. '''
+    text = serializers.CharField()
+    grams = serializers.CharField()
+
+
+class MultiFormSerializer(serializers.Serializer):
+    ''' Serializer: list of wordforms. '''
+    items = serializers.ListField(
+        child=WordFormSerializer()
+    )
+
+    @staticmethod
+    def from_list(data: list[tuple[str, str]]) -> dict:
+        result: dict = {}
+        result['items'] = []
+        for item in data:
+            result['items'].append({
+                'text': item[0],
+                'grams': item[1]
+            })
+        return result
+
+
 class TextSerializer(serializers.Serializer):
     ''' Serializer: Text with references. '''
     text = serializers.CharField()
diff --git a/rsconcept/backend/apps/rsform/tests/t_views.py b/rsconcept/backend/apps/rsform/tests/t_views.py
index 0cc89ff3..b55d077e 100644
--- a/rsconcept/backend/apps/rsform/tests/t_views.py
+++ b/rsconcept/backend/apps/rsform/tests/t_views.py
@@ -6,14 +6,17 @@ from zipfile import ZipFile
 from rest_framework.test import APITestCase, APIRequestFactory, APIClient
 from rest_framework.exceptions import ErrorDetail
 
-from cctext import ReferenceType
+from cctext import ReferenceType, split_grams
 
 from apps.users.models import User
 from apps.rsform.models import Syntax, RSForm, Constituenta, CstType, LibraryItem, LibraryItemType, Subscription
 from apps.rsform.views import (
     convert_to_ascii,
     convert_to_math,
-    parse_expression
+    parse_expression,
+    inflect,
+    parse_text,
+    generate_lexeme
 )
 
 
@@ -572,7 +575,7 @@ class TestRSFormViewset(APITestCase):
         self.assertEqual(response.data['items'][1]['term_resolved'], d1.term_resolved)
 
 
-class TestFunctionalViews(APITestCase):
+class TestRSLanguageViews(APITestCase):
     def setUp(self):
         self.factory = APIRequestFactory()
         self.user = User.objects.create(username='UserTest')
@@ -601,35 +604,35 @@ class TestFunctionalViews(APITestCase):
 
     def test_convert_to_ascii(self):
         data = {'expression': '1=1'}
-        request = self.factory.post('/api/func/to-ascii', data)
+        request = self.factory.post('/api/rslang/to-ascii', data)
         response = convert_to_ascii(request)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.data['result'], r'1 \eq 1')
 
     def test_convert_to_ascii_missing_data(self):
         data = {'data': '1=1'}
-        request = self.factory.post('/api/func/to-ascii', data)
+        request = self.factory.post('/api/rslang/to-ascii', data)
         response = convert_to_ascii(request)
         self.assertEqual(response.status_code, 400)
         self.assertIsInstance(response.data['expression'][0], ErrorDetail)
 
     def test_convert_to_math(self):
         data = {'expression': r'1 \eq 1'}
-        request = self.factory.post('/api/func/to-math', data)
+        request = self.factory.post('/api/rslang/to-math', data)
         response = convert_to_math(request)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.data['result'], r'1=1')
 
     def test_convert_to_math_missing_data(self):
         data = {'data': r'1 \eq 1'}
-        request = self.factory.post('/api/func/to-math', data)
+        request = self.factory.post('/api/rslang/to-math', data)
         response = convert_to_math(request)
         self.assertEqual(response.status_code, 400)
         self.assertIsInstance(response.data['expression'][0], ErrorDetail)
 
     def test_parse_expression(self):
         data = {'expression': r'1=1'}
-        request = self.factory.post('/api/func/parse-expression', data)
+        request = self.factory.post('/api/rslang/parse-expression', data)
         response = parse_expression(request)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.data['parseResult'], True)
@@ -638,7 +641,38 @@ class TestFunctionalViews(APITestCase):
 
     def test_parse_expression_missing_data(self):
         data = {'data': r'1=1'}
-        request = self.factory.post('/api/func/parse-expression', data)
+        request = self.factory.post('/api/rslang/parse-expression', data)
         response = parse_expression(request)
         self.assertEqual(response.status_code, 400)
         self.assertIsInstance(response.data['expression'][0], ErrorDetail)
+
+
+class TestNaturalLanguageViews(APITestCase):
+    def setUp(self):
+        self.factory = APIRequestFactory()
+        self.client = APIClient()
+
+    def _assert_tags(self, actual: str, expected: str):
+        self.assertEqual(set(split_grams(actual)), set(split_grams(expected)))
+
+    def test_parse_text(self):
+        data = {'text': 'синим слонам'}
+        request = self.factory.post('/api/cctext/parse', data)
+        response = parse_text(request)
+        self.assertEqual(response.status_code, 200)
+        self._assert_tags(response.data['result'], 'datv,NOUN,plur,anim,masc')
+
+    def test_inflect(self):
+        data = {'text': 'синий слон', 'grams': 'plur,datv'}
+        request = self.factory.post('/api/cctext/inflect', data)
+        response = inflect(request)
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data['result'], 'синим слонам')
+
+    def test_generate_lexeme(self):
+        data = {'text': 'синий слон'}
+        request = self.factory.post('/api/cctext/generate-lexeme', data)
+        response = generate_lexeme(request)
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(len(response.data['items']), 12)
+        self.assertEqual(response.data['items'][0]['text'], 'синий слон')
diff --git a/rsconcept/backend/apps/rsform/urls.py b/rsconcept/backend/apps/rsform/urls.py
index 9fb6b76b..bc4aac7a 100644
--- a/rsconcept/backend/apps/rsform/urls.py
+++ b/rsconcept/backend/apps/rsform/urls.py
@@ -12,8 +12,14 @@ urlpatterns = [
     path('constituents/', views.ConstituentAPIView.as_view(), name='constituenta-detail'),
     path('rsforms/import-trs', views.TrsImportView.as_view()),
     path('rsforms/create-detailed', views.create_rsform),
-    path('func/parse-expression', views.parse_expression),
-    path('func/to-ascii', views.convert_to_ascii),
-    path('func/to-math', views.convert_to_math),
+
+    path('rslang/parse-expression', views.parse_expression),
+    path('rslang/to-ascii', views.convert_to_ascii),
+    path('rslang/to-math', views.convert_to_math),
+
+    path('cctext/inflect', views.inflect),
+    path('cctext/generate-lexeme', views.generate_lexeme),
+    path('cctext/parse', views.parse_text),
+
     path('', include(library_router.urls)),
 ]
diff --git a/rsconcept/backend/apps/rsform/views.py b/rsconcept/backend/apps/rsform/views.py
index 91ed6036..32c650f7 100644
--- a/rsconcept/backend/apps/rsform/views.py
+++ b/rsconcept/backend/apps/rsform/views.py
@@ -13,6 +13,7 @@ from drf_spectacular.utils import extend_schema, extend_schema_view
 from rest_framework import status as c
 
 import pyconcept
+import cctext
 from . import models as m
 from . import serializers as s
 from . import utils
@@ -527,7 +528,10 @@ def convert_to_ascii(request):
     serializer.is_valid(raise_exception=True)
     expression = serializer.validated_data['expression']
     result = pyconcept.convert_to_ascii(expression)
-    return Response({'result': result})
+    return Response(
+        status=c.HTTP_200_OK,
+        data={'result': result}
+    )
 
 
 @extend_schema(
@@ -544,4 +548,67 @@ def convert_to_math(request):
     serializer.is_valid(raise_exception=True)
     expression = serializer.validated_data['expression']
     result = pyconcept.convert_to_math(expression)
-    return Response({'result': result})
+    return Response(
+        status=c.HTTP_200_OK,
+        data={'result': result}
+    )
+
+@extend_schema(
+    summary='generate wordform',
+    tags=['NaturalLanguage'],
+    request=s.WordFormSerializer,
+    responses={200: s.ResultTextResponse},
+    auth=None
+)
+@api_view(['POST'])
+def inflect(request):
+    ''' Endpoint: Generate wordform with set grammemes. '''
+    serializer = s.WordFormSerializer(data=request.data)
+    serializer.is_valid(raise_exception=True)
+    text = serializer.validated_data['text']
+    grams = serializer.validated_data['grams']
+    result = cctext.inflect(text, grams)
+    return Response(
+        status=c.HTTP_200_OK,
+        data={'result': result}
+    )
+
+
+@extend_schema(
+    summary='basic set of wordforms',
+    tags=['NaturalLanguage'],
+    request=s.TextSerializer,
+    responses={200: s.MultiFormSerializer},
+    auth=None
+)
+@api_view(['POST'])
+def generate_lexeme(request):
+    ''' Endpoint: Generate basic set of wordforms. '''
+    serializer = s.TextSerializer(data=request.data)
+    serializer.is_valid(raise_exception=True)
+    nominal = serializer.validated_data['text']
+    result = cctext.generate_lexeme(nominal)
+    return Response(
+        status=c.HTTP_200_OK,
+        data=s.MultiFormSerializer.from_list(result)
+    )
+
+
+@extend_schema(
+    summary='get all language parse variants',
+    tags=['NaturalLanguage'],
+    request=s.TextSerializer,
+    responses={200: s.ResultTextResponse},
+    auth=None
+)
+@api_view(['POST'])
+def parse_text(request):
+    ''' Endpoint: Get likely vocabulary parse. '''
+    serializer = s.TextSerializer(data=request.data)
+    serializer.is_valid(raise_exception=True)
+    text = serializer.validated_data['text']
+    result = cctext.parse(text)
+    return Response(
+        status=c.HTTP_200_OK,
+        data={'result': result}
+    )
diff --git a/rsconcept/backend/cctext/__init__.py b/rsconcept/backend/cctext/__init__.py
index bf5bcf0f..35bcebef 100644
--- a/rsconcept/backend/cctext/__init__.py
+++ b/rsconcept/backend/cctext/__init__.py
@@ -9,7 +9,7 @@ from .resolver import Reference, Position, Resolver, ResolvedReference, resolve_
 
 from .conceptapi import (
     parse, normalize,
-    get_all_forms, inflect, inflect_context, inflect_substitute, inflect_dependant,
+    generate_lexeme, inflect, inflect_context, inflect_substitute, inflect_dependant,
     match_all_morpho, find_substr
 )
 
diff --git a/rsconcept/backend/cctext/conceptapi.py b/rsconcept/backend/cctext/conceptapi.py
index 26382739..9cd76538 100644
--- a/rsconcept/backend/cctext/conceptapi.py
+++ b/rsconcept/backend/cctext/conceptapi.py
@@ -21,14 +21,19 @@ def parse(text: str, require_grams: str = '') -> str:
     return result if result != 'UNKN' else ''
 
 
-def get_all_forms(text_normal: str) -> list[tuple[str, str]]:
-    ''' Get all infeclted forms. '''
+# def parse_variants(text: str, require_grams: str = '') -> list[tuple[str, str]]:
+#     ''' Get all variants of a parse.
+#     ::returns:: string of comma separated grammar tags or empty string '''
+
+
+def generate_lexeme(text_normal: str) -> list[tuple[str, str]]:
+    ''' Get all inflected forms belonging to same Lexeme. '''
     model = parser.parse(text_normal)
     if not model:
         return []
     result = []
     for form in model.get_form().lexeme:
-        result.append((form.word, Morphology(form.tag).to_text()))
+        result.append((model.inflect(form.tag.grammemes), Morphology(form.tag).to_text()))
     return result
 
 
diff --git a/rsconcept/backend/cctext/tests/t_conceptapi.py b/rsconcept/backend/cctext/tests/t_conceptapi.py
index 28eaf1e1..becbe093 100644
--- a/rsconcept/backend/cctext/tests/t_conceptapi.py
+++ b/rsconcept/backend/cctext/tests/t_conceptapi.py
@@ -21,15 +21,20 @@ class TestConceptAPI(unittest.TestCase):
         self.assertEqual(cc.normalize('первого'), 'первый')
         self.assertEqual(cc.normalize('диких людей'), 'дикий человек')
 
-    def test_get_all_forms(self):
+    def test_generate_lexeme(self):
         ''' Test all lexical forms. '''
-        self.assertEqual(cc.get_all_forms(''), [])
+        self.assertEqual(cc.generate_lexeme(''), [])
 
-        forms = cc.get_all_forms('наверное')
+        forms = cc.generate_lexeme('наверное')
         self.assertEqual(len(forms), 1)
         self.assertEqual(forms[0][0], 'наверное')
         self._assert_tags(forms[0][1], 'CONJ,Prnt')
 
+        forms = cc.generate_lexeme('молодой человек')
+        self.assertEqual(len(forms), 19)
+        self.assertEqual(forms[0][0], 'молодой человек')
+        self._assert_tags(forms[0][1], 'nomn,masc,sing,anim,NOUN')
+
     def test_inflect(self):
         ''' Test inflection. '''
         self.assertEqual(cc.inflect('', ''), '')
diff --git a/rsconcept/backend/project/urls.py b/rsconcept/backend/project/urls.py
index 7be34937..55823dc2 100644
--- a/rsconcept/backend/project/urls.py
+++ b/rsconcept/backend/project/urls.py
@@ -1,6 +1,5 @@
 ''' Main URL router '''
 from django.contrib import admin
-from django.shortcuts import redirect
 from django.urls import path, include
 from django.conf import settings
 from django.conf.urls.static import static