Add backend support for text parsing

This commit is contained in:
IRBorisov 2023-09-24 19:08:17 +03:00
parent f8c087ad87
commit f7a7a1b173
8 changed files with 162 additions and 22 deletions

View File

@ -27,6 +27,30 @@ class ExpressionSerializer(serializers.Serializer):
expression = serializers.CharField() expression = serializers.CharField()
class WordFormSerializer(serializers.Serializer):
    ''' Serializer: inflect request. '''
    # Text of the wordform (or the text to be inflected when used as a request).
    text = serializers.CharField()
    # Grammemes as a single string; presumably comma separated tags
    # (e.g. 'plur,datv') - confirm against cctext.
    grams = serializers.CharField()
class MultiFormSerializer(serializers.Serializer):
    ''' Serializer: list of wordforms, each a text paired with its grammemes. '''
    items = serializers.ListField(
        child=WordFormSerializer()
    )

    @staticmethod
    def from_list(data: list[tuple[str, str]]) -> dict:
        ''' Build a serializer-shaped dict from (text, grams) pairs.

        Returns {'items': [{'text': ..., 'grams': ...}, ...]} in input order;
        empty input produces an empty items list. '''
        return {
            'items': [
                {'text': item[0], 'grams': item[1]}
                for item in data
            ]
        }
class TextSerializer(serializers.Serializer): class TextSerializer(serializers.Serializer):
''' Serializer: Text with references. ''' ''' Serializer: Text with references. '''
text = serializers.CharField() text = serializers.CharField()

View File

@ -6,14 +6,17 @@ from zipfile import ZipFile
from rest_framework.test import APITestCase, APIRequestFactory, APIClient from rest_framework.test import APITestCase, APIRequestFactory, APIClient
from rest_framework.exceptions import ErrorDetail from rest_framework.exceptions import ErrorDetail
from cctext import ReferenceType from cctext import ReferenceType, split_grams
from apps.users.models import User from apps.users.models import User
from apps.rsform.models import Syntax, RSForm, Constituenta, CstType, LibraryItem, LibraryItemType, Subscription from apps.rsform.models import Syntax, RSForm, Constituenta, CstType, LibraryItem, LibraryItemType, Subscription
from apps.rsform.views import ( from apps.rsform.views import (
convert_to_ascii, convert_to_ascii,
convert_to_math, convert_to_math,
parse_expression parse_expression,
inflect,
parse_text,
generate_lexeme
) )
@ -572,7 +575,7 @@ class TestRSFormViewset(APITestCase):
self.assertEqual(response.data['items'][1]['term_resolved'], d1.term_resolved) self.assertEqual(response.data['items'][1]['term_resolved'], d1.term_resolved)
class TestFunctionalViews(APITestCase): class TestRSLanguageViews(APITestCase):
def setUp(self): def setUp(self):
self.factory = APIRequestFactory() self.factory = APIRequestFactory()
self.user = User.objects.create(username='UserTest') self.user = User.objects.create(username='UserTest')
@ -601,35 +604,35 @@ class TestFunctionalViews(APITestCase):
def test_convert_to_ascii(self): def test_convert_to_ascii(self):
data = {'expression': '1=1'} data = {'expression': '1=1'}
request = self.factory.post('/api/func/to-ascii', data) request = self.factory.post('/api/rslang/to-ascii', data)
response = convert_to_ascii(request) response = convert_to_ascii(request)
self.assertEqual(response.status_code, 200) self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['result'], r'1 \eq 1') self.assertEqual(response.data['result'], r'1 \eq 1')
def test_convert_to_ascii_missing_data(self): def test_convert_to_ascii_missing_data(self):
data = {'data': '1=1'} data = {'data': '1=1'}
request = self.factory.post('/api/func/to-ascii', data) request = self.factory.post('/api/rslang/to-ascii', data)
response = convert_to_ascii(request) response = convert_to_ascii(request)
self.assertEqual(response.status_code, 400) self.assertEqual(response.status_code, 400)
self.assertIsInstance(response.data['expression'][0], ErrorDetail) self.assertIsInstance(response.data['expression'][0], ErrorDetail)
def test_convert_to_math(self): def test_convert_to_math(self):
data = {'expression': r'1 \eq 1'} data = {'expression': r'1 \eq 1'}
request = self.factory.post('/api/func/to-math', data) request = self.factory.post('/api/rslang/to-math', data)
response = convert_to_math(request) response = convert_to_math(request)
self.assertEqual(response.status_code, 200) self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['result'], r'1=1') self.assertEqual(response.data['result'], r'1=1')
def test_convert_to_math_missing_data(self): def test_convert_to_math_missing_data(self):
data = {'data': r'1 \eq 1'} data = {'data': r'1 \eq 1'}
request = self.factory.post('/api/func/to-math', data) request = self.factory.post('/api/rslang/to-math', data)
response = convert_to_math(request) response = convert_to_math(request)
self.assertEqual(response.status_code, 400) self.assertEqual(response.status_code, 400)
self.assertIsInstance(response.data['expression'][0], ErrorDetail) self.assertIsInstance(response.data['expression'][0], ErrorDetail)
def test_parse_expression(self): def test_parse_expression(self):
data = {'expression': r'1=1'} data = {'expression': r'1=1'}
request = self.factory.post('/api/func/parse-expression', data) request = self.factory.post('/api/rslang/parse-expression', data)
response = parse_expression(request) response = parse_expression(request)
self.assertEqual(response.status_code, 200) self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['parseResult'], True) self.assertEqual(response.data['parseResult'], True)
@ -638,7 +641,38 @@ class TestFunctionalViews(APITestCase):
def test_parse_expression_missing_data(self): def test_parse_expression_missing_data(self):
data = {'data': r'1=1'} data = {'data': r'1=1'}
request = self.factory.post('/api/func/parse-expression', data) request = self.factory.post('/api/rslang/parse-expression', data)
response = parse_expression(request) response = parse_expression(request)
self.assertEqual(response.status_code, 400) self.assertEqual(response.status_code, 400)
self.assertIsInstance(response.data['expression'][0], ErrorDetail) self.assertIsInstance(response.data['expression'][0], ErrorDetail)
class TestNaturalLanguageViews(APITestCase):
    ''' Testing natural language endpoints: parse, inflect, generate-lexeme. '''

    def setUp(self):
        self.factory = APIRequestFactory()
        self.client = APIClient()

    def _assert_tags(self, actual: str, expected: str):
        ''' Compare two grammemes strings ignoring the order of tags. '''
        actual_tags = set(split_grams(actual))
        expected_tags = set(split_grams(expected))
        self.assertEqual(actual_tags, expected_tags)

    def test_parse_text(self):
        ''' Parse endpoint returns grammemes of the input text. '''
        payload = {'text': 'синим слонам'}
        response = parse_text(self.factory.post('/api/cctext/parse', payload))
        self.assertEqual(response.status_code, 200)
        self._assert_tags(response.data['result'], 'datv,NOUN,plur,anim,masc')

    def test_inflect(self):
        ''' Inflect endpoint returns text in the requested form. '''
        payload = {'text': 'синий слон', 'grams': 'plur,datv'}
        response = inflect(self.factory.post('/api/cctext/inflect', payload))
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['result'], 'синим слонам')

    def test_generate_lexeme(self):
        ''' Generate-lexeme endpoint returns the list of wordforms. '''
        payload = {'text': 'синий слон'}
        response = generate_lexeme(self.factory.post('/api/cctext/generate-lexeme', payload))
        self.assertEqual(response.status_code, 200)
        items = response.data['items']
        self.assertEqual(len(items), 12)
        self.assertEqual(items[0]['text'], 'синий слон')

View File

@ -12,8 +12,14 @@ urlpatterns = [
path('constituents/<int:pk>', views.ConstituentAPIView.as_view(), name='constituenta-detail'), path('constituents/<int:pk>', views.ConstituentAPIView.as_view(), name='constituenta-detail'),
path('rsforms/import-trs', views.TrsImportView.as_view()), path('rsforms/import-trs', views.TrsImportView.as_view()),
path('rsforms/create-detailed', views.create_rsform), path('rsforms/create-detailed', views.create_rsform),
path('func/parse-expression', views.parse_expression),
path('func/to-ascii', views.convert_to_ascii), path('rslang/parse-expression', views.parse_expression),
path('func/to-math', views.convert_to_math), path('rslang/to-ascii', views.convert_to_ascii),
path('rslang/to-math', views.convert_to_math),
path('cctext/inflect', views.inflect),
path('cctext/generate-lexeme', views.generate_lexeme),
path('cctext/parse', views.parse_text),
path('', include(library_router.urls)), path('', include(library_router.urls)),
] ]

View File

@ -13,6 +13,7 @@ from drf_spectacular.utils import extend_schema, extend_schema_view
from rest_framework import status as c from rest_framework import status as c
import pyconcept import pyconcept
import cctext
from . import models as m from . import models as m
from . import serializers as s from . import serializers as s
from . import utils from . import utils
@ -527,7 +528,10 @@ def convert_to_ascii(request):
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
expression = serializer.validated_data['expression'] expression = serializer.validated_data['expression']
result = pyconcept.convert_to_ascii(expression) result = pyconcept.convert_to_ascii(expression)
return Response({'result': result}) return Response(
status=c.HTTP_200_OK,
data={'result': result}
)
@extend_schema( @extend_schema(
@ -544,4 +548,67 @@ def convert_to_math(request):
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
expression = serializer.validated_data['expression'] expression = serializer.validated_data['expression']
result = pyconcept.convert_to_math(expression) result = pyconcept.convert_to_math(expression)
return Response({'result': result}) return Response(
status=c.HTTP_200_OK,
data={'result': result}
)
@extend_schema(
    summary='generate wordform',
    tags=['NaturalLanguage'],
    request=s.WordFormSerializer,
    responses={200: s.ResultTextResponse},
    auth=None
)
@api_view(['POST'])
def inflect(request):
    ''' Endpoint: Generate wordform with set grammemes. '''
    # Reject malformed payloads early: DRF turns the raised error into HTTP 400.
    serializer = s.WordFormSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    payload = serializer.validated_data
    wordform = cctext.inflect(payload['text'], payload['grams'])
    return Response(data={'result': wordform}, status=c.HTTP_200_OK)
@extend_schema(
    summary='basic set of wordforms',
    tags=['NaturalLanguage'],
    request=s.TextSerializer,
    responses={200: s.MultiFormSerializer},
    auth=None
)
@api_view(['POST'])
def generate_lexeme(request):
    ''' Endpoint: Generate basic set of wordforms. '''
    # Reject malformed payloads early: DRF turns the raised error into HTTP 400.
    serializer = s.TextSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    forms = cctext.generate_lexeme(serializer.validated_data['text'])
    # Convert (text, grams) pairs into the {'items': [...]} response shape.
    body = s.MultiFormSerializer.from_list(forms)
    return Response(data=body, status=c.HTTP_200_OK)
@extend_schema(
    summary='get all language parse variants',
    tags=['NaturalLanguage'],
    request=s.TextSerializer,
    responses={200: s.ResultTextResponse},
    auth=None
)
@api_view(['POST'])
def parse_text(request):
    ''' Endpoint: Get likely vocabulary parse. '''
    # Reject malformed payloads early: DRF turns the raised error into HTTP 400.
    serializer = s.TextSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    parsed = cctext.parse(serializer.validated_data['text'])
    return Response(data={'result': parsed}, status=c.HTTP_200_OK)

View File

@ -9,7 +9,7 @@ from .resolver import Reference, Position, Resolver, ResolvedReference, resolve_
from .conceptapi import ( from .conceptapi import (
parse, normalize, parse, normalize,
get_all_forms, inflect, inflect_context, inflect_substitute, inflect_dependant, generate_lexeme, inflect, inflect_context, inflect_substitute, inflect_dependant,
match_all_morpho, find_substr match_all_morpho, find_substr
) )

View File

@ -21,14 +21,19 @@ def parse(text: str, require_grams: str = '') -> str:
return result if result != 'UNKN' else '' return result if result != 'UNKN' else ''
def get_all_forms(text_normal: str) -> list[tuple[str, str]]: # def parse_variants(text: str, require_grams: str = '') -> list[tuple[str, str]]:
''' Get all infeclted forms. ''' # ''' Get all variants of a parse.
# ::returns:: string of comma separated grammar tags or empty string '''
def generate_lexeme(text_normal: str) -> list[tuple[str, str]]:
''' Get all inflected forms belonging to same Lexeme. '''
model = parser.parse(text_normal) model = parser.parse(text_normal)
if not model: if not model:
return [] return []
result = [] result = []
for form in model.get_form().lexeme: for form in model.get_form().lexeme:
result.append((form.word, Morphology(form.tag).to_text())) result.append((model.inflect(form.tag.grammemes), Morphology(form.tag).to_text()))
return result return result

View File

@ -21,15 +21,20 @@ class TestConceptAPI(unittest.TestCase):
self.assertEqual(cc.normalize('первого'), 'первый') self.assertEqual(cc.normalize('первого'), 'первый')
self.assertEqual(cc.normalize('диких людей'), 'дикий человек') self.assertEqual(cc.normalize('диких людей'), 'дикий человек')
def test_get_all_forms(self): def test_generate_lexeme(self):
''' Test all lexical forms. ''' ''' Test all lexical forms. '''
self.assertEqual(cc.get_all_forms(''), []) self.assertEqual(cc.generate_lexeme(''), [])
forms = cc.get_all_forms('наверное') forms = cc.generate_lexeme('наверное')
self.assertEqual(len(forms), 1) self.assertEqual(len(forms), 1)
self.assertEqual(forms[0][0], 'наверное') self.assertEqual(forms[0][0], 'наверное')
self._assert_tags(forms[0][1], 'CONJ,Prnt') self._assert_tags(forms[0][1], 'CONJ,Prnt')
forms = cc.generate_lexeme('молодой человек')
self.assertEqual(len(forms), 19)
self.assertEqual(forms[0][0], 'молодой человек')
self._assert_tags(forms[0][1], 'nomn,masc,sing,anim,NOUN')
def test_inflect(self): def test_inflect(self):
''' Test inflection. ''' ''' Test inflection. '''
self.assertEqual(cc.inflect('', ''), '') self.assertEqual(cc.inflect('', ''), '')

View File

@ -1,6 +1,5 @@
''' Main URL router ''' ''' Main URL router '''
from django.contrib import admin from django.contrib import admin
from django.shortcuts import redirect
from django.urls import path, include from django.urls import path, include
from django.conf import settings from django.conf import settings
from django.conf.urls.static import static from django.conf.urls.static import static