Implement text reference resolution for backend

This commit is contained in:
IRBorisov 2023-08-20 13:59:46 +03:00
parent 7cd76f6004
commit 7dbbbab15a
21 changed files with 479 additions and 43 deletions

View File

@ -40,6 +40,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
.mypy_cache/
cover/

1
.gitignore vendored
View File

@ -35,6 +35,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
.mypy_cache/
# Django

17
.vscode/settings.json vendored
View File

@ -1,4 +1,8 @@
{
"search.exclude": {
".mypy_cache/": true,
".pytest_cache/": true
},
"python.testing.unittestArgs": [
"-v",
"-s",
@ -14,5 +18,16 @@
}
],
"python.linting.enabled": true,
"python.linting.mypyEnabled": true
"python.linting.mypyEnabled": true,
"python.analysis.typeCheckingMode": "off",
"python.analysis.diagnosticSeverityOverrides": {
// "reportOptionalMemberAccess": "none"
},
"python.analysis.ignore": ["**/tests/**", "**/node_modules/**", "**/venv/**"],
"python.analysis.packageIndexDepths": [
{
"name": "django",
"depth": 5
}
]
}

View File

@ -70,6 +70,8 @@ This readme file is used mostly to document project dependencies
- gunicorn
- coreapi
- psycopg2-binary
- pymorphy2
- razdel
</pre>
</details>
<details>
@ -87,6 +89,7 @@ This readme file is used mostly to document project dependencies
<pre>
- Pylance
- Pylint
- Django
</pre>
</details>

View File

@ -1,5 +1,4 @@
''' Tests. '''
# flake8: noqa
from .t_imports import *
from .t_views import *
from .t_models import *

View File

@ -8,13 +8,21 @@ from rest_framework.permissions import BasePermission
class ObjectOwnerOrAdmin(BasePermission):
''' Permission for object ownership restriction '''
def has_object_permission(self, request, view, obj):
return request.user == obj.owner or request.user.is_staff
if request.user == obj.owner:
return True
if not hasattr(request.user, 'is_staff'):
return False
return request.user.is_staff # type: ignore
class SchemaOwnerOrAdmin(BasePermission):
''' Permission for object ownership restriction '''
def has_object_permission(self, request, view, obj):
return request.user == obj.schema.owner or request.user.is_staff
if request.user == obj.schema.owner:
return True
if not hasattr(request.user, 'is_staff'):
return False
return request.user.is_staff # type: ignore
def read_trs(file) -> dict:

View File

@ -1,14 +1,16 @@
''' Concept core text processing library. '''
# pylint: skip-file
from .syntax import RuSyntax, Capitalization
from .rumodel import Morphology, SemanticRole, WordTag, morpho
from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_tags, combine_tags
from .ruparser import PhraseParser, WordToken, Collation
from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference
from .context import TermForm, Entity, TermContext
from .resolver import Position, Resolver, ResolvedReference, resolve_entity, resolve_syntactic
from .conceptapi import (
parse, normalize,
get_all_forms, inflect, inflect_context, inflect_substitute, inflect_dependant,
match_all_morpho, find_substr,
split_tags
match_all_morpho, find_substr
)
# TODO: implement Part of speech transition for VERB <-> NOUN

View File

@ -1,27 +1,23 @@
'''
Concept API Python functions.
::guarantee:: doesnt raise exceptions and returns workable outputs in situations where empty string would be returned
::guarantee:: doesnt raise exceptions and returns workable outputs
'''
from cctext.rumodel import Morphology
from .syntax import RuSyntax
from .ruparser import PhraseParser
from .rumodel import split_tags
parser = PhraseParser()
def split_tags(tags: str) -> frozenset[str]:
''' Split grammemes string into set of items. '''
return frozenset([tag.strip() for tag in filter(None, tags.split(','))])
def parse(text: str, require_tags: str = '') -> str:
''' Determine morpho tags for input text.
::returns:: string of comma separated grammar tags or empty string '''
model = parser.parse(text, require_tags=split_tags(require_tags))
if model is None:
return ''
result = model.get_morpho().as_str()
result = model.get_morpho().to_text()
return result if result != 'UNKN' else ''
@ -32,7 +28,7 @@ def get_all_forms(text_normal: str) -> list[tuple[str, str]]:
return []
result = []
for form in model.get_form().lexeme:
result.append((form.word, Morphology(form.tag).as_str()))
result.append((form.word, Morphology(form.tag).to_text()))
return result

View File

@ -0,0 +1,62 @@
''' Term context for reference resolution. '''
from typing import Iterable, Dict, Optional
from dataclasses import dataclass
from .conceptapi import inflect
@dataclass
class TermForm:
    ''' Pairing of a term text with the grammatical form it is written in. '''
    # Text of the term in this particular form.
    text: str
    # Form identifier, e.g. 'sing,datv'; compared by plain string equality on lookup.
    form: str
def _search_form(query: str, data: Iterable[TermForm]) -> Optional[str]:
    ''' Find text of the first item in data whose form equals query exactly.
    ::returns:: matching text or None when no item matches '''
    matching_texts = (item.text for item in data if item.form == query)
    return next(matching_texts, None)
class Entity:
    ''' Text entity: alias, nominal text and its known forms.

    Manually specified forms take priority over inflected ones.
    Inflection results are cached per form string. '''

    def __init__(self, alias: str, nominal: str, manual_forms: Optional[Iterable[TermForm]]=None):
        # Copy the iterable so the entity owns its own list of manual forms.
        self.manual = [] if manual_forms is None else list(manual_forms)
        self.alias = alias
        self._nominal = nominal
        self._cached: list[TermForm] = []

    def get_nominal(self) -> str:
        ''' Access nominal text of the entity. '''
        return self._nominal

    def set_nominal(self, new_text: str):
        ''' Replace nominal text.
        Note: manual and cached forms are dropped unless the text is unchanged. '''
        if new_text != self._nominal:
            self._nominal = new_text
            self.manual = []
            self._cached = []

    def get_form(self, form: str) -> str:
        ''' Produce entity text in the requested form.

        Empty form yields the nominal text. Lookup order: manual forms, cached forms,
        then inflection of the nominal text. When inflection raises ValueError
        the error text wrapped in '!' is returned (and cached) instead. '''
        if form == '':
            return self._nominal
        manual_text = _search_form(form, self.manual)
        if manual_text is not None:
            return manual_text
        cached_text = _search_form(form, self._cached)
        if cached_text is not None:
            return cached_text
        try:
            inflected = inflect(self._nominal, form)
        except ValueError as error:
            inflected = f'!{error}!'.replace('Unknown grammeme', 'Неизвестная граммема')
        # Remember the outcome (including error text) to skip repeated inflection.
        self._cached.append(TermForm(text=inflected, form=form))
        return inflected
# Term context for resolving entity references.
TermContext = Dict[str, Entity]

View File

@ -0,0 +1,60 @@
''' Text reference API. '''
from enum import Enum, unique
from typing import Optional, Union
@unique
class ReferenceType(Enum):
    ''' Kinds of text references. '''
    # Reference to an entity by alias.
    entity = 'entity'
    # Reference to a phrase that depends on a neighbouring entity reference.
    syntactic = 'syntax'
class EntityReference:
    ''' Reference to an entity (by alias) taken in a specific form. '''

    def __init__(self, identifier: str, form: str):
        self.entity = identifier
        self.form = form

    def get_type(self) -> ReferenceType:
        ''' Reference kind: always ReferenceType.entity. '''
        return ReferenceType.entity

    def to_text(self) -> str:
        ''' Serialize as @{alias|form}. '''
        content = f'{self.entity}|{self.form}'
        return '@{' + content + '}'
class SyntacticReference:
    ''' Reference to a text syntactically dependent on an EntityReference.

    The main reference is identified by a relative offset counted in entity references. '''

    def __init__(self, referal_offset: int, text: str):
        self.offset = referal_offset
        self.nominal = text

    def get_type(self) -> ReferenceType:
        ''' Reference kind: always ReferenceType.syntactic. '''
        return ReferenceType.syntactic

    def to_text(self) -> str:
        ''' Serialize as @{offset|nominal}. '''
        content = f'{self.offset}|{self.nominal}'
        return '@{' + content + '}'
Reference = Union[EntityReference, SyntacticReference]
def parse_reference(text: str) -> Optional['Reference']:
    ''' Parse a single reference written as @{head|tail}.

    Both parts are stripped of surrounding whitespace.
    A head starting with '-' or a non-zero digit is treated as an integer offset
    and produces SyntacticReference (tail must not be empty).
    Any other head is an entity alias and produces EntityReference,
    with all spaces removed from the form.

    ::returns:: parsed reference or None if text is not a valid reference
    (wrong delimiters, wrong number of parts, empty head, head starting with '0',
    non-integer offset or offset equal to zero) '''
    if len(text) < 4 or not text.startswith('@{') or not text.endswith('}'):
        return None
    blocks = [block.strip() for block in text[2:-1].split('|')]
    if len(blocks) != 2 or blocks[0] == '' or blocks[0][0] == '0':
        return None
    head, tail = blocks
    if head[0] not in '-123456789':
        return EntityReference(head, tail.replace(' ', ''))
    if tail == '':
        return None
    try:
        offset = int(head)
    except ValueError:
        return None
    # '-0' passes the leading-character checks above but is still a zero offset.
    if offset == 0:
        return None
    return SyntacticReference(offset, tail)

View File

@ -0,0 +1,114 @@
''' Reference resolution API. '''
import re
from dataclasses import dataclass, field
from typing import cast, Optional

from .conceptapi import inflect_dependant
from .context import TermContext
from .reference import EntityReference, SyntacticReference, parse_reference, Reference
def resolve_entity(ref: EntityReference, context: TermContext) -> str:
    ''' Produce text for an entity reference using entities from context.
    ::returns:: entity text in the referenced form, or an error text wrapped in '!'
    when the alias is missing from context '''
    if ref.entity in context:
        return context[ref.entity].get_form(ref.form)
    alias = ref.entity
    return f'!Неизвестная сущность: {alias}!'
def resolve_syntactic(ref: SyntacticReference, index: int, allrefs: list['ResolvedReference']) -> str:
    ''' Resolve syntactic reference against its main entity reference.

    The main reference is the N-th entity reference counted from position index:
    forward for positive offset, backward for negative offset (N = abs(offset)).
    Non-entity references are skipped while counting.

    ::param index:: position of ref itself in allrefs
    ::param allrefs:: all references of the text; entity references must be resolved already
    ::returns:: nominal text inflected to agree with the main reference,
    or an error text wrapped in '!' if no such entity reference exists '''
    mainref: Optional['ResolvedReference'] = None
    step = 1 if ref.offset > 0 else -1
    remaining = abs(ref.offset)
    position = index + step
    # Zero offset never matches: remaining is already 0 and only goes down.
    while 0 <= position < len(allrefs):
        candidate = allrefs[position]
        if isinstance(candidate.ref, EntityReference):
            remaining -= 1
            if remaining == 0:
                mainref = candidate
                # Stop at the first match so later entity references cannot replace it.
                break
        position += step
    if mainref is None:
        return f'!Некорректное смещение: {ref.offset}!'
    return inflect_dependant(ref.nominal, mainref.resolved)
@dataclass
class Position:
    ''' 0-indexed contiguous segment position in text. '''
    # Index of the first character of the segment.
    start: int = 0
    # Index just past the last character (filled from regex match end / output length).
    finish: int = 0


@dataclass
class ResolvedReference:
    ''' Parsed reference together with its resolution result and text positions. '''
    # Parsed reference (EntityReference or SyntacticReference).
    ref: 'Reference'
    # Text that replaces the reference in the output.
    resolved: str = ''
    # Each instance gets its own Position: a shared Position() default is mutable
    # state common to all objects and is rejected by dataclasses on Python 3.11+.
    pos_input: Position = field(default_factory=Position)
    pos_output: Position = field(default_factory=Position)
class Resolver:
    ''' Text reference resolver.

    Keeps the last processed input, the produced output and the list of
    references found, including their positions in both texts. '''
    REFERENCE_PATTERN = re.compile(r'@{.*?}')

    def __init__(self, context: TermContext):
        self.context = context
        self.refs = cast(list[ResolvedReference], [])
        self.input = ''
        self.output = ''

    def resolve(self, text: str) -> str:
        ''' Replace every parsable reference in text with its resolved form.
        Note: details on references are available through attributes after the call.
        ::returns:: text with references substituted '''
        self._reset(text)
        self._parse_refs()
        if self.refs:
            self._resolve_refs()
            self._combine_output()
        else:
            # Nothing to substitute: output is the input as is.
            self.output = self.input
        return self.output

    def _reset(self, input_text: str):
        ''' Drop results of the previous run and store new input. '''
        self.refs = cast(list[ResolvedReference], [])
        self.input = input_text
        self.output = ''

    def _parse_refs(self):
        ''' Collect references matching the pattern; unparsable matches are ignored. '''
        for match in Resolver.REFERENCE_PATTERN.finditer(self.input):
            reference = parse_reference(match.group(0))
            if reference is None:
                continue
            span = Position(match.start(0), match.end(0))
            self.refs.append(ResolvedReference(ref=reference, resolved='', pos_input=span))

    def _resolve_refs(self):
        ''' Resolve entity references first, then syntactic ones that depend on them. '''
        for item in self.refs:
            if isinstance(item.ref, EntityReference):
                item.resolved = resolve_entity(item.ref, self.context)
        for position, item in enumerate(self.refs):
            if not isinstance(item.ref, SyntacticReference):
                continue
            item.resolved = resolve_syntactic(item.ref, position, self.refs)

    def _combine_output(self):
        ''' Append plain text interleaved with resolved references to output
        and record output position of every reference. '''
        pieces = []
        written = len(self.output)
        cursor = 0
        for item in self.refs:
            plain = self.input[cursor : item.pos_input.start]
            pieces.append(plain)
            written += len(plain)
            pieces.append(item.resolved)
            item.pos_output = Position(written, written + len(item.resolved))
            written += len(item.resolved)
            cursor = item.pos_input.finish
        pieces.append(self.input[cursor : len(self.input)])
        self.output += ''.join(pieces)

View File

@ -1,7 +1,7 @@
''' Russian language models. '''
from __future__ import annotations
from enum import Enum, unique
from typing import Iterable
from typing import Iterable, Optional
from pymorphy2 import MorphAnalyzer
from pymorphy2.tagset import OpencorporaTag as WordTag
@ -10,6 +10,16 @@ from pymorphy2.tagset import OpencorporaTag as WordTag
morpho = MorphAnalyzer()
def split_tags(text: str) -> list[str]:
    ''' Split comma-separated grammemes string into list of items.

    Items are stripped of surrounding whitespace; items that are empty
    after stripping are dropped, so no empty tag is ever returned. '''
    stripped = (tag.strip() for tag in text.split(','))
    # Filter after stripping: whitespace-only items must not survive as ''.
    return [tag for tag in stripped if tag]
def combine_tags(tags: Iterable[str]) -> str:
    ''' Join grammemes into a single comma-separated string (no spaces added). '''
    separator = ','
    return separator.join(tags)
@unique
class SemanticRole(Enum):
''' Enumerating semantic types for different parse patterns. '''
@ -19,8 +29,8 @@ class SemanticRole(Enum):
definition = 3
@staticmethod
def from_pos(pos: str) -> SemanticRole:
''' Fabric method to produce types from part of speech. '''
def from_POS(pos: Optional[str]) -> SemanticRole:
''' Production method: types from part of speech. '''
if pos in ['NOUN', 'NPRO']:
return SemanticRole.term
elif pos in ['VERB', 'INFN', 'PRTF', 'PRTS']:
@ -36,10 +46,7 @@ class Morphology:
'''
def __init__(self, tag: WordTag, semantic=SemanticRole.unknwn):
self.tag = tag
self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_pos(tag.POS)
def __del__(self):
pass
self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_POS(tag.POS)
_TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
@ -60,9 +67,9 @@ class Morphology:
return pos in ['ADJF', 'ADJS', 'PRTF', 'PRTS']
@property
def effective_POS(self) -> str:
def effective_POS(self) -> Optional[str]:
''' Access part of speech. Pronouns are considered as nouns '''
pos: str = self.tag.POS
pos: Optional[str] = self.tag.POS
if pos and self.tag.POS == 'NPRO':
return 'NOUN'
return pos
@ -105,14 +112,6 @@ class Morphology:
result.add(self.tag.gender)
return result
def as_str(self) -> str:
def to_text(self) -> str:
''' Produce string of all grammemes. '''
grammemes = self.tag.grammemes
count = len(grammemes)
if count == 0:
return ''
elif count == 1:
result: str = next(iter(grammemes))
return result
else:
return ','.join(grammemes)
return combine_tags(self.tag.grammemes)

View File

@ -381,7 +381,7 @@ class PhraseParser:
case = form.tag.case
if pos not in ['ADJF', 'ADJS', 'PRTF', 'PRTS']:
continue
if SemanticRole.from_pos(pos) == SemanticRole.term and case == 'gent':
if SemanticRole.from_POS(pos) == SemanticRole.term and case == 'gent':
if before_main:
continue
else:

View File

@ -1 +1,8 @@
''' Tests. '''
from .t_reference import *
from .t_ruparser import *
from .t_syntax import *
from .t_conceptapi import *
from .t_rumodel import *
from .t_context import *
from .t_resolver import *

View File

@ -1,4 +1,4 @@
'''Test Concept Text API'''
''' Unit tests: conceptapi. '''
import unittest
import cctext as cc
@ -7,12 +7,12 @@ import cctext as cc
class TestConceptAPI(unittest.TestCase):
'''Test class for Concept API.'''
def _assert_tags(self, actual: str, expected: str):
self.assertEqual(cc.split_tags(actual), cc.split_tags(expected))
self.assertEqual(set(cc.split_tags(actual)), set(cc.split_tags(expected)))
def test_parse(self):
''' Test parsing. '''
self._assert_tags(cc.parse(''), '')
self._assert_tags(cc.parse('1'), 'intg,NUMB')
self._assert_tags(cc.parse('1'), 'NUMB,intg')
self._assert_tags(cc.parse('слон', require_tags='masc'), 'NOUN,anim,masc,sing,nomn')
def test_normalize_word(self):

View File

@ -0,0 +1,32 @@
''' Unit tests: context. '''
import unittest
from cctext.context import TermForm, Entity, TermContext
class TestEntity(unittest.TestCase):
    '''Test Entity termform access.'''
    def setUp(self):
        ''' Create entity X1 with nominal text and one manually specified form. '''
        self.alias = 'X1'
        self.nominal = 'человек'
        self.text1 = 'test1'
        self.form1 = 'sing,datv'
        self.entity = Entity(self.alias, self.nominal, [TermForm(self.text1, self.form1)])

    def test_attributes(self):
        ''' Constructor arguments are available through attributes and getter. '''
        self.assertEqual(self.entity.alias, self.alias)
        self.assertEqual(self.entity.get_nominal(), self.nominal)
        self.assertEqual(self.entity.manual, [TermForm(self.text1, self.form1)])

    def test_get_form(self):
        ''' Empty form gives nominal, manual form gives manual text, other forms are inflected. '''
        self.assertEqual(self.entity.get_form(''), self.nominal)
        self.assertEqual(self.entity.get_form(self.form1), self.text1)
        self.assertEqual(self.entity.get_form('invalid tags'), '!Неизвестная граммема: invalid tags!')
        self.assertEqual(self.entity.get_form('plur'), 'люди')

    def test_set_nominal(self):
        ''' Changing nominal text drops manual forms and previously computed forms. '''
        new_nomial = 'TEST'
        self.assertEqual(self.entity.get_form('plur'), 'люди')
        self.entity.set_nominal(new_nomial)
        self.assertEqual(self.entity.get_nominal(), new_nomial)
        self.assertEqual(self.entity.get_form('plur'), new_nomial)
        self.assertEqual(self.entity.manual, [])

View File

@ -0,0 +1,43 @@
''' Unit tests: reference. '''
import unittest
from cctext import EntityReference, ReferenceType, SyntacticReference, parse_reference
class TestReferences(unittest.TestCase):
    ''' Test class for references. '''
    def test_EntityReference(self):
        ''' Testing EntityReference basics. '''
        ref = EntityReference('X1', 'sing,nomn')
        self.assertEqual(ref.get_type(), ReferenceType.entity)
        self.assertEqual(ref.to_text(), '@{X1|sing,nomn}')

    def test_SyntacticReference(self):
        ''' Testing SyntacticReference basics. '''
        ref = SyntacticReference(-1, 'черный')
        self.assertEqual(ref.get_type(), ReferenceType.syntactic)
        self.assertEqual(ref.to_text(), '@{-1|черный}')

    def test_parse_reference_invalid(self):
        ''' Testing parsing reference invalid input. '''
        self.assertIsNone(parse_reference(''))
        self.assertIsNone(parse_reference('X1'))
        self.assertIsNone(parse_reference('invalid'))
        self.assertIsNone(parse_reference(' '))
        self.assertIsNone(parse_reference('@{|}'))
        self.assertIsNone(parse_reference('@{ | }'))
        self.assertIsNone(parse_reference('@{-1| }'))
        self.assertIsNone(parse_reference('@{1| }'))
        self.assertIsNone(parse_reference('@{0|черный}'))

    def test_parse_reference(self):
        ''' Testing parsing reference text. '''
        # Whitespace around the parts is not preserved in the parsed reference.
        ref = parse_reference('@{1| черный }')
        self.assertIsNotNone(ref)
        self.assertEqual(ref.to_text(), '@{1|черный}')
        self.assertEqual(ref.get_type(), ReferenceType.syntactic)

        # Spaces inside the form of an entity reference are removed as well.
        ref = parse_reference('@{X1 | VERB, past, sing}')
        self.assertIsNotNone(ref)
        self.assertEqual(ref.to_text(), '@{X1|VERB,past,sing}')
        self.assertEqual(ref.get_type(), ReferenceType.entity)

View File

@ -0,0 +1,76 @@
''' Unit tests: resolver. '''
import unittest
from typing import cast
from cctext import (
EntityReference, TermContext, Entity, SyntacticReference,
Resolver, ResolvedReference, Position,
resolve_entity, resolve_syntactic
)
class TestResolver(unittest.TestCase):
    '''Test reference Resolver.'''
    def setUp(self):
        ''' Create context with a single entity X1 and a resolver using it. '''
        self.context = cast(TermContext, {})
        self.context['X1'] = Entity('X1', 'человек')
        self.resolver = Resolver(self.context)

    def test_resolve_entity(self):
        ''' Entity references: nominal, inflected, invalid form, unknown alias. '''
        self.assertEqual(resolve_entity(EntityReference('X1', ''), self.context), 'человек')
        self.assertEqual(resolve_entity(EntityReference('X1', 'plur'), self.context), 'люди')
        self.assertEqual(resolve_entity(EntityReference('X1', 'invalid'), self.context), '!Неизвестная граммема: invalid!')
        self.assertEqual(resolve_entity(EntityReference('X123', 'plur'), self.context), '!Неизвестная сущность: X123!')

    def test_resolve_syntactic(self):
        ''' Syntactic references: offsets out of range and offsets in both directions. '''
        ref = ResolvedReference(ref=EntityReference('X1', 'sing,datv'), resolved='человеку')
        allrefs = [ref, ref, ref, ref]
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=-1), 0, allrefs), '!Некорректное смещение: -1!')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=1), 3, allrefs), '!Некорректное смещение: 1!')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=1), 0, allrefs), 'умному')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=2), 0, allrefs), 'умному')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=3), 0, allrefs), 'умному')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=-1), 3, allrefs), 'умному')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=-2), 3, allrefs), 'умному')
        self.assertEqual(resolve_syntactic(SyntacticReference(text='умный', referal_offset=-3), 3, allrefs), 'умному')

    def test_resolve_invalid(self):
        ''' Texts without parsable references are returned unchanged. '''
        self.assertEqual(self.resolver.resolve(''), '')
        self.assertEqual(len(self.resolver.refs), 0)

        self.assertEqual(self.resolver.resolve('simple text'), 'simple text')
        self.assertEqual(len(self.resolver.refs), 0)

        self.assertEqual(self.resolver.resolve('simple @{unparsable ref} text'), 'simple @{unparsable ref} text')
        self.assertEqual(len(self.resolver.refs), 0)

    def test_resolve_single(self):
        ''' Single reference: resolved text and positions in input and output. '''
        self.assertEqual(self.resolver.resolve('просто @{-1|умный} текст'), 'просто !Некорректное смещение: -1! текст')
        self.assertEqual(len(self.resolver.refs), 1)
        self.assertEqual(self.resolver.refs[0].pos_input, Position(7, 18))
        self.assertEqual(self.resolver.refs[0].pos_output, Position(7, 34))

        self.assertEqual(self.resolver.resolve('просто @{X123|sing,nomn} текст'), 'просто !Неизвестная сущность: X123! текст')
        self.assertEqual(len(self.resolver.refs), 1)
        self.assertEqual(self.resolver.refs[0].pos_input, Position(7, 24))
        self.assertEqual(self.resolver.refs[0].pos_output, Position(7, 35))

        self.assertEqual(self.resolver.resolve('@{X1|sing,nomn}'), 'человек')
        self.assertEqual(len(self.resolver.refs), 1)
        self.assertEqual(self.resolver.refs[0].pos_input, Position(0, 15))
        self.assertEqual(self.resolver.refs[0].pos_output, Position(0, 7))

        self.assertEqual(self.resolver.resolve('просто @{X1|sing,nomn} текст'), 'просто человек текст')
        self.assertEqual(len(self.resolver.refs), 1)
        self.assertEqual(self.resolver.refs[0].pos_input, Position(7, 22))
        self.assertEqual(self.resolver.refs[0].pos_output, Position(7, 14))

    def test_resolve_multiple(self):
        ''' Several references in one text, including a syntactic one. '''
        # Named 'text' rather than 'input' to avoid shadowing the builtin.
        text = '@{X1|sing,datv} @{-1|умный} @{X1|plur} завидуют'
        self.assertEqual(self.resolver.resolve(text), 'человеку умному люди завидуют')
        self.assertEqual(len(self.resolver.refs), 3)
        self.assertEqual(self.resolver.refs[0].pos_input, Position(0, 15))
        self.assertEqual(self.resolver.refs[0].pos_output, Position(0, 8))
        self.assertEqual(self.resolver.refs[1].pos_input, Position(16, 27))
        self.assertEqual(self.resolver.refs[1].pos_output, Position(9, 15))
        self.assertEqual(self.resolver.refs[2].pos_input, Position(28, 38))
        self.assertEqual(self.resolver.refs[2].pos_output, Position(16, 20))

View File

@ -0,0 +1,18 @@
''' Unit tests: rumodel. '''
import unittest
from cctext import split_tags, combine_tags
class TestTags(unittest.TestCase):
    '''Test tags manipulation.'''
    def test_split_tags(self):
        ''' Splitting of empty, single-tag and multi-tag strings. '''
        self.assertEqual(split_tags(''), [])
        self.assertEqual(split_tags('NOUN'), ['NOUN'])
        self.assertEqual(split_tags('NOUN,plur,sing'), ['NOUN','plur','sing'])

    def test_combine_tags(self):
        ''' Combining of empty, single-tag and multi-tag lists. '''
        self.assertEqual(combine_tags([]), '')
        self.assertEqual(combine_tags(['NOUN']), 'NOUN')
        self.assertEqual(combine_tags(['NOUN','plur','sing']), 'NOUN,plur,sing')

View File

@ -1,4 +1,4 @@
''' Test russian language parsing. '''
''' Unit tests: ruparser. '''
import unittest
from typing import Iterable, Optional
@ -10,7 +10,7 @@ parser = PhraseParser()
class TestRuParser(unittest.TestCase):
''' Test class for russian parsing. '''
def _assert_parse(self, text: str, expected: list[str],
def _assert_parse(self, text: str, expected: Iterable[str],
require_index: int = -1,
require_tags: Optional[Iterable[str]] = None):
phrase = parser.parse(text, require_index, require_tags)

View File

@ -1,10 +1,10 @@
'''Test module for Russian syntax'''
''' Unit tests: syntax. '''
import unittest
from cctext import RuSyntax, Capitalization
class TestRusParser(unittest.TestCase):
class TestRusSyntax(unittest.TestCase):
''' Test class for russian syntax. '''
def test_capitalization(self):