From b3fd0d9ff72af8e35fcb14774c972b936213e890 Mon Sep 17 00:00:00 2001 From: IRBorisov <8611739+IRBorisov@users.noreply.github.com> Date: Sun, 20 Aug 2023 14:12:09 +0300 Subject: [PATCH] Refactoring: distinguish grammems and WordTag --- rsconcept/backend/cctext/__init__.py | 2 +- rsconcept/backend/cctext/conceptapi.py | 16 ++-- rsconcept/backend/cctext/rumodel.py | 13 +-- rsconcept/backend/cctext/ruparser.py | 88 +++++++++---------- .../backend/cctext/tests/t_conceptapi.py | 4 +- rsconcept/backend/cctext/tests/t_rumodel.py | 14 +-- rsconcept/backend/cctext/tests/t_ruparser.py | 14 +-- 7 files changed, 75 insertions(+), 76 deletions(-) diff --git a/rsconcept/backend/cctext/__init__.py b/rsconcept/backend/cctext/__init__.py index 545d4b2e..53b0a899 100644 --- a/rsconcept/backend/cctext/__init__.py +++ b/rsconcept/backend/cctext/__init__.py @@ -1,7 +1,7 @@ ''' Concept core text processing library. ''' # pylint: skip-file from .syntax import RuSyntax, Capitalization -from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_tags, combine_tags +from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_grams, combine_grams from .ruparser import PhraseParser, WordToken, Collation from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference from .context import TermForm, Entity, TermContext diff --git a/rsconcept/backend/cctext/conceptapi.py b/rsconcept/backend/cctext/conceptapi.py index 90caf793..26382739 100644 --- a/rsconcept/backend/cctext/conceptapi.py +++ b/rsconcept/backend/cctext/conceptapi.py @@ -6,15 +6,15 @@ Concept API Python functions. from cctext.rumodel import Morphology from .syntax import RuSyntax from .ruparser import PhraseParser -from .rumodel import split_tags +from .rumodel import split_grams parser = PhraseParser() -def parse(text: str, require_tags: str = '') -> str: +def parse(text: str, require_grams: str = '') -> str: ''' Determine morpho tags for input text. ::returns:: string of comma separated grammar tags or empty string ''' - model = parser.parse(text, require_tags=split_tags(require_tags)) + model = parser.parse(text, require_grams=split_grams(require_grams)) if model is None: return '' result = model.get_morpho().to_text() @@ -41,10 +41,10 @@ def normalize(text: str) -> str: return model.normal_form() -def inflect(text: str, target_tags: str) -> str: +def inflect(text: str, target_grams: str) -> str: ''' Inflect text to match required tags. ::returns:: infected text or initial text if infection failed ''' - target_set = split_tags(target_tags) + target_set = split_grams(target_grams) model = parser.parse(text) if model is None: return text @@ -66,15 +66,15 @@ def inflect_dependant(dependant_normal: str, master: str) -> str: return parser.inflect_dependant(dependant_normal, master) -def match_all_morpho(text: str, filter_tags: str) -> list[list[int]]: +def match_all_morpho(text: str, filter_grams: str) -> list[list[int]]: ''' Search for all words corresponding to tags. ''' - target_set = split_tags(filter_tags) + target_set = split_grams(filter_grams) if len(target_set) == 0: return [] result = [] for elem in RuSyntax.tokenize(text): - model = parser.parse(elem.text, require_tags=target_set) + model = parser.parse(elem.text, require_grams=target_set) if model: result.append([elem.start, elem.stop]) return result diff --git a/rsconcept/backend/cctext/rumodel.py b/rsconcept/backend/cctext/rumodel.py index b9deacd2..3b88304b 100644 --- a/rsconcept/backend/cctext/rumodel.py +++ b/rsconcept/backend/cctext/rumodel.py @@ -8,14 +8,15 @@ from pymorphy2.tagset import OpencorporaTag as WordTag # ''' Morphology parser. ''' morpho = MorphAnalyzer() +Grammemes = Iterable[str] -def split_tags(text: str) -> list[str]: +def split_grams(text: str) -> list[str]: ''' Split grammemes string into set of items. ''' return [tag.strip() for tag in filter(None, text.split(','))] -def combine_tags(tags: Iterable[str]) -> str: +def combine_grams(tags: Iterable[str]) -> str: ''' Combine grammemes into string. ''' return ','.join(tags) @@ -74,9 +75,9 @@ class Morphology: return 'NOUN' return pos - def complete_tags(self, tags: Iterable[str]) -> set[str]: + def complete_grams(self, grams: Iterable[str]) -> set[str]: ''' Add missing tags before inflection. ''' - result = set(tags) + result = set(grams) pos = self.tag.POS if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH): result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB') @@ -100,7 +101,7 @@ class Morphology: result = result.difference(WordTag.GENDERS) return result - def coordination_tags(self) -> set[str]: + def coordination_grams(self) -> set[str]: ''' Return set of grammemes for inflection to keep coordination . ''' result = set() if self.tag.case: @@ -114,4 +115,4 @@ class Morphology: def to_text(self) -> str: ''' Produce string of all grammemes. ''' - return combine_tags(self.tag.grammemes) + return combine_grams(self.tag.grammemes) diff --git a/rsconcept/backend/cctext/ruparser.py b/rsconcept/backend/cctext/ruparser.py index 9bac7b25..6ac70ff1 100644 --- a/rsconcept/backend/cctext/ruparser.py +++ b/rsconcept/backend/cctext/ruparser.py @@ -1,38 +1,36 @@ ''' Parsing russian language using pymorphy2 and natasha libraries. ''' from __future__ import annotations -from typing import Iterable, Optional +from typing import Optional from razdel.substring import Substring as Segment -from pymorphy2.analyzer import Parse as WordForm +from pymorphy2.analyzer import Parse as WordParse from .syntax import RuSyntax, Capitalization -from .rumodel import SemanticRole, Morphology, WordTag, morpho +from .rumodel import SemanticRole, Morphology, WordTag, morpho, Grammemes INDEX_NONE = -1 NO_COORDINATION = -1 WORD_NONE = -1 -Tags = Iterable[str] - class WordToken: ''' Atomic text token. ''' - def __init__(self, segment: Segment, forms: list[WordForm], main_form: int = 0): + def __init__(self, segment: Segment, parse: list[WordParse], main_parse: int = 0): self.segment: Segment = segment - self.forms: list[WordForm] = forms - self.main: int = main_form + self.forms: list[WordParse] = parse + self.main: int = main_parse def get_morpho(self) -> Morphology: ''' Return morphology for current token. ''' - return Morphology(self.get_form().tag) + return Morphology(self.get_parse().tag) - def get_form(self) -> WordForm: + def get_parse(self) -> WordParse: ''' Access main form. ''' return self.forms[self.main] - def inflect(self, inflection_tags: set[str]) -> Optional[WordForm]: + def inflect(self, inflection_grams: set[str]) -> Optional[WordParse]: ''' Apply inflection to segment text. Does not modify forms ''' - inflected = self.get_form().inflect(inflection_tags) + inflected = self.get_parse().inflect(inflection_grams) if not inflected: return None self.segment.text = Capitalization.from_text(self.segment.text).apply_to(inflected.word) @@ -51,9 +49,9 @@ class Collation: ''' Check if data is parsed correctly ''' return self.main_word != WORD_NONE - def get_form(self) -> WordForm: - ''' Access WordForm. ''' - return self.words[self.main_word].get_form() + def get_form(self) -> WordParse: + ''' Access WordParse. ''' + return self.words[self.main_word].get_parse() def get_morpho(self) -> Morphology: ''' Access parsed main mrophology. ''' @@ -64,12 +62,12 @@ class Collation: self.words.append(WordToken(segment, forms, main_form)) self.coordination.append(NO_COORDINATION if not need_coordination else 0) - def inflect(self, target_tags: Tags) -> str: + def inflect(self, target_grams: Grammemes) -> str: ''' Inflect text to match required tags. ''' if self.is_valid(): origin = self.get_morpho() - if not origin.tag.grammemes.issuperset(target_tags): - if self._apply_inflection(origin, target_tags): + if not origin.tag.grammemes.issuperset(target_grams): + if self._apply_inflection(origin, target_grams): return self._generate_text() return self.text @@ -80,7 +78,7 @@ class Collation: if morph.effective_POS: tags = set() tags.add(morph.effective_POS) - tags = morph.complete_tags(tags) + tags = morph.complete_grams(tags) return self.inflect(tags) return self.text @@ -88,8 +86,8 @@ class Collation: ''' Create inflection to coordinate with master_model form. ''' assert self.is_valid() morph = master_model.get_morpho() - tags = morph.coordination_tags() - tags = self.get_morpho().complete_tags(tags) + tags = morph.coordination_grams() + tags = self.get_morpho().complete_grams(tags) return self.inflect(tags) def normal_form(self) -> str: @@ -97,8 +95,8 @@ class Collation: if self.is_valid(): main_form = self.get_form() new_morpho = Morphology(main_form.normalized.tag) - new_tags = new_morpho.complete_tags(frozenset()) - return self.inflect(new_tags) + new_grams = new_morpho.complete_grams(frozenset()) + return self.inflect(new_grams) return self.text def _iterate_coordinated(self): @@ -108,22 +106,22 @@ class Collation: yield self.words[current_word] current_word += self.coordination[current_word] - def _inflect_main_word(self, origin: Morphology, target_tags: Tags) -> Optional[Morphology]: - full_tags = origin.complete_tags(target_tags) - inflected = self.words[self.main_word].inflect(full_tags) + def _inflect_main_word(self, origin: Morphology, target_grams: Grammemes) -> Optional[Morphology]: + full_grams = origin.complete_grams(target_grams) + inflected = self.words[self.main_word].inflect(full_grams) if not inflected: return None return Morphology(inflected.tag) - def _apply_inflection(self, origin: Morphology, target_tags: Tags) -> bool: - new_moprho = self._inflect_main_word(origin, target_tags) + def _apply_inflection(self, origin: Morphology, target_grams: Grammemes) -> bool: + new_moprho = self._inflect_main_word(origin, target_grams) if not new_moprho: return False - inflection_tags = new_moprho.coordination_tags() - if len(inflection_tags) == 0: + inflection_grams = new_moprho.coordination_grams() + if len(inflection_grams) == 0: return True for word in self._iterate_coordinated(): - word.inflect(inflection_tags) + word.inflect(inflection_grams) return True def _generate_text(self) -> str: @@ -156,7 +154,7 @@ class PhraseParser: def parse(self, text: str, require_index: int = INDEX_NONE, - require_tags: Optional[Tags] = None) -> Optional[Collation]: + require_grams: Optional[Grammemes] = None) -> Optional[Collation]: ''' Determine morpho tags for input text. ::returns:: Morphology of a text or None if no suitable form is available @@ -165,9 +163,9 @@ class PhraseParser: if len(segments) == 0: return None elif len(segments) == 1: - return self._parse_single(segments[0], require_index, require_tags) + return self._parse_single(segments[0], require_index, require_grams) else: - return self._parse_multiword(text, segments, require_index, require_tags) + return self._parse_multiword(text, segments, require_index, require_grams) def normalize(self, text: str): ''' Get normal form for target text. ''' @@ -237,7 +235,7 @@ class PhraseParser: return dependant_normal return dependant_model.inflect_dependant(master_model) - def _parse_single(self, segment, require_index: int, require_tags: Optional[Tags]) -> Optional[Collation]: + def _parse_single(self, segment, require_index: int, require_grams: Optional[Grammemes]) -> Optional[Collation]: forms = list(self._filtered_parse(segment.text)) parse_index = INDEX_NONE if len(forms) == 0 or require_index >= len(forms): @@ -245,13 +243,13 @@ class PhraseParser: if require_index != INDEX_NONE: tags = forms[require_index].tag - if require_tags and not tags.grammemes.issuperset(require_tags): + if require_grams and not tags.grammemes.issuperset(require_grams): return None parse_index = require_index else: current_score = 0 for (index, form) in enumerate(forms): - if not require_tags or form.tag.grammemes.issuperset(require_tags): + if not require_grams or form.tag.grammemes.issuperset(require_grams): if form.tag.case == 'nomn': parse_index = index break @@ -270,7 +268,7 @@ class PhraseParser: return result def _parse_multiword(self, text: str, segments: list, require_index: int, - require_tags: Optional[Tags]) -> Optional[Collation]: + require_grams: Optional[Grammemes]) -> Optional[Collation]: result = Collation(text) priority_main: float = self._PRIORITY_NONE segment_index = 0 @@ -280,7 +278,7 @@ class PhraseParser: if main_wait > PhraseParser._MAIN_WAIT_LIMIT: break segment_index += 1 - priority = self._parse_segment(result, segment, require_index, require_tags) + priority = self._parse_segment(result, segment, require_index, require_grams) if priority is None: continue # skip non-parsable entities main_wait += 1 @@ -299,7 +297,7 @@ class PhraseParser: output: Collation, segment: Segment, require_index: int, - require_tags: Optional[Tags]) -> Optional[float]: + require_grams: Optional[Grammemes]) -> Optional[float]: ''' Return priority for this can be a new main word ''' forms = list(self._filtered_parse(segment.text)) if len(forms) == 0: @@ -311,14 +309,14 @@ class PhraseParser: score_sum: float = 0 if require_index != INDEX_NONE: form = forms[require_index] - if not require_tags or form.tag.grammemes.issuperset(require_tags): + if not require_grams or form.tag.grammemes.issuperset(require_grams): (local_max, segment_score) = PhraseParser._get_priorities_for(form.tag) main_index = require_index needs_coordination = Morphology.is_dependable(form.tag.POS) else: local_max = self._PRIORITY_NONE for (index, form) in enumerate(forms): - if require_tags and not form.tag.grammemes.issuperset(require_tags): + if require_grams and not form.tag.grammemes.issuperset(require_grams): continue (local_priority, global_priority) = PhraseParser._get_priorities_for(form.tag) needs_coordination = needs_coordination or Morphology.is_dependable(form.tag.POS) @@ -419,20 +417,20 @@ class PhraseParser: @staticmethod def _parse_word(text: str, require_index: int = INDEX_NONE, - require_tags: Optional[Tags] = None) -> Optional[Morphology]: + require_grams: Optional[Grammemes] = None) -> Optional[Morphology]: parsed_variants = morpho.parse(text) if not parsed_variants or require_index >= len(parsed_variants): return None if require_index != INDEX_NONE: tags = parsed_variants[require_index].tag - if not require_tags or tags.grammemes.issuperset(require_tags): + if not require_grams or tags.grammemes.issuperset(require_grams): return Morphology(tags) else: return None else: for variant in parsed_variants: tags = variant.tag - if not require_tags or tags.grammemes.issuperset(require_tags): + if not require_grams or tags.grammemes.issuperset(require_grams): return Morphology(tags) return None diff --git a/rsconcept/backend/cctext/tests/t_conceptapi.py b/rsconcept/backend/cctext/tests/t_conceptapi.py index 76405b3e..28eaf1e1 100644 --- a/rsconcept/backend/cctext/tests/t_conceptapi.py +++ b/rsconcept/backend/cctext/tests/t_conceptapi.py @@ -7,13 +7,13 @@ import cctext as cc class TestConceptAPI(unittest.TestCase): '''Test class for Concept API.''' def _assert_tags(self, actual: str, expected: str): - self.assertEqual(set(cc.split_tags(actual)), set(cc.split_tags(expected))) + self.assertEqual(set(cc.split_grams(actual)), set(cc.split_grams(expected))) def test_parse(self): ''' Test parsing. ''' self._assert_tags(cc.parse(''), '') self._assert_tags(cc.parse('1'), 'NUMB,intg') - self._assert_tags(cc.parse('слон', require_tags='masc'), 'NOUN,anim,masc,sing,nomn') + self._assert_tags(cc.parse('слон', require_grams='masc'), 'NOUN,anim,masc,sing,nomn') def test_normalize_word(self): ''' Test normalize for single word. ''' diff --git a/rsconcept/backend/cctext/tests/t_rumodel.py b/rsconcept/backend/cctext/tests/t_rumodel.py index c5e80ac7..870960b2 100644 --- a/rsconcept/backend/cctext/tests/t_rumodel.py +++ b/rsconcept/backend/cctext/tests/t_rumodel.py @@ -1,18 +1,18 @@ ''' Unit tests: rumodel. ''' import unittest -from cctext import split_tags, combine_tags +from cctext import split_grams, combine_grams class TestTags(unittest.TestCase): '''Test tags manipulation.''' def test_split_tags(self): - self.assertEqual(split_tags(''), []) - self.assertEqual(split_tags('NOUN'), ['NOUN']) - self.assertEqual(split_tags('NOUN,plur,sing'), ['NOUN','plur','sing']) + self.assertEqual(split_grams(''), []) + self.assertEqual(split_grams('NOUN'), ['NOUN']) + self.assertEqual(split_grams('NOUN,plur,sing'), ['NOUN','plur','sing']) def test_combine_tags(self): - self.assertEqual(combine_tags([]), '') - self.assertEqual(combine_tags(['NOUN']), 'NOUN') - self.assertEqual(combine_tags(['NOUN','plur','sing']), 'NOUN,plur,sing') + self.assertEqual(combine_grams([]), '') + self.assertEqual(combine_grams(['NOUN']), 'NOUN') + self.assertEqual(combine_grams(['NOUN','plur','sing']), 'NOUN,plur,sing') diff --git a/rsconcept/backend/cctext/tests/t_ruparser.py b/rsconcept/backend/cctext/tests/t_ruparser.py index e3a352cb..eab2d77c 100644 --- a/rsconcept/backend/cctext/tests/t_ruparser.py +++ b/rsconcept/backend/cctext/tests/t_ruparser.py @@ -12,8 +12,8 @@ class TestRuParser(unittest.TestCase): def _assert_parse(self, text: str, expected: Iterable[str], require_index: int = -1, - require_tags: Optional[Iterable[str]] = None): - phrase = parser.parse(text, require_index, require_tags) + require_grams: Optional[Iterable[str]] = None): + phrase = parser.parse(text, require_index, require_grams) self.assertIsNotNone(phrase) if phrase: self.assertEqual(phrase.get_morpho().tag.grammemes, set(expected)) @@ -51,10 +51,10 @@ class TestRuParser(unittest.TestCase): self._assert_parse('32-', ['intg', 'NUMB']) self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_index=0) - self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_tags=['masc']) + self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_grams=['masc']) self._assert_parse('прямой', ['ADJF', 'gent', 'sing', 'femn', 'Qual'], require_index=0) self._assert_parse('прямой', ['ADJF', 'datv', 'Qual', 'sing', 'femn'], require_index=1) - self._assert_parse('прямой', ['NOUN', 'sing', 'inan', 'femn', 'gent'], require_tags=['NOUN']) + self._assert_parse('прямой', ['NOUN', 'sing', 'inan', 'femn', 'gent'], require_grams=['NOUN']) self._assert_parse('консистенции', ['NOUN', 'inan', 'femn', 'plur', 'nomn']) self._assert_parse('тест', ['NOUN', 'sing', 'masc', 'inan', 'nomn']) @@ -65,9 +65,9 @@ class TestRuParser(unittest.TestCase): self.assertEqual(parser.parse('КАиП'), None) self.assertEqual(parser.parse('СЛОН'), None) self.assertEqual(parser.parse(''), None) - self.assertEqual(parser.parse('слон', require_tags=set(['femn'])), None) - self.assertEqual(parser.parse('32', require_tags=set(['NOUN'])), None) - self.assertEqual(parser.parse('32-', require_tags=set(['NOUN'])), None) + self.assertEqual(parser.parse('слон', require_grams=set(['femn'])), None) + self.assertEqual(parser.parse('32', require_grams=set(['NOUN'])), None) + self.assertEqual(parser.parse('32-', require_grams=set(['NOUN'])), None) self.assertEqual(parser.parse('слон', require_index=42), None) def test_parse_text(self):