Refactoring: distinguish grammemes and WordTag

This commit is contained in:
IRBorisov 2023-08-20 14:12:09 +03:00
parent 7dbbbab15a
commit b3fd0d9ff7
7 changed files with 75 additions and 76 deletions

View File

@ -1,7 +1,7 @@
''' Concept core text processing library. ''' ''' Concept core text processing library. '''
# pylint: skip-file # pylint: skip-file
from .syntax import RuSyntax, Capitalization from .syntax import RuSyntax, Capitalization
from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_tags, combine_tags from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_grams, combine_grams
from .ruparser import PhraseParser, WordToken, Collation from .ruparser import PhraseParser, WordToken, Collation
from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference
from .context import TermForm, Entity, TermContext from .context import TermForm, Entity, TermContext

View File

@ -6,15 +6,15 @@ Concept API Python functions.
from cctext.rumodel import Morphology from cctext.rumodel import Morphology
from .syntax import RuSyntax from .syntax import RuSyntax
from .ruparser import PhraseParser from .ruparser import PhraseParser
from .rumodel import split_tags from .rumodel import split_grams
parser = PhraseParser() parser = PhraseParser()
def parse(text: str, require_tags: str = '') -> str: def parse(text: str, require_grams: str = '') -> str:
''' Determine morpho tags for input text. ''' Determine morpho tags for input text.
::returns:: string of comma separated grammar tags or empty string ''' ::returns:: string of comma separated grammar tags or empty string '''
model = parser.parse(text, require_tags=split_tags(require_tags)) model = parser.parse(text, require_grams=split_grams(require_grams))
if model is None: if model is None:
return '' return ''
result = model.get_morpho().to_text() result = model.get_morpho().to_text()
@ -41,10 +41,10 @@ def normalize(text: str) -> str:
return model.normal_form() return model.normal_form()
def inflect(text: str, target_tags: str) -> str: def inflect(text: str, target_grams: str) -> str:
''' Inflect text to match required tags. ''' Inflect text to match required tags.
::returns:: inflected text or initial text if inflection failed ''' ::returns:: inflected text or initial text if inflection failed '''
target_set = split_tags(target_tags) target_set = split_grams(target_grams)
model = parser.parse(text) model = parser.parse(text)
if model is None: if model is None:
return text return text
@ -66,15 +66,15 @@ def inflect_dependant(dependant_normal: str, master: str) -> str:
return parser.inflect_dependant(dependant_normal, master) return parser.inflect_dependant(dependant_normal, master)
def match_all_morpho(text: str, filter_tags: str) -> list[list[int]]: def match_all_morpho(text: str, filter_grams: str) -> list[list[int]]:
''' Search for all words corresponding to tags. ''' ''' Search for all words corresponding to tags. '''
target_set = split_tags(filter_tags) target_set = split_grams(filter_grams)
if len(target_set) == 0: if len(target_set) == 0:
return [] return []
result = [] result = []
for elem in RuSyntax.tokenize(text): for elem in RuSyntax.tokenize(text):
model = parser.parse(elem.text, require_tags=target_set) model = parser.parse(elem.text, require_grams=target_set)
if model: if model:
result.append([elem.start, elem.stop]) result.append([elem.start, elem.stop])
return result return result

View File

@ -8,14 +8,15 @@ from pymorphy2.tagset import OpencorporaTag as WordTag
# ''' Morphology parser. ''' # ''' Morphology parser. '''
morpho = MorphAnalyzer() morpho = MorphAnalyzer()
Grammemes = Iterable[str]
def split_tags(text: str) -> list[str]: def split_grams(text: str) -> list[str]:
''' Split grammemes string into list of items. ''' ''' Split grammemes string into list of items. '''
return [tag.strip() for tag in filter(None, text.split(','))] return [tag.strip() for tag in filter(None, text.split(','))]
def combine_tags(tags: Iterable[str]) -> str: def combine_grams(tags: Iterable[str]) -> str:
''' Combine grammemes into string. ''' ''' Combine grammemes into string. '''
return ','.join(tags) return ','.join(tags)
@ -74,9 +75,9 @@ class Morphology:
return 'NOUN' return 'NOUN'
return pos return pos
def complete_tags(self, tags: Iterable[str]) -> set[str]: def complete_grams(self, grams: Iterable[str]) -> set[str]:
''' Add missing tags before inflection. ''' ''' Add missing tags before inflection. '''
result = set(tags) result = set(grams)
pos = self.tag.POS pos = self.tag.POS
if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH): if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB') result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB')
@ -100,7 +101,7 @@ class Morphology:
result = result.difference(WordTag.GENDERS) result = result.difference(WordTag.GENDERS)
return result return result
def coordination_tags(self) -> set[str]: def coordination_grams(self) -> set[str]:
''' Return set of grammemes for inflection to keep coordination. ''' ''' Return set of grammemes for inflection to keep coordination. '''
result = set() result = set()
if self.tag.case: if self.tag.case:
@ -114,4 +115,4 @@ class Morphology:
def to_text(self) -> str: def to_text(self) -> str:
''' Produce string of all grammemes. ''' ''' Produce string of all grammemes. '''
return combine_tags(self.tag.grammemes) return combine_grams(self.tag.grammemes)

View File

@ -1,38 +1,36 @@
''' Parsing russian language using pymorphy2 and natasha libraries. ''' ''' Parsing russian language using pymorphy2 and natasha libraries. '''
from __future__ import annotations from __future__ import annotations
from typing import Iterable, Optional from typing import Optional
from razdel.substring import Substring as Segment from razdel.substring import Substring as Segment
from pymorphy2.analyzer import Parse as WordForm from pymorphy2.analyzer import Parse as WordParse
from .syntax import RuSyntax, Capitalization from .syntax import RuSyntax, Capitalization
from .rumodel import SemanticRole, Morphology, WordTag, morpho from .rumodel import SemanticRole, Morphology, WordTag, morpho, Grammemes
INDEX_NONE = -1 INDEX_NONE = -1
NO_COORDINATION = -1 NO_COORDINATION = -1
WORD_NONE = -1 WORD_NONE = -1
Tags = Iterable[str]
class WordToken: class WordToken:
''' Atomic text token. ''' ''' Atomic text token. '''
def __init__(self, segment: Segment, forms: list[WordForm], main_form: int = 0): def __init__(self, segment: Segment, parse: list[WordParse], main_parse: int = 0):
self.segment: Segment = segment self.segment: Segment = segment
self.forms: list[WordForm] = forms self.forms: list[WordParse] = parse
self.main: int = main_form self.main: int = main_parse
def get_morpho(self) -> Morphology: def get_morpho(self) -> Morphology:
''' Return morphology for current token. ''' ''' Return morphology for current token. '''
return Morphology(self.get_form().tag) return Morphology(self.get_parse().tag)
def get_form(self) -> WordForm: def get_parse(self) -> WordParse:
''' Access main form. ''' ''' Access main form. '''
return self.forms[self.main] return self.forms[self.main]
def inflect(self, inflection_tags: set[str]) -> Optional[WordForm]: def inflect(self, inflection_grams: set[str]) -> Optional[WordParse]:
''' Apply inflection to segment text. Does not modify forms ''' ''' Apply inflection to segment text. Does not modify forms '''
inflected = self.get_form().inflect(inflection_tags) inflected = self.get_parse().inflect(inflection_grams)
if not inflected: if not inflected:
return None return None
self.segment.text = Capitalization.from_text(self.segment.text).apply_to(inflected.word) self.segment.text = Capitalization.from_text(self.segment.text).apply_to(inflected.word)
@ -51,9 +49,9 @@ class Collation:
''' Check if data is parsed correctly ''' ''' Check if data is parsed correctly '''
return self.main_word != WORD_NONE return self.main_word != WORD_NONE
def get_form(self) -> WordForm: def get_form(self) -> WordParse:
''' Access WordForm. ''' ''' Access WordParse. '''
return self.words[self.main_word].get_form() return self.words[self.main_word].get_parse()
def get_morpho(self) -> Morphology: def get_morpho(self) -> Morphology:
''' Access parsed main morphology. ''' ''' Access parsed main morphology. '''
@ -64,12 +62,12 @@ class Collation:
self.words.append(WordToken(segment, forms, main_form)) self.words.append(WordToken(segment, forms, main_form))
self.coordination.append(NO_COORDINATION if not need_coordination else 0) self.coordination.append(NO_COORDINATION if not need_coordination else 0)
def inflect(self, target_tags: Tags) -> str: def inflect(self, target_grams: Grammemes) -> str:
''' Inflect text to match required tags. ''' ''' Inflect text to match required tags. '''
if self.is_valid(): if self.is_valid():
origin = self.get_morpho() origin = self.get_morpho()
if not origin.tag.grammemes.issuperset(target_tags): if not origin.tag.grammemes.issuperset(target_grams):
if self._apply_inflection(origin, target_tags): if self._apply_inflection(origin, target_grams):
return self._generate_text() return self._generate_text()
return self.text return self.text
@ -80,7 +78,7 @@ class Collation:
if morph.effective_POS: if morph.effective_POS:
tags = set() tags = set()
tags.add(morph.effective_POS) tags.add(morph.effective_POS)
tags = morph.complete_tags(tags) tags = morph.complete_grams(tags)
return self.inflect(tags) return self.inflect(tags)
return self.text return self.text
@ -88,8 +86,8 @@ class Collation:
''' Create inflection to coordinate with master_model form. ''' ''' Create inflection to coordinate with master_model form. '''
assert self.is_valid() assert self.is_valid()
morph = master_model.get_morpho() morph = master_model.get_morpho()
tags = morph.coordination_tags() tags = morph.coordination_grams()
tags = self.get_morpho().complete_tags(tags) tags = self.get_morpho().complete_grams(tags)
return self.inflect(tags) return self.inflect(tags)
def normal_form(self) -> str: def normal_form(self) -> str:
@ -97,8 +95,8 @@ class Collation:
if self.is_valid(): if self.is_valid():
main_form = self.get_form() main_form = self.get_form()
new_morpho = Morphology(main_form.normalized.tag) new_morpho = Morphology(main_form.normalized.tag)
new_tags = new_morpho.complete_tags(frozenset()) new_grams = new_morpho.complete_grams(frozenset())
return self.inflect(new_tags) return self.inflect(new_grams)
return self.text return self.text
def _iterate_coordinated(self): def _iterate_coordinated(self):
@ -108,22 +106,22 @@ class Collation:
yield self.words[current_word] yield self.words[current_word]
current_word += self.coordination[current_word] current_word += self.coordination[current_word]
def _inflect_main_word(self, origin: Morphology, target_tags: Tags) -> Optional[Morphology]: def _inflect_main_word(self, origin: Morphology, target_grams: Grammemes) -> Optional[Morphology]:
full_tags = origin.complete_tags(target_tags) full_grams = origin.complete_grams(target_grams)
inflected = self.words[self.main_word].inflect(full_tags) inflected = self.words[self.main_word].inflect(full_grams)
if not inflected: if not inflected:
return None return None
return Morphology(inflected.tag) return Morphology(inflected.tag)
def _apply_inflection(self, origin: Morphology, target_tags: Tags) -> bool: def _apply_inflection(self, origin: Morphology, target_grams: Grammemes) -> bool:
new_moprho = self._inflect_main_word(origin, target_tags) new_moprho = self._inflect_main_word(origin, target_grams)
if not new_moprho: if not new_moprho:
return False return False
inflection_tags = new_moprho.coordination_tags() inflection_grams = new_moprho.coordination_grams()
if len(inflection_tags) == 0: if len(inflection_grams) == 0:
return True return True
for word in self._iterate_coordinated(): for word in self._iterate_coordinated():
word.inflect(inflection_tags) word.inflect(inflection_grams)
return True return True
def _generate_text(self) -> str: def _generate_text(self) -> str:
@ -156,7 +154,7 @@ class PhraseParser:
def parse(self, text: str, def parse(self, text: str,
require_index: int = INDEX_NONE, require_index: int = INDEX_NONE,
require_tags: Optional[Tags] = None) -> Optional[Collation]: require_grams: Optional[Grammemes] = None) -> Optional[Collation]:
''' '''
Determine morpho tags for input text. Determine morpho tags for input text.
::returns:: Morphology of a text or None if no suitable form is available ::returns:: Morphology of a text or None if no suitable form is available
@ -165,9 +163,9 @@ class PhraseParser:
if len(segments) == 0: if len(segments) == 0:
return None return None
elif len(segments) == 1: elif len(segments) == 1:
return self._parse_single(segments[0], require_index, require_tags) return self._parse_single(segments[0], require_index, require_grams)
else: else:
return self._parse_multiword(text, segments, require_index, require_tags) return self._parse_multiword(text, segments, require_index, require_grams)
def normalize(self, text: str): def normalize(self, text: str):
''' Get normal form for target text. ''' ''' Get normal form for target text. '''
@ -237,7 +235,7 @@ class PhraseParser:
return dependant_normal return dependant_normal
return dependant_model.inflect_dependant(master_model) return dependant_model.inflect_dependant(master_model)
def _parse_single(self, segment, require_index: int, require_tags: Optional[Tags]) -> Optional[Collation]: def _parse_single(self, segment, require_index: int, require_grams: Optional[Grammemes]) -> Optional[Collation]:
forms = list(self._filtered_parse(segment.text)) forms = list(self._filtered_parse(segment.text))
parse_index = INDEX_NONE parse_index = INDEX_NONE
if len(forms) == 0 or require_index >= len(forms): if len(forms) == 0 or require_index >= len(forms):
@ -245,13 +243,13 @@ class PhraseParser:
if require_index != INDEX_NONE: if require_index != INDEX_NONE:
tags = forms[require_index].tag tags = forms[require_index].tag
if require_tags and not tags.grammemes.issuperset(require_tags): if require_grams and not tags.grammemes.issuperset(require_grams):
return None return None
parse_index = require_index parse_index = require_index
else: else:
current_score = 0 current_score = 0
for (index, form) in enumerate(forms): for (index, form) in enumerate(forms):
if not require_tags or form.tag.grammemes.issuperset(require_tags): if not require_grams or form.tag.grammemes.issuperset(require_grams):
if form.tag.case == 'nomn': if form.tag.case == 'nomn':
parse_index = index parse_index = index
break break
@ -270,7 +268,7 @@ class PhraseParser:
return result return result
def _parse_multiword(self, text: str, segments: list, require_index: int, def _parse_multiword(self, text: str, segments: list, require_index: int,
require_tags: Optional[Tags]) -> Optional[Collation]: require_grams: Optional[Grammemes]) -> Optional[Collation]:
result = Collation(text) result = Collation(text)
priority_main: float = self._PRIORITY_NONE priority_main: float = self._PRIORITY_NONE
segment_index = 0 segment_index = 0
@ -280,7 +278,7 @@ class PhraseParser:
if main_wait > PhraseParser._MAIN_WAIT_LIMIT: if main_wait > PhraseParser._MAIN_WAIT_LIMIT:
break break
segment_index += 1 segment_index += 1
priority = self._parse_segment(result, segment, require_index, require_tags) priority = self._parse_segment(result, segment, require_index, require_grams)
if priority is None: if priority is None:
continue # skip non-parsable entities continue # skip non-parsable entities
main_wait += 1 main_wait += 1
@ -299,7 +297,7 @@ class PhraseParser:
output: Collation, output: Collation,
segment: Segment, segment: Segment,
require_index: int, require_index: int,
require_tags: Optional[Tags]) -> Optional[float]: require_grams: Optional[Grammemes]) -> Optional[float]:
''' Return priority for this can be a new main word ''' ''' Return priority for this can be a new main word '''
forms = list(self._filtered_parse(segment.text)) forms = list(self._filtered_parse(segment.text))
if len(forms) == 0: if len(forms) == 0:
@ -311,14 +309,14 @@ class PhraseParser:
score_sum: float = 0 score_sum: float = 0
if require_index != INDEX_NONE: if require_index != INDEX_NONE:
form = forms[require_index] form = forms[require_index]
if not require_tags or form.tag.grammemes.issuperset(require_tags): if not require_grams or form.tag.grammemes.issuperset(require_grams):
(local_max, segment_score) = PhraseParser._get_priorities_for(form.tag) (local_max, segment_score) = PhraseParser._get_priorities_for(form.tag)
main_index = require_index main_index = require_index
needs_coordination = Morphology.is_dependable(form.tag.POS) needs_coordination = Morphology.is_dependable(form.tag.POS)
else: else:
local_max = self._PRIORITY_NONE local_max = self._PRIORITY_NONE
for (index, form) in enumerate(forms): for (index, form) in enumerate(forms):
if require_tags and not form.tag.grammemes.issuperset(require_tags): if require_grams and not form.tag.grammemes.issuperset(require_grams):
continue continue
(local_priority, global_priority) = PhraseParser._get_priorities_for(form.tag) (local_priority, global_priority) = PhraseParser._get_priorities_for(form.tag)
needs_coordination = needs_coordination or Morphology.is_dependable(form.tag.POS) needs_coordination = needs_coordination or Morphology.is_dependable(form.tag.POS)
@ -419,20 +417,20 @@ class PhraseParser:
@staticmethod @staticmethod
def _parse_word(text: str, require_index: int = INDEX_NONE, def _parse_word(text: str, require_index: int = INDEX_NONE,
require_tags: Optional[Tags] = None) -> Optional[Morphology]: require_grams: Optional[Grammemes] = None) -> Optional[Morphology]:
parsed_variants = morpho.parse(text) parsed_variants = morpho.parse(text)
if not parsed_variants or require_index >= len(parsed_variants): if not parsed_variants or require_index >= len(parsed_variants):
return None return None
if require_index != INDEX_NONE: if require_index != INDEX_NONE:
tags = parsed_variants[require_index].tag tags = parsed_variants[require_index].tag
if not require_tags or tags.grammemes.issuperset(require_tags): if not require_grams or tags.grammemes.issuperset(require_grams):
return Morphology(tags) return Morphology(tags)
else: else:
return None return None
else: else:
for variant in parsed_variants: for variant in parsed_variants:
tags = variant.tag tags = variant.tag
if not require_tags or tags.grammemes.issuperset(require_tags): if not require_grams or tags.grammemes.issuperset(require_grams):
return Morphology(tags) return Morphology(tags)
return None return None

View File

@ -7,13 +7,13 @@ import cctext as cc
class TestConceptAPI(unittest.TestCase): class TestConceptAPI(unittest.TestCase):
'''Test class for Concept API.''' '''Test class for Concept API.'''
def _assert_tags(self, actual: str, expected: str): def _assert_tags(self, actual: str, expected: str):
self.assertEqual(set(cc.split_tags(actual)), set(cc.split_tags(expected))) self.assertEqual(set(cc.split_grams(actual)), set(cc.split_grams(expected)))
def test_parse(self): def test_parse(self):
''' Test parsing. ''' ''' Test parsing. '''
self._assert_tags(cc.parse(''), '') self._assert_tags(cc.parse(''), '')
self._assert_tags(cc.parse('1'), 'NUMB,intg') self._assert_tags(cc.parse('1'), 'NUMB,intg')
self._assert_tags(cc.parse('слон', require_tags='masc'), 'NOUN,anim,masc,sing,nomn') self._assert_tags(cc.parse('слон', require_grams='masc'), 'NOUN,anim,masc,sing,nomn')
def test_normalize_word(self): def test_normalize_word(self):
''' Test normalize for single word. ''' ''' Test normalize for single word. '''

View File

@ -1,18 +1,18 @@
''' Unit tests: rumodel. ''' ''' Unit tests: rumodel. '''
import unittest import unittest
from cctext import split_tags, combine_tags from cctext import split_grams, combine_grams
class TestTags(unittest.TestCase): class TestTags(unittest.TestCase):
'''Test tags manipulation.''' '''Test tags manipulation.'''
def test_split_tags(self): def test_split_tags(self):
self.assertEqual(split_tags(''), []) self.assertEqual(split_grams(''), [])
self.assertEqual(split_tags('NOUN'), ['NOUN']) self.assertEqual(split_grams('NOUN'), ['NOUN'])
self.assertEqual(split_tags('NOUN,plur,sing'), ['NOUN','plur','sing']) self.assertEqual(split_grams('NOUN,plur,sing'), ['NOUN','plur','sing'])
def test_combine_tags(self): def test_combine_tags(self):
self.assertEqual(combine_tags([]), '') self.assertEqual(combine_grams([]), '')
self.assertEqual(combine_tags(['NOUN']), 'NOUN') self.assertEqual(combine_grams(['NOUN']), 'NOUN')
self.assertEqual(combine_tags(['NOUN','plur','sing']), 'NOUN,plur,sing') self.assertEqual(combine_grams(['NOUN','plur','sing']), 'NOUN,plur,sing')

View File

@ -12,8 +12,8 @@ class TestRuParser(unittest.TestCase):
def _assert_parse(self, text: str, expected: Iterable[str], def _assert_parse(self, text: str, expected: Iterable[str],
require_index: int = -1, require_index: int = -1,
require_tags: Optional[Iterable[str]] = None): require_grams: Optional[Iterable[str]] = None):
phrase = parser.parse(text, require_index, require_tags) phrase = parser.parse(text, require_index, require_grams)
self.assertIsNotNone(phrase) self.assertIsNotNone(phrase)
if phrase: if phrase:
self.assertEqual(phrase.get_morpho().tag.grammemes, set(expected)) self.assertEqual(phrase.get_morpho().tag.grammemes, set(expected))
@ -51,10 +51,10 @@ class TestRuParser(unittest.TestCase):
self._assert_parse('32-', ['intg', 'NUMB']) self._assert_parse('32-', ['intg', 'NUMB'])
self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_index=0) self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_index=0)
self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_tags=['masc']) self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_grams=['masc'])
self._assert_parse('прямой', ['ADJF', 'gent', 'sing', 'femn', 'Qual'], require_index=0) self._assert_parse('прямой', ['ADJF', 'gent', 'sing', 'femn', 'Qual'], require_index=0)
self._assert_parse('прямой', ['ADJF', 'datv', 'Qual', 'sing', 'femn'], require_index=1) self._assert_parse('прямой', ['ADJF', 'datv', 'Qual', 'sing', 'femn'], require_index=1)
self._assert_parse('прямой', ['NOUN', 'sing', 'inan', 'femn', 'gent'], require_tags=['NOUN']) self._assert_parse('прямой', ['NOUN', 'sing', 'inan', 'femn', 'gent'], require_grams=['NOUN'])
self._assert_parse('консистенции', ['NOUN', 'inan', 'femn', 'plur', 'nomn']) self._assert_parse('консистенции', ['NOUN', 'inan', 'femn', 'plur', 'nomn'])
self._assert_parse('тест', ['NOUN', 'sing', 'masc', 'inan', 'nomn']) self._assert_parse('тест', ['NOUN', 'sing', 'masc', 'inan', 'nomn'])
@ -65,9 +65,9 @@ class TestRuParser(unittest.TestCase):
self.assertEqual(parser.parse('КАиП'), None) self.assertEqual(parser.parse('КАиП'), None)
self.assertEqual(parser.parse('СЛОН'), None) self.assertEqual(parser.parse('СЛОН'), None)
self.assertEqual(parser.parse(''), None) self.assertEqual(parser.parse(''), None)
self.assertEqual(parser.parse('слон', require_tags=set(['femn'])), None) self.assertEqual(parser.parse('слон', require_grams=set(['femn'])), None)
self.assertEqual(parser.parse('32', require_tags=set(['NOUN'])), None) self.assertEqual(parser.parse('32', require_grams=set(['NOUN'])), None)
self.assertEqual(parser.parse('32-', require_tags=set(['NOUN'])), None) self.assertEqual(parser.parse('32-', require_grams=set(['NOUN'])), None)
self.assertEqual(parser.parse('слон', require_index=42), None) self.assertEqual(parser.parse('слон', require_index=42), None)
def test_parse_text(self): def test_parse_text(self):