''' Russian language models. ''' from __future__ import annotations from enum import Enum, IntEnum, unique from pymorphy2 import MorphAnalyzer from pymorphy2.tagset import OpencorporaTag as WordTag # ''' Morphology parser. ''' morpho = MorphAnalyzer() @unique class NamedEntityRole(IntEnum): ''' Enumerating NER types. ''' unknwn = 0 loc = 1 per = 2 org = 3 @staticmethod def from_str(text: str) -> NamedEntityRole: ''' From text to ID. ''' if text == 'LOC': return NamedEntityRole.loc elif text == 'PER': return NamedEntityRole.per elif text == 'ORG': return NamedEntityRole.org return NamedEntityRole.unknwn def as_str(self) -> str: ''' From ID to text. ''' if self.value == NamedEntityRole.loc: return 'LOC' elif self.value == NamedEntityRole.per: return 'PER' elif self.value == NamedEntityRole.org: return 'ORG' return 'UNKN' @unique class SemanticRole(Enum): ''' Enumerating semantic types for different parse patterns. ''' unknwn = 0 term = 1 action = 2 definition = 3 @staticmethod def from_pos(pos: str) -> SemanticRole: ''' Fabric method to produce types from part of speech. ''' if pos in ['NOUN', 'NPRO']: return SemanticRole.term elif pos in ['VERB', 'INFN', 'PRTF', 'PRTS']: return SemanticRole.action elif pos in ['ADJF', 'ADJS']: return SemanticRole.definition return SemanticRole.unknwn class Morphology: ''' Wrapper for OpencorporaTag expanding functionality for multiword. Full morphology tags see http://opencorpora.org/dict.php?act=gram ''' def __init__(self, tag: WordTag, semantic=SemanticRole.unknwn): self.tag = tag self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_pos(tag.POS) def __del__(self): pass _TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ']) _TAGS_NO_TENSE = frozenset(['NOUN', 'NPRO', 'ADJF', 'ADJS']) _TAGS_NO_CASE = frozenset(['GRND', 'VERB', 'ADJS', 'PRTS']) _TAGS_NO_NUMBER = frozenset(['GRND']) _TAGS_NO_GENDER = frozenset(['GRND', 'NOUN', 'NPRO', 'plur']) _TAGS_NO_PERSON = frozenset(['GRND', 'NOUN', 'ADJF', 'ADJS', 'PRTF', 'PRTS', 'past']) @property def can_coordinate(self) -> bool: ''' Check if coordination can change text. ''' return self.tag.POS in ['NOUN', 'NPRO', 'NUMR', 'ADJF', 'ADJS', 'PRTF', 'PRTS'] @staticmethod def is_dependable(pos: str): ''' Check if this morphology can be dependant. ''' return pos in ['ADJF', 'ADJS', 'PRTF', 'PRTS'] @property def effective_pos(self) -> str: ''' Access part of speech. Pronouns are considered as nouns ''' pos = self.tag.POS if pos and self.tag.POS == 'NPRO': return 'NOUN' return pos def complete_tags(self, tags: frozenset[str]) -> set[str]: ''' Add missing tags before inflection. ''' result = set(tags) pos = self.tag.POS if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH): result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB') if not result.isdisjoint(self._TAGS_IMMUTABLE): return result if self.tag.case and result.isdisjoint(WordTag.CASES) and result.isdisjoint(self._TAGS_NO_CASE): result.add(self.tag.case) if self.tag.tense and result.isdisjoint(WordTag.TENSES) and result.isdisjoint(self._TAGS_NO_TENSE): if (self.tag.tense != 'past' or result.isdisjoint(WordTag.PERSONS)) \ and (self.tag.tense != 'pres' or result.isdisjoint(WordTag.GENDERS)): result.add(self.tag.tense) if self.tag.number and result.isdisjoint(WordTag.NUMBERS) and result.isdisjoint(self._TAGS_NO_NUMBER): if self.tag.number != 'plur' or result.isdisjoint(WordTag.GENDERS): result.add(self.tag.number) if self.tag.gender and result.isdisjoint(WordTag.GENDERS) and result.isdisjoint(self._TAGS_NO_GENDER): if 'PRTF' in result or 'pres' not in result: result.add(self.tag.gender) if self.tag.person and result.isdisjoint(WordTag.PERSONS) and result.isdisjoint(self._TAGS_NO_PERSON): result.add(self.tag.person) if 'plur' in result and not result.isdisjoint(WordTag.GENDERS): result = result.difference(WordTag.GENDERS) return result def coordination_tags(self) -> set[str]: ''' Return set of grammemes for inflection to keep coordination . ''' result = set() if self.tag.case: result.add(self.tag.case) if self.tag: number = self.tag.number result.add(number) if self.tag.gender and 'plur' not in result: result.add(self.tag.gender) return result def as_str(self) -> str: ''' Produce string of all grammemes. ''' grammemes = self.tag.grammemes count = len(grammemes) if count == 0: return '' elif count == 1: return next(iter(grammemes)) else: return ','.join(grammemes)