CCText/cctext/rumodel.py
2024-04-10 20:54:12 +03:00

119 lines
4.6 KiB
Python

''' Russian language models. '''
from __future__ import annotations
from enum import Enum, unique
from typing import Iterable, Optional
from pymorphy3 import MorphAnalyzer
from pymorphy3.tagset import OpencorporaTag as WordTag
# ''' Morphology parser. '''
morpho = MorphAnalyzer()
Grammemes = Iterable[str]
def split_grams(text: str) -> list[str]:
''' Split grammemes string into set of items. '''
return [tag.strip() for tag in filter(None, text.split(','))]
def combine_grams(tags: Iterable[str]) -> str:
''' Combine grammemes into string. '''
return ','.join(tags)
@unique
class SemanticRole(Enum):
''' Enumerating semantic types for different parse patterns. '''
unknwn = 0
term = 1
action = 2
definition = 3
@staticmethod
def from_POS(pos: Optional[str]) -> SemanticRole:
''' Production method: types from part of speech. '''
if pos in ['NOUN', 'NPRO']:
return SemanticRole.term
elif pos in ['VERB', 'INFN', 'PRTF', 'PRTS']:
return SemanticRole.action
elif pos in ['ADJF', 'ADJS']:
return SemanticRole.definition
return SemanticRole.unknwn
class Morphology:
''' Wrapper for OpencorporaTag expanding functionality for multiword.
Full morphology tags see http://opencorpora.org/dict.php?act=gram
'''
def __init__(self, tag: WordTag, semantic=SemanticRole.unknwn):
self.tag = tag
self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_POS(tag.POS)
_TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
_TAGS_NO_TENSE = frozenset(['NOUN', 'NPRO', 'ADJF', 'ADJS'])
_TAGS_NO_CASE = frozenset(['GRND', 'VERB', 'ADJS', 'PRTS'])
_TAGS_NO_NUMBER = frozenset(['GRND'])
_TAGS_NO_GENDER = frozenset(['GRND', 'NOUN', 'NPRO', 'plur'])
_TAGS_NO_PERSON = frozenset(['GRND', 'NOUN', 'ADJF', 'ADJS', 'PRTF', 'PRTS', 'past'])
@property
def can_coordinate(self) -> bool:
''' Check if coordination can change text. '''
return self.tag.POS in ['NOUN', 'NPRO', 'NUMR', 'ADJF', 'ADJS', 'PRTF', 'PRTS']
@staticmethod
def is_dependable(pos: str):
''' Check if this morphology can be dependant. '''
return pos in ['ADJF', 'ADJS', 'PRTF', 'PRTS']
@property
def effective_POS(self) -> Optional[str]:
''' Access part of speech. Pronouns are considered as nouns '''
pos: Optional[str] = self.tag.POS
if pos and self.tag.POS == 'NPRO':
return 'NOUN'
return pos
def complete_grams(self, grams: Iterable[str]) -> set[str]:
''' Add missing tags before inflection. '''
result = set(grams)
pos = self.tag.POS
if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB')
if not result.isdisjoint(self._TAGS_IMMUTABLE):
return result
if self.tag.case and result.isdisjoint(WordTag.CASES) and result.isdisjoint(self._TAGS_NO_CASE):
result.add(self.tag.case)
if self.tag.tense and result.isdisjoint(WordTag.TENSES) and result.isdisjoint(self._TAGS_NO_TENSE):
if (self.tag.tense != 'past' or result.isdisjoint(WordTag.PERSONS)) \
and (self.tag.tense != 'pres' or result.isdisjoint(WordTag.GENDERS)):
result.add(self.tag.tense)
if self.tag.number and result.isdisjoint(WordTag.NUMBERS) and result.isdisjoint(self._TAGS_NO_NUMBER):
if self.tag.number != 'plur' or result.isdisjoint(WordTag.GENDERS):
result.add(self.tag.number)
if self.tag.gender and result.isdisjoint(WordTag.GENDERS) and result.isdisjoint(self._TAGS_NO_GENDER):
if 'PRTF' in result or 'pres' not in result:
result.add(self.tag.gender)
if self.tag.person and result.isdisjoint(WordTag.PERSONS) and result.isdisjoint(self._TAGS_NO_PERSON):
result.add(self.tag.person)
if 'plur' in result and not result.isdisjoint(WordTag.GENDERS):
result = result.difference(WordTag.GENDERS)
return result
def coordination_grams(self) -> set[str]:
''' Return set of grammemes for inflection to keep coordination . '''
result = set()
if self.tag.case:
result.add(self.tag.case)
if self.tag:
number = self.tag.number
result.add(number)
if self.tag.gender and 'plur' not in result:
result.add(self.tag.gender)
return result
def to_text(self) -> str:
''' Produce string of all grammemes. '''
return combine_grams(self.tag.grammemes)