mirror of
https://github.com/IRBorisov/cctext.git
synced 2025-06-25 21:20:36 +03:00
119 lines
4.6 KiB
Python
119 lines
4.6 KiB
Python
''' Russian language models. '''
|
|
from __future__ import annotations
|
|
from enum import Enum, unique
|
|
from typing import Iterable, Optional
|
|
|
|
from pymorphy3 import MorphAnalyzer
|
|
from pymorphy3.tagset import OpencorporaTag as WordTag
|
|
|
|
# Morphology parser: module-level pymorphy3 MorphAnalyzer instance, constructed once when this module is imported.
|
|
morpho = MorphAnalyzer()
|
|
# Type alias: any iterable of grammeme strings.
Grammemes = Iterable[str]
|
|
|
|
|
|
def split_grams(text: str) -> list[str]:
    ''' Split a comma-separated grammemes string into a list of items.

    Every item is stripped of surrounding whitespace. Items that are empty
    after stripping (an empty input, a trailing comma, or a whitespace-only
    piece such as the middle of 'a, ,b') are dropped, so the result never
    contains empty strings.
    '''
    # Strip first, then filter: filtering before stripping would let
    # whitespace-only pieces through as empty strings.
    stripped = (piece.strip() for piece in text.split(','))
    return [piece for piece in stripped if piece]
|
|
|
|
|
|
def combine_grams(tags: Iterable[str]) -> str:
    ''' Join grammemes into a single comma-separated string. '''
    separator = ','
    return separator.join(tags)
|
|
|
|
|
|
@unique
class SemanticRole(Enum):
    ''' Semantic categories used by the different parse patterns. '''
    unknwn = 0
    term = 1
    action = 2
    definition = 3

    @staticmethod
    def from_POS(pos: Optional[str]) -> SemanticRole:
        ''' Factory: pick the semantic role matching a part of speech.

        Returns SemanticRole.unknwn for None or any tag not listed below.
        '''
        # Checked top to bottom; the first group containing pos wins.
        roles_by_pos = (
            (('NOUN', 'NPRO'), SemanticRole.term),
            (('VERB', 'INFN', 'PRTF', 'PRTS'), SemanticRole.action),
            (('ADJF', 'ADJS'), SemanticRole.definition),
        )
        for pos_group, role in roles_by_pos:
            if pos in pos_group:
                return role
        return SemanticRole.unknwn
|
|
|
|
|
|
class Morphology:
    ''' Wrapper for OpencorporaTag expanding functionality for multiword.

    Full morphology tags see http://opencorpora.org/dict.php?act=gram

    Attributes:
        tag: wrapped word tag; its POS, case, tense, number, gender, person
            and grammemes attributes are read by the methods of this class.
        semantic: SemanticRole assigned to the word.
    '''
    def __init__(self, tag: WordTag, semantic=SemanticRole.unknwn):
        ''' Store the tag and its semantic role.

        When semantic equals SemanticRole.unknwn (the default) the role is
        derived from the tag part of speech via SemanticRole.from_POS.
        '''
        self.tag = tag
        self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_POS(tag.POS)

    # If the requested grammemes contain any of these, complete_grams() returns
    # right after the part-of-speech step without adding other categories.
    _TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ'])

    # If the requested grammemes contain any of these, complete_grams() does not
    # copy the corresponding category (tense/case/number/gender/person) from the tag.
    _TAGS_NO_TENSE = frozenset(['NOUN', 'NPRO', 'ADJF', 'ADJS'])
    _TAGS_NO_CASE = frozenset(['GRND', 'VERB', 'ADJS', 'PRTS'])
    _TAGS_NO_NUMBER = frozenset(['GRND'])
    _TAGS_NO_GENDER = frozenset(['GRND', 'NOUN', 'NPRO', 'plur'])
    _TAGS_NO_PERSON = frozenset(['GRND', 'NOUN', 'ADJF', 'ADJS', 'PRTF', 'PRTS', 'past'])

    # Parts of speech accepted by can_coordinate and is_dependable respectively.
    _POS_COORDINATED = frozenset(['NOUN', 'NPRO', 'NUMR', 'ADJF', 'ADJS', 'PRTF', 'PRTS'])
    _POS_DEPENDABLE = frozenset(['ADJF', 'ADJS', 'PRTF', 'PRTS'])

    @property
    def can_coordinate(self) -> bool:
        ''' Check if coordination can change text. '''
        return self.tag.POS in self._POS_COORDINATED

    @staticmethod
    def is_dependable(pos: str) -> bool:
        ''' Check if a word with the given part of speech can be dependant. '''
        return pos in Morphology._POS_DEPENDABLE

    @property
    def effective_POS(self) -> Optional[str]:
        ''' Access part of speech. Pronouns ('NPRO') are reported as 'NOUN'. '''
        pos: Optional[str] = self.tag.POS
        return 'NOUN' if pos == 'NPRO' else pos

    def complete_grams(self, grams: Iterable[str]) -> set[str]:
        ''' Add missing tags before inflection.

        Starts from the requested grammemes and copies part of speech, case,
        tense, number, gender and person from the wrapped tag whenever the
        request does not already name a value of that category and contains
        no grammeme that excludes the category (see the _TAGS_NO_* sets).

        Args:
            grams: requested grammemes.

        Returns:
            A new set holding the requested grammemes plus those taken from
            the tag. The input iterable is not modified.
        '''
        result = set(grams)
        tag = self.tag
        pos = tag.POS
        if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
            # 'INFN' is kept only for an empty request; as soon as other
            # grammemes are requested it is completed as 'VERB' instead.
            result.add(pos if pos != 'INFN' or not result else 'VERB')
        if not result.isdisjoint(self._TAGS_IMMUTABLE):
            return result
        if tag.case and result.isdisjoint(WordTag.CASES) and result.isdisjoint(self._TAGS_NO_CASE):
            result.add(tag.case)
        if tag.tense and result.isdisjoint(WordTag.TENSES) and result.isdisjoint(self._TAGS_NO_TENSE):
            # 'past' is not added next to a requested person,
            # 'pres' is not added next to a requested gender.
            if (tag.tense != 'past' or result.isdisjoint(WordTag.PERSONS)) \
                    and (tag.tense != 'pres' or result.isdisjoint(WordTag.GENDERS)):
                result.add(tag.tense)
        if tag.number and result.isdisjoint(WordTag.NUMBERS) and result.isdisjoint(self._TAGS_NO_NUMBER):
            # 'plur' is not added next to a requested gender.
            if tag.number != 'plur' or result.isdisjoint(WordTag.GENDERS):
                result.add(tag.number)
        if tag.gender and result.isdisjoint(WordTag.GENDERS) and result.isdisjoint(self._TAGS_NO_GENDER):
            # Gender is not added to a 'pres' request unless 'PRTF' is present too.
            if 'PRTF' in result or 'pres' not in result:
                result.add(tag.gender)
        if tag.person and result.isdisjoint(WordTag.PERSONS) and result.isdisjoint(self._TAGS_NO_PERSON):
            result.add(tag.person)
        # Final clean-up: a plural result never carries a gender grammeme.
        if 'plur' in result and not result.isdisjoint(WordTag.GENDERS):
            result = result.difference(WordTag.GENDERS)
        return result

    def coordination_grams(self) -> set[str]:
        ''' Return set of grammemes for inflection to keep coordination.

        Collects case, number and gender of the wrapped tag, skipping every
        category the tag does not define. Gender is left out when the number
        is 'plur'.
        '''
        tag = self.tag
        result: set[str] = set()
        if tag.case:
            result.add(tag.case)
        # NOTE(review): truthiness guard kept from the original code; presumably
        # false only for a tag without grammemes - confirm against OpencorporaTag.
        if tag:
            # Add the number only when it is set, mirroring the case and gender
            # checks, so a missing (falsy) number never enters the grammeme set.
            if tag.number:
                result.add(tag.number)
            if tag.gender and 'plur' not in result:
                result.add(tag.gender)
        return result

    def to_text(self) -> str:
        ''' Produce a comma-separated string of all grammemes of the tag. '''
        return combine_grams(self.tag.grammemes)
|