CCText/cctext/rumodel.py

''' Russian language models. '''
from __future__ import annotations
from enum import Enum, unique
from typing import Iterable, Optional

from pymorphy3 import MorphAnalyzer
from pymorphy3.tagset import OpencorporaTag as WordTag

# Morphology parser: single module-level MorphAnalyzer instance created at import time.
morpho = MorphAnalyzer()
# Type alias: any iterable of grammeme strings.
Grammemes = Iterable[str]


def split_grams(text: str) -> list[str]:
    ''' Split comma-separated grammemes string into list of items.

        Every item is stripped of surrounding whitespace. Items that are empty
        or consist only of whitespace are dropped, so the result never
        contains empty strings. Order of items is preserved.
    '''
    # Strip first and filter afterwards: filtering before stripping would let
    # whitespace-only items through as empty strings.
    stripped = (item.strip() for item in text.split(','))
    return [tag for tag in stripped if tag]


def combine_grams(tags: Iterable[str]) -> str:
    ''' Join grammemes into a single comma-separated string. '''
    separator = ','
    combined = separator.join(tags)
    return combined


@unique
class SemanticRole(Enum):
    ''' Semantic types assigned by different parse patterns. '''
    unknwn = 0
    term = 1
    action = 2
    definition = 3

    @staticmethod
    def from_POS(pos: Optional[str]) -> SemanticRole:
        ''' Factory: pick semantic role matching given part of speech.
            Missing or unlisted part of speech yields SemanticRole.unknwn.
        '''
        roles_by_pos = (
            (SemanticRole.term, ('NOUN', 'NPRO')),
            (SemanticRole.action, ('VERB', 'INFN', 'PRTF', 'PRTS')),
            (SemanticRole.definition, ('ADJF', 'ADJS')),
        )
        for role, pos_tags in roles_by_pos:
            if pos in pos_tags:
                return role
        return SemanticRole.unknwn


class Morphology:
    ''' Wrapper for OpencorporaTag expanding functionality for multiword.
        Full morphology tags see http://opencorpora.org/dict.php?act=gram
    '''
    def __init__(self, tag: WordTag, semantic: SemanticRole = SemanticRole.unknwn):
        ''' Store tag and semantic role.
            When role is SemanticRole.unknwn it is derived from tag part of speech.
        '''
        self.tag = tag
        self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_POS(tag.POS)

    # Requested tags that stop completion: no own tags are added besides part of speech.
    _TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ'])

    # Requested tags that prevent inheriting corresponding category from the wrapped tag.
    _TAGS_NO_TENSE = frozenset(['NOUN', 'NPRO', 'ADJF', 'ADJS'])
    _TAGS_NO_CASE = frozenset(['GRND', 'VERB', 'ADJS', 'PRTS'])
    _TAGS_NO_NUMBER = frozenset(['GRND'])
    _TAGS_NO_GENDER = frozenset(['GRND', 'NOUN', 'NPRO', 'plur'])
    _TAGS_NO_PERSON = frozenset(['GRND', 'NOUN', 'ADJF', 'ADJS', 'PRTF', 'PRTS', 'past'])

    # Parts of speech checked by can_coordinate and is_dependable.
    _TAGS_COORDINATED = frozenset(['NOUN', 'NPRO', 'NUMR', 'ADJF', 'ADJS', 'PRTF', 'PRTS'])
    _TAGS_DEPENDABLE = frozenset(['ADJF', 'ADJS', 'PRTF', 'PRTS'])

    @property
    def can_coordinate(self) -> bool:
        ''' Check if coordination can change text. '''
        return self.tag.POS in self._TAGS_COORDINATED

    @staticmethod
    def is_dependable(pos: str) -> bool:
        ''' Check if this morphology can be dependant. '''
        return pos in Morphology._TAGS_DEPENDABLE

    @property
    def effective_POS(self) -> Optional[str]:
        ''' Access part of speech. Pronouns are considered as nouns. '''
        pos: Optional[str] = self.tag.POS
        return 'NOUN' if pos == 'NPRO' else pos

    def complete_grams(self, grams: Iterable[str]) -> set[str]:
        ''' Add missing tags before inflection.

            Starts from requested grams and fills categories that are not
            requested with values of the wrapped tag:
            - part of speech is added when none is requested; INFN is replaced
              by VERB when any other grammeme is requested;
            - if result contains any of _TAGS_IMMUTABLE nothing else is added;
            - case, tense, number, gender and person are inherited from the tag
              unless requested grams already have a value of that category
              or contain a tag from the matching _TAGS_NO_* set;
            - genders are removed from the result when it contains 'plur'.

            Returns new set, input is not modified.
        '''
        result = set(grams)
        pos = self.tag.POS
        if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
            # Infinitive with requested grammemes is inflected as a VERB form.
            result.add(pos if pos != 'INFN' or not result else 'VERB')
        if not result.isdisjoint(self._TAGS_IMMUTABLE):
            return result
        if self.tag.case and result.isdisjoint(WordTag.CASES) and result.isdisjoint(self._TAGS_NO_CASE):
            result.add(self.tag.case)
        if self.tag.tense and result.isdisjoint(WordTag.TENSES) and result.isdisjoint(self._TAGS_NO_TENSE):
            # Own 'past' is not combined with requested person,
            # own 'pres' is not combined with requested gender.
            if (self.tag.tense != 'past' or result.isdisjoint(WordTag.PERSONS)) \
                    and (self.tag.tense != 'pres' or result.isdisjoint(WordTag.GENDERS)):
                result.add(self.tag.tense)
        if self.tag.number and result.isdisjoint(WordTag.NUMBERS) and result.isdisjoint(self._TAGS_NO_NUMBER):
            # Own 'plur' is not combined with requested gender.
            if self.tag.number != 'plur' or result.isdisjoint(WordTag.GENDERS):
                result.add(self.tag.number)
        if self.tag.gender and result.isdisjoint(WordTag.GENDERS) and result.isdisjoint(self._TAGS_NO_GENDER):
            # With 'pres' in result gender is inherited only for PRTF.
            if 'PRTF' in result or 'pres' not in result:
                result.add(self.tag.gender)
        if self.tag.person and result.isdisjoint(WordTag.PERSONS) and result.isdisjoint(self._TAGS_NO_PERSON):
            result.add(self.tag.person)
        if 'plur' in result and not result.isdisjoint(WordTag.GENDERS):
            result = result.difference(WordTag.GENDERS)
        return result

    def coordination_grams(self) -> set[str]:
        ''' Return set of grammemes for inflection to keep coordination.

            Result contains case and number of the wrapped tag when they are set,
            and gender when it is set and number is not 'plur'.
        '''
        result: set[str] = set()
        if self.tag.case:
            result.add(self.tag.case)
        # Number may be missing; add it only when present so that
        # None never gets into the set of grammemes.
        number = self.tag.number
        if number:
            result.add(number)
        if self.tag.gender and 'plur' not in result:
            result.add(self.tag.gender)
        return result

    def to_text(self) -> str:
        ''' Produce string of all grammemes. '''
        # NOTE(review): tag.grammemes is presumably an unordered set in pymorphy,
        # in which case order of items in the output is not guaranteed - confirm.
        return combine_grams(self.tag.grammemes)