Mirror of https://github.com/IRBorisov/ConceptPortal.git
Refactoring: distinguish grammemes and WordTag
This commit is contained in:
parent 7dbbbab15a
commit b3fd0d9ff7
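In short, the change is a consistent rename: every helper and parameter that carries plain grammeme strings or sets moves from *_tags to *_grams naming (split_tags becomes split_grams, combine_tags becomes combine_grams, complete_tags becomes complete_grams, coordination_tags becomes coordination_grams, and require_tags/target_tags/filter_tags become require_grams/target_grams/filter_grams), the local Tags alias in ruparser is replaced by a shared Grammemes alias defined in rumodel, and the pymorphy2 Parse alias WordForm becomes WordParse, so that grammeme collections are no longer conflated with pymorphy2 WordTag objects. Behaviour is unchanged; only names move.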
@@ -1,7 +1,7 @@
 ''' Concept core text processing library. '''
 # pylint: skip-file
 from .syntax import RuSyntax, Capitalization
-from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_tags, combine_tags
+from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_grams, combine_grams
 from .ruparser import PhraseParser, WordToken, Collation
 from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference
 from .context import TermForm, Entity, TermContext
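For orientation, a minimal sketch of the renamed helpers exported above; it assumes the cctext package from this repository is importable, and the results follow from the split_grams/combine_grams implementations shown further down:

from cctext import split_grams, combine_grams

grams = split_grams('NOUN, plur, nomn')   # -> ['NOUN', 'plur', 'nomn'] (splits on commas, strips whitespace)
print(combine_grams(grams))               # -> 'NOUN,plur,nomn' (joins back with commas, no spaces)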
@@ -6,15 +6,15 @@ Concept API Python functions.
 from cctext.rumodel import Morphology
 from .syntax import RuSyntax
 from .ruparser import PhraseParser
-from .rumodel import split_tags
+from .rumodel import split_grams

 parser = PhraseParser()


-def parse(text: str, require_tags: str = '') -> str:
+def parse(text: str, require_grams: str = '') -> str:
     ''' Determine morpho tags for input text.
     ::returns:: string of comma separated grammar tags or empty string '''
-    model = parser.parse(text, require_tags=split_tags(require_tags))
+    model = parser.parse(text, require_grams=split_grams(require_grams))
     if model is None:
         return ''
     result = model.get_morpho().to_text()
@@ -41,10 +41,10 @@ def normalize(text: str) -> str:
     return model.normal_form()


-def inflect(text: str, target_tags: str) -> str:
+def inflect(text: str, target_grams: str) -> str:
     ''' Inflect text to match required tags.
     ::returns:: inflected text or initial text if inflection failed '''
-    target_set = split_tags(target_tags)
+    target_set = split_grams(target_grams)
     model = parser.parse(text)
     if model is None:
         return text
@@ -66,15 +66,15 @@ def inflect_dependant(dependant_normal: str, master: str) -> str:
     return parser.inflect_dependant(dependant_normal, master)


-def match_all_morpho(text: str, filter_tags: str) -> list[list[int]]:
+def match_all_morpho(text: str, filter_grams: str) -> list[list[int]]:
     ''' Search for all words corresponding to tags. '''
-    target_set = split_tags(filter_tags)
+    target_set = split_grams(filter_grams)
     if len(target_set) == 0:
         return []

     result = []
     for elem in RuSyntax.tokenize(text):
-        model = parser.parse(elem.text, require_tags=target_set)
+        model = parser.parse(elem.text, require_grams=target_set)
         if model:
             result.append([elem.start, elem.stop])
     return result
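A hedged usage sketch of this module-level API after the rename; the parse() result matches the unit tests at the bottom of the diff, while the inflect() and match_all_morpho() outputs are illustrative assumptions about pymorphy2 behaviour rather than values asserted here:

import cctext as cc

print(cc.parse('слон', require_grams='masc'))       # 'NOUN,anim,masc,sing,nomn' (grammeme order may vary)
print(cc.inflect('слон', 'plur,datv'))              # presumably an inflected form such as 'слонам'
print(cc.match_all_morpho('большой слон', 'NOUN'))  # [start, stop] ranges of words whose grammemes include NOUN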
@@ -8,14 +8,15 @@ from pymorphy2.tagset import OpencorporaTag as WordTag

 # ''' Morphology parser. '''
 morpho = MorphAnalyzer()
+Grammemes = Iterable[str]


-def split_tags(text: str) -> list[str]:
+def split_grams(text: str) -> list[str]:
     ''' Split grammemes string into set of items. '''
     return [tag.strip() for tag in filter(None, text.split(','))]


-def combine_tags(tags: Iterable[str]) -> str:
+def combine_grams(tags: Iterable[str]) -> str:
     ''' Combine grammemes into string. '''
     return ','.join(tags)
@@ -74,9 +75,9 @@ class Morphology:
             return 'NOUN'
         return pos

-    def complete_tags(self, tags: Iterable[str]) -> set[str]:
+    def complete_grams(self, grams: Iterable[str]) -> set[str]:
         ''' Add missing tags before inflection. '''
-        result = set(tags)
+        result = set(grams)
         pos = self.tag.POS
         if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
             result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB')
@@ -100,7 +101,7 @@ class Morphology:
             result = result.difference(WordTag.GENDERS)
         return result

-    def coordination_tags(self) -> set[str]:
+    def coordination_grams(self) -> set[str]:
         ''' Return set of grammemes for inflection to keep coordination. '''
         result = set()
         if self.tag.case:
@@ -114,4 +115,4 @@ class Morphology:

     def to_text(self) -> str:
         ''' Produce string of all grammemes. '''
-        return combine_tags(self.tag.grammemes)
+        return combine_grams(self.tag.grammemes)
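To make the distinction concrete, a small sketch (assuming pymorphy2 dictionaries are installed) of a WordTag versus the grammemes it contains; Morphology, morpho and split_grams are exactly the names exported by the package in this diff:

from cctext import Morphology, morpho, split_grams

tag = morpho.parse('слон')[0].tag   # pymorphy2 OpencorporaTag, aliased as WordTag in rumodel
model = Morphology(tag)
as_text = model.to_text()           # comma-separated grammemes, e.g. 'NOUN,anim,masc,sing,nomn'
assert set(split_grams(as_text)) == set(tag.grammemes)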
@@ -1,38 +1,36 @@
 ''' Parsing russian language using pymorphy2 and natasha libraries. '''
 from __future__ import annotations
-from typing import Iterable, Optional
+from typing import Optional

 from razdel.substring import Substring as Segment
-from pymorphy2.analyzer import Parse as WordForm
+from pymorphy2.analyzer import Parse as WordParse

 from .syntax import RuSyntax, Capitalization
-from .rumodel import SemanticRole, Morphology, WordTag, morpho
+from .rumodel import SemanticRole, Morphology, WordTag, morpho, Grammemes

 INDEX_NONE = -1
 NO_COORDINATION = -1
 WORD_NONE = -1

-Tags = Iterable[str]
-

 class WordToken:
     ''' Atomic text token. '''
-    def __init__(self, segment: Segment, forms: list[WordForm], main_form: int = 0):
+    def __init__(self, segment: Segment, parse: list[WordParse], main_parse: int = 0):
         self.segment: Segment = segment
-        self.forms: list[WordForm] = forms
-        self.main: int = main_form
+        self.forms: list[WordParse] = parse
+        self.main: int = main_parse

     def get_morpho(self) -> Morphology:
         ''' Return morphology for current token. '''
-        return Morphology(self.get_form().tag)
+        return Morphology(self.get_parse().tag)

-    def get_form(self) -> WordForm:
+    def get_parse(self) -> WordParse:
         ''' Access main form. '''
         return self.forms[self.main]

-    def inflect(self, inflection_tags: set[str]) -> Optional[WordForm]:
+    def inflect(self, inflection_grams: set[str]) -> Optional[WordParse]:
         ''' Apply inflection to segment text. Does not modify forms '''
-        inflected = self.get_form().inflect(inflection_tags)
+        inflected = self.get_parse().inflect(inflection_grams)
         if not inflected:
             return None
         self.segment.text = Capitalization.from_text(self.segment.text).apply_to(inflected.word)
@@ -51,9 +49,9 @@ class Collation:
         ''' Check if data is parsed correctly '''
         return self.main_word != WORD_NONE

-    def get_form(self) -> WordForm:
-        ''' Access WordForm. '''
-        return self.words[self.main_word].get_form()
+    def get_form(self) -> WordParse:
+        ''' Access WordParse. '''
+        return self.words[self.main_word].get_parse()

     def get_morpho(self) -> Morphology:
         ''' Access parsed main morphology. '''
@@ -64,12 +62,12 @@ class Collation:
         self.words.append(WordToken(segment, forms, main_form))
         self.coordination.append(NO_COORDINATION if not need_coordination else 0)

-    def inflect(self, target_tags: Tags) -> str:
+    def inflect(self, target_grams: Grammemes) -> str:
         ''' Inflect text to match required tags. '''
         if self.is_valid():
             origin = self.get_morpho()
-            if not origin.tag.grammemes.issuperset(target_tags):
-                if self._apply_inflection(origin, target_tags):
+            if not origin.tag.grammemes.issuperset(target_grams):
+                if self._apply_inflection(origin, target_grams):
                     return self._generate_text()
         return self.text
@@ -80,7 +78,7 @@ class Collation:
         if morph.effective_POS:
             tags = set()
             tags.add(morph.effective_POS)
-            tags = morph.complete_tags(tags)
+            tags = morph.complete_grams(tags)
             return self.inflect(tags)
         return self.text
@@ -88,8 +86,8 @@ class Collation:
         ''' Create inflection to coordinate with master_model form. '''
         assert self.is_valid()
         morph = master_model.get_morpho()
-        tags = morph.coordination_tags()
-        tags = self.get_morpho().complete_tags(tags)
+        tags = morph.coordination_grams()
+        tags = self.get_morpho().complete_grams(tags)
         return self.inflect(tags)

     def normal_form(self) -> str:
@@ -97,8 +95,8 @@ class Collation:
         if self.is_valid():
             main_form = self.get_form()
             new_morpho = Morphology(main_form.normalized.tag)
-            new_tags = new_morpho.complete_tags(frozenset())
-            return self.inflect(new_tags)
+            new_grams = new_morpho.complete_grams(frozenset())
+            return self.inflect(new_grams)
         return self.text

     def _iterate_coordinated(self):
@@ -108,22 +106,22 @@ class Collation:
             yield self.words[current_word]
             current_word += self.coordination[current_word]

-    def _inflect_main_word(self, origin: Morphology, target_tags: Tags) -> Optional[Morphology]:
-        full_tags = origin.complete_tags(target_tags)
-        inflected = self.words[self.main_word].inflect(full_tags)
+    def _inflect_main_word(self, origin: Morphology, target_grams: Grammemes) -> Optional[Morphology]:
+        full_grams = origin.complete_grams(target_grams)
+        inflected = self.words[self.main_word].inflect(full_grams)
         if not inflected:
             return None
         return Morphology(inflected.tag)

-    def _apply_inflection(self, origin: Morphology, target_tags: Tags) -> bool:
-        new_moprho = self._inflect_main_word(origin, target_tags)
+    def _apply_inflection(self, origin: Morphology, target_grams: Grammemes) -> bool:
+        new_moprho = self._inflect_main_word(origin, target_grams)
         if not new_moprho:
             return False
-        inflection_tags = new_moprho.coordination_tags()
-        if len(inflection_tags) == 0:
+        inflection_grams = new_moprho.coordination_grams()
+        if len(inflection_grams) == 0:
             return True
         for word in self._iterate_coordinated():
-            word.inflect(inflection_tags)
+            word.inflect(inflection_grams)
         return True

     def _generate_text(self) -> str:
@@ -156,7 +154,7 @@ class PhraseParser:

     def parse(self, text: str,
               require_index: int = INDEX_NONE,
-              require_tags: Optional[Tags] = None) -> Optional[Collation]:
+              require_grams: Optional[Grammemes] = None) -> Optional[Collation]:
         '''
         Determine morpho tags for input text.
         ::returns:: Morphology of a text or None if no suitable form is available
@@ -165,9 +163,9 @@ class PhraseParser:
         if len(segments) == 0:
             return None
         elif len(segments) == 1:
-            return self._parse_single(segments[0], require_index, require_tags)
+            return self._parse_single(segments[0], require_index, require_grams)
         else:
-            return self._parse_multiword(text, segments, require_index, require_tags)
+            return self._parse_multiword(text, segments, require_index, require_grams)

     def normalize(self, text: str):
         ''' Get normal form for target text. '''
@@ -237,7 +235,7 @@ class PhraseParser:
             return dependant_normal
         return dependant_model.inflect_dependant(master_model)

-    def _parse_single(self, segment, require_index: int, require_tags: Optional[Tags]) -> Optional[Collation]:
+    def _parse_single(self, segment, require_index: int, require_grams: Optional[Grammemes]) -> Optional[Collation]:
         forms = list(self._filtered_parse(segment.text))
         parse_index = INDEX_NONE
         if len(forms) == 0 or require_index >= len(forms):
@@ -245,13 +243,13 @@ class PhraseParser:

         if require_index != INDEX_NONE:
             tags = forms[require_index].tag
-            if require_tags and not tags.grammemes.issuperset(require_tags):
+            if require_grams and not tags.grammemes.issuperset(require_grams):
                 return None
             parse_index = require_index
         else:
             current_score = 0
             for (index, form) in enumerate(forms):
-                if not require_tags or form.tag.grammemes.issuperset(require_tags):
+                if not require_grams or form.tag.grammemes.issuperset(require_grams):
                     if form.tag.case == 'nomn':
                         parse_index = index
                         break
@@ -270,7 +268,7 @@ class PhraseParser:
         return result

     def _parse_multiword(self, text: str, segments: list, require_index: int,
-                         require_tags: Optional[Tags]) -> Optional[Collation]:
+                         require_grams: Optional[Grammemes]) -> Optional[Collation]:
         result = Collation(text)
         priority_main: float = self._PRIORITY_NONE
         segment_index = 0
@@ -280,7 +278,7 @@ class PhraseParser:
             if main_wait > PhraseParser._MAIN_WAIT_LIMIT:
                 break
             segment_index += 1
-            priority = self._parse_segment(result, segment, require_index, require_tags)
+            priority = self._parse_segment(result, segment, require_index, require_grams)
             if priority is None:
                 continue  # skip non-parsable entities
             main_wait += 1
@@ -299,7 +297,7 @@ class PhraseParser:
                        output: Collation,
                        segment: Segment,
                        require_index: int,
-                       require_tags: Optional[Tags]) -> Optional[float]:
+                       require_grams: Optional[Grammemes]) -> Optional[float]:
         ''' Return priority for this can be a new main word '''
         forms = list(self._filtered_parse(segment.text))
         if len(forms) == 0:
@@ -311,14 +309,14 @@ class PhraseParser:
         score_sum: float = 0
         if require_index != INDEX_NONE:
             form = forms[require_index]
-            if not require_tags or form.tag.grammemes.issuperset(require_tags):
+            if not require_grams or form.tag.grammemes.issuperset(require_grams):
                 (local_max, segment_score) = PhraseParser._get_priorities_for(form.tag)
                 main_index = require_index
                 needs_coordination = Morphology.is_dependable(form.tag.POS)
         else:
             local_max = self._PRIORITY_NONE
             for (index, form) in enumerate(forms):
-                if require_tags and not form.tag.grammemes.issuperset(require_tags):
+                if require_grams and not form.tag.grammemes.issuperset(require_grams):
                     continue
                 (local_priority, global_priority) = PhraseParser._get_priorities_for(form.tag)
                 needs_coordination = needs_coordination or Morphology.is_dependable(form.tag.POS)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse_word(text: str, require_index: int = INDEX_NONE,
|
def _parse_word(text: str, require_index: int = INDEX_NONE,
|
||||||
require_tags: Optional[Tags] = None) -> Optional[Morphology]:
|
require_grams: Optional[Grammemes] = None) -> Optional[Morphology]:
|
||||||
parsed_variants = morpho.parse(text)
|
parsed_variants = morpho.parse(text)
|
||||||
if not parsed_variants or require_index >= len(parsed_variants):
|
if not parsed_variants or require_index >= len(parsed_variants):
|
||||||
return None
|
return None
|
||||||
if require_index != INDEX_NONE:
|
if require_index != INDEX_NONE:
|
||||||
tags = parsed_variants[require_index].tag
|
tags = parsed_variants[require_index].tag
|
||||||
if not require_tags or tags.grammemes.issuperset(require_tags):
|
if not require_grams or tags.grammemes.issuperset(require_grams):
|
||||||
return Morphology(tags)
|
return Morphology(tags)
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
for variant in parsed_variants:
|
for variant in parsed_variants:
|
||||||
tags = variant.tag
|
tags = variant.tag
|
||||||
if not require_tags or tags.grammemes.issuperset(require_tags):
|
if not require_grams or tags.grammemes.issuperset(require_grams):
|
||||||
return Morphology(tags)
|
return Morphology(tags)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
|
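The parser-level renames above are mechanical, so here is a standalone sketch, using pymorphy2 directly rather than this repository's code, of the two notions the commit keeps apart: a Parse object (what ruparser now calls WordParse) versus the plain grammeme set handed to inflect(); the example word and outputs are assumptions, not values taken from this diff:

from pymorphy2 import MorphAnalyzer

analyzer = MorphAnalyzer()
word_parse = analyzer.parse('слона')[0]           # a pymorphy2 Parse: word form plus tag, i.e. a WordParse
grammemes = word_parse.tag.grammemes              # frozenset of strings, the kind of value Grammemes now names
inflected = word_parse.inflect({'plur', 'nomn'})  # inflection is driven by a grammeme set; returns Parse or None
if inflected is not None:
    print(inflected.word)                         # expected to print something like 'слоны'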
@@ -7,13 +7,13 @@ import cctext as cc
 class TestConceptAPI(unittest.TestCase):
     '''Test class for Concept API.'''
     def _assert_tags(self, actual: str, expected: str):
-        self.assertEqual(set(cc.split_tags(actual)), set(cc.split_tags(expected)))
+        self.assertEqual(set(cc.split_grams(actual)), set(cc.split_grams(expected)))

     def test_parse(self):
         ''' Test parsing. '''
         self._assert_tags(cc.parse(''), '')
         self._assert_tags(cc.parse('1'), 'NUMB,intg')
-        self._assert_tags(cc.parse('слон', require_tags='masc'), 'NOUN,anim,masc,sing,nomn')
+        self._assert_tags(cc.parse('слон', require_grams='masc'), 'NOUN,anim,masc,sing,nomn')

     def test_normalize_word(self):
         ''' Test normalize for single word. '''
@@ -1,18 +1,18 @@
 ''' Unit tests: rumodel. '''
 import unittest

-from cctext import split_tags, combine_tags
+from cctext import split_grams, combine_grams


 class TestTags(unittest.TestCase):
     '''Test tags manipulation.'''

     def test_split_tags(self):
-        self.assertEqual(split_tags(''), [])
-        self.assertEqual(split_tags('NOUN'), ['NOUN'])
-        self.assertEqual(split_tags('NOUN,plur,sing'), ['NOUN','plur','sing'])
+        self.assertEqual(split_grams(''), [])
+        self.assertEqual(split_grams('NOUN'), ['NOUN'])
+        self.assertEqual(split_grams('NOUN,plur,sing'), ['NOUN','plur','sing'])

     def test_combine_tags(self):
-        self.assertEqual(combine_tags([]), '')
-        self.assertEqual(combine_tags(['NOUN']), 'NOUN')
-        self.assertEqual(combine_tags(['NOUN','plur','sing']), 'NOUN,plur,sing')
+        self.assertEqual(combine_grams([]), '')
+        self.assertEqual(combine_grams(['NOUN']), 'NOUN')
+        self.assertEqual(combine_grams(['NOUN','plur','sing']), 'NOUN,plur,sing')
@@ -12,8 +12,8 @@ class TestRuParser(unittest.TestCase):

     def _assert_parse(self, text: str, expected: Iterable[str],
                       require_index: int = -1,
-                      require_tags: Optional[Iterable[str]] = None):
-        phrase = parser.parse(text, require_index, require_tags)
+                      require_grams: Optional[Iterable[str]] = None):
+        phrase = parser.parse(text, require_index, require_grams)
         self.assertIsNotNone(phrase)
         if phrase:
             self.assertEqual(phrase.get_morpho().tag.grammemes, set(expected))
@@ -51,10 +51,10 @@ class TestRuParser(unittest.TestCase):
         self._assert_parse('32-', ['intg', 'NUMB'])

         self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_index=0)
-        self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_tags=['masc'])
+        self._assert_parse('слон', ['NOUN', 'anim', 'masc', 'sing', 'nomn'], require_grams=['masc'])
         self._assert_parse('прямой', ['ADJF', 'gent', 'sing', 'femn', 'Qual'], require_index=0)
         self._assert_parse('прямой', ['ADJF', 'datv', 'Qual', 'sing', 'femn'], require_index=1)
-        self._assert_parse('прямой', ['NOUN', 'sing', 'inan', 'femn', 'gent'], require_tags=['NOUN'])
+        self._assert_parse('прямой', ['NOUN', 'sing', 'inan', 'femn', 'gent'], require_grams=['NOUN'])

         self._assert_parse('консистенции', ['NOUN', 'inan', 'femn', 'plur', 'nomn'])
         self._assert_parse('тест', ['NOUN', 'sing', 'masc', 'inan', 'nomn'])
@@ -65,9 +65,9 @@ class TestRuParser(unittest.TestCase):
         self.assertEqual(parser.parse('КАиП'), None)
         self.assertEqual(parser.parse('СЛОН'), None)
         self.assertEqual(parser.parse(''), None)
-        self.assertEqual(parser.parse('слон', require_tags=set(['femn'])), None)
-        self.assertEqual(parser.parse('32', require_tags=set(['NOUN'])), None)
-        self.assertEqual(parser.parse('32-', require_tags=set(['NOUN'])), None)
+        self.assertEqual(parser.parse('слон', require_grams=set(['femn'])), None)
+        self.assertEqual(parser.parse('32', require_grams=set(['NOUN'])), None)
+        self.assertEqual(parser.parse('32-', require_grams=set(['NOUN'])), None)
         self.assertEqual(parser.parse('слон', require_index=42), None)

     def test_parse_text(self):