ConceptPortal-public/rsconcept/backend/cctext/syntax.py

''' Russian language syntax encapsulation. '''
from __future__ import annotations
from enum import Enum, unique
from razdel import tokenize


@unique
class Capitalization(Enum):
    ''' Enumerating capitalization types. '''
    unknwn = 0
    lower_case = 1
    upper_case = 2
    first_capital = 3
    mixed = 4

    @staticmethod
    def from_text(text: str) -> Capitalization:
        ''' Factory method to identify capitalization in text. '''
        if len(text) == 0:
            return Capitalization.unknwn
        first_capital = Capitalization._is_capital(text[0])
        has_mid_capital = False
        has_lower = not first_capital
        for symbol in text[1:]:
            if Capitalization._is_capital(symbol):
                if has_lower:
                    return Capitalization.mixed
                has_mid_capital = True
            else:
                if has_mid_capital:
                    return Capitalization.mixed
                else:
                    has_lower = True
        if has_mid_capital:
            return Capitalization.upper_case
        elif first_capital:
            return Capitalization.first_capital
        else:
            return Capitalization.lower_case

    def apply_to(self, text: str) -> str:
        ''' Apply capitalization to text. '''
        if not text or self in [Capitalization.unknwn, Capitalization.mixed]:
            return text
        elif self == Capitalization.lower_case:
            return text.lower()
        elif self == Capitalization.upper_case:
            return text.upper()
        else:
            return text[0].upper() + text[1:]

    @staticmethod
    def _is_capital(symbol: str) -> bool:
        return 'А' <= symbol <= 'Я' or 'A' <= symbol <= 'Z'
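
# A minimal usage sketch (illustration only, not part of the original module);
# the sample strings are assumptions chosen to show each Capitalization value:
#   Capitalization.from_text('Слово')    # -> Capitalization.first_capital
#   Capitalization.from_text('СЛОВО')    # -> Capitalization.upper_case
#   Capitalization.from_text('слОво')    # -> Capitalization.mixed
#   Capitalization.first_capital.apply_to('термина')  # -> 'Термина'
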
class RuSyntax:
    ''' Russian language syntax parser. '''
    def __init__(self):
        pass

    def __del__(self):
        pass

    @staticmethod
    def is_single_word(text: str) -> bool:
        ''' Test if text is a single word. '''
        try:
            gen = tokenize(text)
            next(gen)  # first token; empty text raises StopIteration
            next(gen)  # a second token means more than one word
            return False
        except StopIteration:
            return True

    @staticmethod
    def tokenize(text: str):
        ''' Split text into tokens. Returns a generator of (start, stop, text) substrings. '''
        return tokenize(text)

    @staticmethod
    def split_words(text: str) -> list[str]:
        ''' Split text into words. '''
        return [elem.text for elem in tokenize(text)]
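

# A small self-check sketch (an addition for illustration, not part of the original
# module); it assumes the `razdel` package is installed and simply exercises the
# public helpers defined above on a sample phrase.
if __name__ == '__main__':
    sample = 'Черная дыра'
    print(RuSyntax.is_single_word(sample))   # False: the phrase has two tokens
    print(RuSyntax.split_words(sample))      # ['Черная', 'дыра']
    print(Capitalization.from_text('дыра').apply_to('ДЫРА'))  # 'дыра'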