# Mirror of https://github.com/IRBorisov/cctext.git
# Synced 2025-06-25 21:20:36 +03:00 (88 lines, 2.5 KiB, Python)
''' Russian language syntax. '''
|
||
from __future__ import annotations
|
||
from enum import Enum, unique
|
||
|
||
from razdel import tokenize
|
||
|
||
|
||
@unique
class Capitalization(Enum):
    ''' Enumerating capitalization types. '''
    unknwn = 0          # empty text, nothing to classify
    lower_case = 1      # no capital letters at all
    upper_case = 2      # first symbol and every following symbol are capitals
    first_capital = 3   # only the first symbol is a capital
    mixed = 4           # any other combination of capitals and non-capitals

    @staticmethod
    def from_text(text: str) -> Capitalization:
        ''' Factory method to identify capitalization in text.

        Every symbol that is not a capital Cyrillic or Latin letter
        (lowercase letters, digits, punctuation) counts as a non-capital.
        Empty text yields ``unknwn``; a single capital letter yields
        ``first_capital`` because there are no following capitals.
        '''
        if not text:
            return Capitalization.unknwn
        first_capital = Capitalization._is_capital(text[0])
        has_mid_capital = False
        has_lower = not first_capital
        for symbol in text[1:]:
            if Capitalization._is_capital(symbol):
                # A capital after any non-capital cannot fit a uniform pattern.
                if has_lower:
                    return Capitalization.mixed
                has_mid_capital = True
            elif has_mid_capital:
                # A non-capital after two or more leading capitals.
                return Capitalization.mixed
            else:
                has_lower = True
        if has_mid_capital:
            return Capitalization.upper_case
        if first_capital:
            return Capitalization.first_capital
        return Capitalization.lower_case

    def apply_to(self, text: str) -> str:
        ''' Apply capitalization to text.

        ``unknwn`` and ``mixed`` leave the text untouched, as does empty
        text. ``first_capital`` uppercases only the first symbol and keeps
        the rest of the text as it is.
        '''
        if not text or self in (Capitalization.unknwn, Capitalization.mixed):
            return text
        if self is Capitalization.lower_case:
            return text.lower()
        if self is Capitalization.upper_case:
            return text.upper()
        return text[0].upper() + text[1:]

    @staticmethod
    def _is_capital(symbol: str) -> bool:
        ''' Test if symbol is a capital Cyrillic or Latin letter. '''
        # 'Ё' (U+0401) is encoded outside the contiguous 'А'..'Я' range
        # (U+0410..U+042F), so it has to be checked explicitly.
        return 'А' <= symbol <= 'Я' or symbol == 'Ё' or 'A' <= symbol <= 'Z'
|
||
|
||
|
||
class RuSyntax:
    ''' Russian language syntax parser. '''
    def __init__(self):
        # Stateless: every operation is exposed as a static method.
        pass

    def __del__(self):
        pass

    @staticmethod
    def is_single_word(text: str) -> bool:
        ''' Test if text is a single word.

        Returns True when the tokenizer produces fewer than two tokens,
        which includes empty text.
        '''
        tokens = tokenize(text)
        # Only the token count matters: skip the first token (if any) and
        # check whether a second one exists. Token records are never equal
        # to '', so comparing them with strings cannot decide anything.
        next(tokens, None)
        return next(tokens, None) is None

    @staticmethod
    def tokenize(text: str):
        ''' Split text into tokens.

        Returns the iterator produced by razdel ``tokenize``; each item
        exposes ``text`` (presumably alongside ``start`` and ``stop``
        offsets - confirm against razdel documentation).
        '''
        # Inside a method body the bare name resolves to the module-level
        # razdel function, not to this static method.
        return tokenize(text)

    @staticmethod
    def split_words(text: str) -> list[str]:
        ''' Split text into words. Returns the text of every token in order. '''
        return [elem.text for elem in tokenize(text)]
|