CCText/cctext/syntax.py
2024-04-11 23:25:09 +03:00

88 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

''' Russian language syntax. '''
from __future__ import annotations
from enum import Enum, unique
from razdel import tokenize
@unique
class Capitalization(Enum):
''' Enumerating capitalization types. '''
unknwn = 0
lower_case = 1
upper_case = 2
first_capital = 3
mixed = 4
@staticmethod
def from_text(text: str) -> Capitalization:
''' Fabric method to identify capitalization in text. '''
if len(text) == 0:
return Capitalization.unknwn
first_capital = Capitalization._is_capital(text[0])
has_mid_capital = False
has_lower = not first_capital
for symbol in text[1:]:
if Capitalization._is_capital(symbol):
if has_lower:
return Capitalization.mixed
has_mid_capital = True
else:
if has_mid_capital:
return Capitalization.mixed
else:
has_lower = True
if has_mid_capital:
return Capitalization.upper_case
elif first_capital:
return Capitalization.first_capital
else:
return Capitalization.lower_case
def apply_to(self, text: str) -> str:
''' Apply capitalization to text. '''
if not text or self in [Capitalization.unknwn, Capitalization.mixed]:
return text
elif self == Capitalization.lower_case:
return text.lower()
elif self == Capitalization.upper_case:
return text.upper()
else:
return text[0].upper() + text[1:]
@staticmethod
def _is_capital(symbol: str) -> bool:
return 'А' <= symbol <= 'Я' or 'A' <= symbol <= 'Z'
class RuSyntax:
''' Russian language syntax parser. '''
def __init__(self):
pass
def __del__(self):
pass
@staticmethod
def is_single_word(text: str) -> bool:
''' Test if text is a single word. '''
try:
gen = tokenize(text)
if next(gen) == '':
return True
if next(gen) == '':
return True
return False
except StopIteration:
return True
@staticmethod
def tokenize(text: str):
''' Split text into words. Returns list[(start, stop, text)]. '''
return tokenize(text)
@staticmethod
def split_words(text: str) -> list[str]:
''' Split text into words. '''
return [elem.text for elem in tokenize(text)]