CCText/tests/t_syntax.py

''' Unit tests: syntax. '''
import unittest
from cctext import RuSyntax, Capitalization


class TestRusSyntax(unittest.TestCase):
    ''' Test class for Russian syntax. '''

    def test_capitalization(self):
        ''' Testing capitalization. '''
        self.assertEqual(Capitalization.from_text(''), Capitalization.unknwn)
        self.assertEqual(Capitalization.from_text('Альфа'), Capitalization.first_capital)
        self.assertEqual(Capitalization.from_text('АЛЬФА'), Capitalization.upper_case)
        self.assertEqual(Capitalization.from_text('альфа'), Capitalization.lower_case)
        self.assertEqual(Capitalization.from_text('альФа'), Capitalization.mixed)
        self.assertEqual(Capitalization.from_text('альфА'), Capitalization.mixed)
        self.assertEqual(Capitalization.from_text('КАиП'), Capitalization.mixed)
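        # Applying styles back to text; mixed and unknwn leave the input unchanged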
        self.assertEqual(Capitalization.upper_case.apply_to('альфа'), 'АЛЬФА')
        self.assertEqual(Capitalization.lower_case.apply_to('АльФа'), 'альфа')
        self.assertEqual(Capitalization.first_capital.apply_to('альфа'), 'Альфа')
        self.assertEqual(Capitalization.first_capital.apply_to('АльФа'), 'АльФа')
        self.assertEqual(Capitalization.unknwn.apply_to('АльФа'), 'АльФа')
        self.assertEqual(Capitalization.mixed.apply_to('АльФа'), 'АльФа')

    def test_is_single_word(self):
        ''' Testing single word identification. '''
        self.assertTrue(RuSyntax.is_single_word(''))
        self.assertTrue(RuSyntax.is_single_word('word'))
        self.assertTrue(RuSyntax.is_single_word('слово'))
        self.assertTrue(RuSyntax.is_single_word(' word '), 'Surrounding whitespace does not count')
        self.assertTrue(RuSyntax.is_single_word('1001'), 'Numbers are words')
        self.assertTrue(RuSyntax.is_single_word('кое-как'), 'A hyphen does not break the word')
        self.assertTrue(RuSyntax.is_single_word('1-2-метилбутан'), 'Complex words')
        self.assertFalse(RuSyntax.is_single_word('one two'))
        self.assertFalse(RuSyntax.is_single_word('синий слон'))

    def test_tokenize(self):
        ''' Testing tokenization. '''
        self.assertEqual(list(RuSyntax.tokenize('')), [])
        self.assertEqual(list(RuSyntax.tokenize(' ')), [])
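        # Tokens carry (start, stop) character offsets alongside the token text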
        self.assertEqual(self._list_tokenize('test'), [(0, 4, 'test')])
        self.assertEqual(self._list_tokenize(' test '), [(1, 5, 'test')])
        self.assertEqual(self._list_tokenize('синий слон'), [(0, 5, 'синий'), (6, 10, 'слон')])

    def test_split_words(self):
        ''' Testing splitting text into words. '''
        self.assertEqual([], list(RuSyntax.split_words('')))
        self.assertEqual([], list(RuSyntax.split_words(' ')))
        self.assertEqual(RuSyntax.split_words('test'), ['test'])
        self.assertEqual(RuSyntax.split_words(' test '), ['test'])
        self.assertEqual(RuSyntax.split_words('синий слон'), ['синий', 'слон'])
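        # Punctuation is split off as a separate element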
        self.assertEqual(RuSyntax.split_words('синий, большой слон'), ['синий', ',', 'большой', 'слон'])

    @staticmethod
    def _list_tokenize(text: str):
        ''' Collect (start, stop, text) triples from the token stream for easy comparison. '''
        return [(token.start, token.stop, token.text) for token in RuSyntax.tokenize(text)]


if __name__ == '__main__':
    unittest.main()