# Mirror of https://github.com/IRBorisov/cctext.git
# Synced 2025-06-25 21:20:36 +03:00
# 63 lines, 3.3 KiB, Python
''' Unit tests: syntax. '''
|
|
import unittest
|
|
|
|
from cctext import RuSyntax, Capitalization
|
|
|
|
|
|
class TestRusSyntax(unittest.TestCase):
    ''' Test class for russian syntax. '''

    def test_capitalization(self):
        ''' Testing capitalization: detection from text and application to text. '''
        # Detection: Capitalization.from_text classifies the casing pattern of a string.
        self.assertEqual(Capitalization.from_text(''), Capitalization.unknwn)
        self.assertEqual(Capitalization.from_text('Альфа'), Capitalization.first_capital)
        self.assertEqual(Capitalization.from_text('АЛЬФА'), Capitalization.upper_case)
        self.assertEqual(Capitalization.from_text('альфа'), Capitalization.lower_case)
        self.assertEqual(Capitalization.from_text('альФа'), Capitalization.mixed)
        self.assertEqual(Capitalization.from_text('альфА'), Capitalization.mixed)
        self.assertEqual(Capitalization.from_text('КАиП'), Capitalization.mixed)

        # Application: apply_to re-cases a string according to the enum member.
        self.assertEqual(Capitalization.upper_case.apply_to('альфа'), 'АЛЬФА')
        self.assertEqual(Capitalization.lower_case.apply_to('АльФа'), 'альфа')
        self.assertEqual(Capitalization.first_capital.apply_to('альфа'), 'Альфа')
        # Expected: text that already starts with a capital is returned unchanged,
        # including its inner capitals.
        self.assertEqual(Capitalization.first_capital.apply_to('АльФа'), 'АльФа')
        # Expected: unknwn and mixed leave the text untouched.
        self.assertEqual(Capitalization.unknwn.apply_to('АльФа'), 'АльФа')
        self.assertEqual(Capitalization.mixed.apply_to('АльФа'), 'АльФа')

    def test_is_single_word(self):
        ''' Testing single word identification. '''
        # Inputs expected to be treated as a single word (the empty string included).
        self.assertTrue(RuSyntax.is_single_word(''))
        self.assertTrue(RuSyntax.is_single_word('word'))
        self.assertTrue(RuSyntax.is_single_word('слово'))
        self.assertTrue(RuSyntax.is_single_word(' word '), 'Whitespace doesnt count')
        self.assertTrue(RuSyntax.is_single_word('1001'), 'Numbers are words')
        self.assertTrue(RuSyntax.is_single_word('кое-как'), 'Hyphen doesnt break work')
        self.assertTrue(RuSyntax.is_single_word('1-2-метилбутан'), 'Complex words')

        # Inputs with an inner space are expected to be rejected.
        self.assertFalse(RuSyntax.is_single_word('one two'))
        self.assertFalse(RuSyntax.is_single_word('синий слон'))

    def test_tokenize(self):
        ''' Testing tokenization. '''
        # Empty and whitespace-only input is expected to yield no tokens.
        self.assertEqual(list(RuSyntax.tokenize('')), [])
        self.assertEqual(list(RuSyntax.tokenize(' ')), [])

        # Each expected tuple is (start, stop, text) as built by _list_tokenize.
        self.assertEqual(self._list_tokenize('test'), [(0, 4, 'test')])
        self.assertEqual(self._list_tokenize(' test '), [(1, 5, 'test')])
        self.assertEqual(self._list_tokenize('синий слон'), [(0, 5, 'синий'), (6, 10, 'слон')])

    def test_split_words(self):
        ''' Testing splitting text into words. '''
        # Empty and whitespace-only input is expected to produce no words.
        self.assertEqual(list(RuSyntax.split_words('')), [])
        self.assertEqual(list(RuSyntax.split_words(' ')), [])

        self.assertEqual(RuSyntax.split_words('test'), ['test'])
        self.assertEqual(RuSyntax.split_words(' test '), ['test'])
        self.assertEqual(RuSyntax.split_words('синий слон'), ['синий', 'слон'])
        # Expected: the comma comes back as a separate item.
        self.assertEqual(RuSyntax.split_words('синий, большой слон'), ['синий', ',', 'большой', 'слон'])

    @staticmethod
    def _list_tokenize(text: str) -> list:
        ''' Tokenize text and return a list of (start, stop, text) tuples, one per token. '''
        return [(token.start, token.stop, token.text) for token in RuSyntax.tokenize(text)]
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the tests when this file is executed directly.
    unittest.main()
|