From a210de14ff2ee01f385f1f72882ad17cbf0e508a Mon Sep 17 00:00:00 2001
From: IRBorisov <8611739+IRBorisov@users.noreply.github.com>
Date: Fri, 14 Jun 2024 18:33:34 +0300
Subject: [PATCH] Update build process and remove cctext inlining

---
 Exteor.vcxproj                                |  30 +-
 ExteorWithCCL.sln                             |  18 +-
 import/cctext/__init__.py                     |  16 -
 import/cctext/conceptapi.py                   |  90 ----
 import/cctext/context.py                      |  84 ---
 import/cctext/reference.py                    |  60 ---
 import/cctext/resolver.py                     | 140 -----
 import/cctext/rumodel.py                      | 118 -----
 import/cctext/ruparser.py                     | 486 ------------------
 import/cctext/syntax.py                       |  87 ----
 script/ExteorSetup_x64.iss                    |   5 +-
 script/ExteorSetup_x86.iss                    |   5 +-
 ...phyInstall.bat => installDependencies.bat} |   3 +-
 test/XTRCoreTest.vcxproj                      |  28 +-
 14 files changed, 39 insertions(+), 1131 deletions(-)
 delete mode 100644 import/cctext/__init__.py
 delete mode 100644 import/cctext/conceptapi.py
 delete mode 100644 import/cctext/context.py
 delete mode 100644 import/cctext/reference.py
 delete mode 100644 import/cctext/resolver.py
 delete mode 100644 import/cctext/rumodel.py
 delete mode 100644 import/cctext/ruparser.py
 delete mode 100644 import/cctext/syntax.py
 rename script/{PymorphyInstall.bat => installDependencies.bat} (66%)

diff --git a/Exteor.vcxproj b/Exteor.vcxproj
index 846e0e0..6e57d57 100644
--- a/Exteor.vcxproj
+++ b/Exteor.vcxproj
@@ -84,7 +84,7 @@
-      include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+      include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
       POCO_STATIC; NDEBUG;_WINDOWS;%(PreprocessorDefinitions)
       Use
       Level4
@@ -101,7 +101,7 @@
       false
-      lib\x86;..\ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
+      lib\x86;..\GH-ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
       Version.lib;OfficeOLE.lib;ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;oldnames.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
       Windows
       true
@@ -121,8 +121,7 @@
       $(IntDir);%(AdditionalIncludeDirectories)
-      xcopy /y /s /q /i "import\cctext\" "$(OutDir)\cctext"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+      xcopy /y /s /q /i "distr\app" "$(OutDir)"
 copy "C:\Program Files (x86)\Python312-32\Python312.dll" "$(OutDir)"
 copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
 if not exist bin\x86 mkdir bin\x86
 copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
@@ -131,7 +130,7 @@ copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
-      include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+      include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
       POCO_STATIC;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)
       Use
       Level4
@@ -148,7 +147,7 @@ copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
       false
-      lib\x64;..\ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
+      lib\x64;..\GH-ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
       Version.lib;OfficeOLE.lib;ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;oldnames.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
       Windows
       true
@@ -168,8 +167,7 @@ copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
       $(IntDir);%(AdditionalIncludeDirectories)
-      xcopy /y /s /q /i "import\cctext" "$(OutDir)\cctext\"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+      xcopy /y /s /q /i "distr\app" "$(OutDir)"
 if not exist bin\x64 mkdir bin\x64
 copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
@@ -182,7 +180,7 @@ copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
       true
       stdcpplatest
       true
-      include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+      include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
       POCO_STATIC;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)
       $(IntDir)
       $(IntDir)obj\
@@ -191,7 +189,7 @@ copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
       Windows
-      lib\x86;..\ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
+      lib\x86;..\GH-ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
       Version.lib;OfficeOLEd.lib;ConceptCoreLibraryd.lib;cclLangd.lib;RSlangd.lib;cclGraphd.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
       $(OutDir)$(TargetName)$(TargetExt)
       $(OutDir)$(ProjectName).pdb
@@ -211,8 +209,7 @@ copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
       $(IntDir);%(AdditionalIncludeDirectories)
-      xcopy /y /s /q /i "import\cctext" "$(OutDir)\cctext\"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+      xcopy /y /s /q /i "distr\app" "$(OutDir)"
 copy "C:\Program Files (x86)\Python312-32\Python312_d.dll" "$(OutDir)"
 copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
@@ -229,7 +226,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
       true
       stdcpplatest
       true
-      include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+      include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
       POCO_STATIC; _DEBUG;_WINDOWS;%(PreprocessorDefinitions)
       $(IntDir)
       $(IntDir)obj\
@@ -238,7 +235,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
       Windows
-      lib\x64;..\ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
+      lib\x64;..\GH-ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
       Version.lib;OfficeOLEd.lib;ConceptCoreLibraryd.lib;cclLangd.lib;RSlangd.lib;cclGraphd.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
       $(OutDir)$(TargetName)$(TargetExt)
       $(OutDir)$(ProjectName).pdb
@@ -257,8 +254,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
       $(IntDir);%(AdditionalIncludeDirectories)
-      xcopy /y /s /q /i "import\cctext" "$(OutDir)\cctext\"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+      xcopy /y /s /q /i "distr\app" "$(OutDir)"
@@ -437,7 +433,7 @@ xcopy /y /s /q /i "distr\app" "$(OutDir)"
-
+
       {b0aba27b-9d39-4b48-9977-aff20925b309}
diff --git a/ExteorWithCCL.sln b/ExteorWithCCL.sln
index e2a71bf..d0b8b76 100644
--- a/ExteorWithCCL.sln
+++ b/ExteorWithCCL.sln
@@ -8,9 +8,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Exteor", "Exteor.vcxproj",
 		{B0ABA27B-9D39-4B48-9977-AFF20925B309} = {B0ABA27B-9D39-4B48-9977-AFF20925B309}
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ConceptCore", "..\ConceptCore\ccl\core\ConceptLibrary.vcxproj", "{B0ABA27B-9D39-4B48-9977-AFF20925B309}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ConceptCore", "..\GH-ConceptCore\ccl\core\ConceptLibrary.vcxproj", "{B0ABA27B-9D39-4B48-9977-AFF20925B309}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclTest", "..\ConceptCore\ccl\core\test\cclTest.vcxproj", "{F87048D4-952A-460E-96E8-1E2E1EAE34FC}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclTest", "..\GH-ConceptCore\ccl\core\test\cclTest.vcxproj", "{F87048D4-952A-460E-96E8-1E2E1EAE34FC}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "XTRCoreTest", "test\XTRCoreTest.vcxproj", "{576D16B8-96BF-4B5A-8B09-E1916375E34F}"
 	ProjectSection(ProjectDependencies) = postProject
@@ -18,19 +18,19 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "XTRCoreTest", "test\XTRCore
 		{F87048D4-952A-460E-96E8-1E2E1EAE34FC} = {F87048D4-952A-460E-96E8-1E2E1EAE34FC}
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSlang", "..\ConceptCore\ccl\rslang\RSlang.vcxproj", "{A8529C63-42F5-43E6-97B8-2EC83F23E1F9}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSlang", "..\GH-ConceptCore\ccl\rslang\RSlang.vcxproj", "{A8529C63-42F5-43E6-97B8-2EC83F23E1F9}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSLangTest", "..\ConceptCore\ccl\rslang\test\rslTest.vcxproj", "{32469CE1-303B-4DB4-8E03-B7EBED5851EB}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSLangTest", "..\GH-ConceptCore\ccl\rslang\test\rslTest.vcxproj", "{32469CE1-303B-4DB4-8E03-B7EBED5851EB}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraph", "..\ConceptCore\ccl\cclGraph\cclGraph.vcxproj", "{7E1D5338-F819-4C96-B461-9EAAB8D02E1D}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraph", "..\GH-ConceptCore\ccl\cclGraph\cclGraph.vcxproj", "{7E1D5338-F819-4C96-B461-9EAAB8D02E1D}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraphTest", "..\ConceptCore\ccl\cclGraph\test\cclGraphTest.vcxproj", "{5A2501C1-FEFB-4B14-A94D-E8F19ADEA239}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraphTest", "..\GH-ConceptCore\ccl\cclGraph\test\cclGraphTest.vcxproj", "{5A2501C1-FEFB-4B14-A94D-E8F19ADEA239}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclCommonsTest", "..\ConceptCore\ccl\cclCommons\test\cclCommonsTest.vcxproj", "{53A380CF-B599-4170-89B1-642F1C3772E1}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclCommonsTest", "..\GH-ConceptCore\ccl\cclCommons\test\cclCommonsTest.vcxproj", "{53A380CF-B599-4170-89B1-642F1C3772E1}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLang", "..\ConceptCore\ccl\cclLang\cclLang.vcxproj", "{76B03803-56CC-47C2-A8F0-2241FCAF2898}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLang", "..\GH-ConceptCore\ccl\cclLang\cclLang.vcxproj", "{76B03803-56CC-47C2-A8F0-2241FCAF2898}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLangTest", "..\ConceptCore\ccl\cclLang\test\cclLangTest.vcxproj", "{4754356B-DC01-4564-A035-270FFB72F6A0}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLangTest", "..\GH-ConceptCore\ccl\cclLang\test\cclLangTest.vcxproj", "{4754356B-DC01-4564-A035-270FFB72F6A0}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "libs", "libs", "{DC591058-8A8A-460E-93D0-B57C848DF12B}"
 EndProject
diff --git a/import/cctext/__init__.py b/import/cctext/__init__.py
deleted file mode 100644
index 35bcebe..0000000
--- a/import/cctext/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-''' Concept core text processing library. '''
-# pylint: skip-file
-from .syntax import RuSyntax, Capitalization
-from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_grams, combine_grams
-from .ruparser import PhraseParser, WordToken, Collation
-from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference
-from .context import TermForm, Entity, TermContext
-from .resolver import Reference, Position, Resolver, ResolvedReference, resolve_entity, resolve_syntactic, extract_entities
-
-from .conceptapi import (
-    parse, normalize,
-    generate_lexeme, inflect, inflect_context, inflect_substitute, inflect_dependant,
-    match_all_morpho, find_substr
-)
-
-# TODO: implement Part of speech transition for VERB <-> NOUN
diff --git a/import/cctext/conceptapi.py b/import/cctext/conceptapi.py
deleted file mode 100644
index f6f8e2d..0000000
--- a/import/cctext/conceptapi.py
+++ /dev/null
@@ -1,90 +0,0 @@
-'''
-Concept API Python functions.
-
-::guarantee:: doesn't raise exceptions and returns workable outputs
-'''
-from cctext.rumodel import Morphology
-from .syntax import RuSyntax
-from .ruparser import PhraseParser
-from .rumodel import split_grams
-
-parser = PhraseParser()
-
-
-def parse(text: str, require_grams: str = '') -> str:
-    ''' Determine morpho tags for input text.
-    ::returns:: string of comma separated grammar tags or empty string '''
-    model = parser.parse(text, require_grams=split_grams(require_grams))
-    if model is None:
-        return ''
-    result = model.get_morpho().to_text()
-    return result if result != 'UNKN' else ''
-
-
-# def parse_variants(text: str, require_grams: str = '') -> list[tuple[str, str]]:
-#     ''' Get all variants of a parse.
-#     ::returns:: string of comma separated grammar tags or empty string '''
-
-
-def generate_lexeme(text_normal: str) -> list[tuple[str, str]]:
-    ''' Get all inflected forms belonging to same Lexeme. '''
-    model = parser.parse(text_normal)
-    if not model:
-        return []
-    result = []
-    for form in model.get_form().lexeme:
-        result.append((model.inflect(form.tag.grammemes), Morphology(form.tag).to_text()))
-    return result
-
-
-def normalize(text: str) -> str:
-    ''' Generate normal form.
-    ::returns:: normal form of input text or text itself if no parse is available '''
-    model = parser.parse(text)
-    if model is None:
-        return text
-    return model.normal_form()
-
-
-def inflect(text: str, target_grams: str) -> str:
-    ''' Inflect text to match required tags.
-    ::returns:: infected text or initial text if infection failed '''
-    target_set = split_grams(target_grams)
-    model = parser.parse(text)
-    if model is None:
-        return text
-    return model.inflect(target_set)
-
-
-def inflect_context(target: str, before: str = '', after: str = '') -> str:
-    ''' Inflect text in accordance to context before and after. '''
-    return parser.inflect_context(target, before, after)
-
-
-def inflect_substitute(substitute_normal: str, original: str) -> str:
-    ''' Inflect substitute to match original form. '''
-    return parser.inflect_substitute(substitute_normal, original)
-
-
-def inflect_dependant(dependant_normal: str, master: str) -> str:
-    ''' Inflect dependant to coordinate with master text. '''
-    return parser.inflect_dependant(dependant_normal, master)
-
-
-def match_all_morpho(text: str, filter_grams: str) -> list[list[int]]:
-    ''' Search for all words corresponding to tags. '''
-    target_set = split_grams(filter_grams)
-    if len(target_set) == 0:
-        return []
-
-    result = []
-    for elem in RuSyntax.tokenize(text):
-        model = parser.parse(elem.text, require_grams=target_set)
-        if model:
-            result.append([elem.start, elem.stop])
-    return result
-
-
-def find_substr(text: str, sub: str) -> tuple[int, int]:
-    ''' Search for substring position in text regardless of morphology. '''
-    return parser.find_substr(text, sub)
diff --git a/import/cctext/context.py b/import/cctext/context.py
deleted file mode 100644
index de487cd..0000000
--- a/import/cctext/context.py
+++ /dev/null
@@ -1,84 +0,0 @@
-''' Term context for reference resolution. '''
-from typing import Iterable, Optional, TypedDict
-
-from .ruparser import PhraseParser
-from .rumodel import WordTag
-
-
-parser = PhraseParser()
-
-
-class TermForm(TypedDict):
-    ''' Represents term in a specific form. '''
-    text: str
-    grams: Iterable[str]
-
-
-def _match_grams(query: Iterable[str], test: Iterable[str]) -> bool:
-    ''' Check if grams from test fit query. '''
-    for gram in test:
-        if not gram in query:
-            if not gram in WordTag.PARTS_OF_SPEECH:
-                return False
-            for pos in WordTag.PARTS_OF_SPEECH:
-                if pos in query:
-                    return False
-    return True
-
-
-def _search_form(query: Iterable[str], data: Iterable[TermForm]) -> Optional[str]:
-    for form in data:
-        if _match_grams(query, form['grams']):
-            return form['text']
-    return None
-
-
-class Entity:
-    ''' Represents text entity. '''
-    def __init__(self, alias: str, nominal: str, manual_forms: Optional[Iterable[TermForm]]=None):
-        if manual_forms is None:
-            self.manual = []
-        else:
-            self.manual = list(manual_forms)
-        self.alias = alias
-        self._nominal = nominal
-        self._cached: list[TermForm] = []
-
-    def get_nominal(self) -> str:
-        ''' Getter for _nominal. '''
-        return self._nominal
-
-    def set_nominal(self, new_text: str):
-        ''' Setter for _nominal.
-        Note: clears manual and cached forms. '''
-        if self._nominal == new_text:
-            return
-        self._nominal = new_text
-        self.manual = []
-        self._cached = []
-
-    def get_form(self, grams: Iterable[str]) -> str:
-        ''' Get specific term form. '''
-        if all(False for _ in grams):
-            return self._nominal
-        text = _search_form(grams, self.manual)
-        if text is not None:
-            return text
-        text = _search_form(grams, self._cached)
-        if text is not None:
-            return text
-
-        model = parser.parse(self._nominal)
-        if model is None:
-            text = self._nominal
-        else:
-            try:
-                text = model.inflect(grams)
-            except ValueError as error:
-                text = f'!{error}!'.replace('Unknown grammeme', 'Неизвестная граммема')
-        self._cached.append({'text': text, 'grams': grams})
-        return text
-
-
-# Represents term context for resolving entity references.
-TermContext = dict[str, Entity]
diff --git a/import/cctext/reference.py b/import/cctext/reference.py
deleted file mode 100644
index c2733bb..0000000
--- a/import/cctext/reference.py
+++ /dev/null
@@ -1,60 +0,0 @@
-''' Text reference API. '''
-from enum import Enum, unique
-from typing import Optional, Union
-
-
-@unique
-class ReferenceType(Enum):
-    ''' Text reference types. '''
-    entity = 'entity'
-    syntactic = 'syntax'
-
-
-class EntityReference:
-    ''' Reference to entity. '''
-
-    def __init__(self, identifier: str, form: str):
-        self.entity = identifier
-        self.form = form
-
-    def get_type(self) -> ReferenceType:
-        return ReferenceType.entity
-
-    def to_text(self) -> str:
-        return f'@{{{self.entity}|{self.form}}}'
-
-
-class SyntacticReference:
-    ''' Reference to syntactic dependency on EntityReference. '''
-
-    def __init__(self, referral_offset: int, text: str):
-        self.nominal = text
-        self.offset = referral_offset
-
-    def get_type(self) -> ReferenceType:
-        return ReferenceType.syntactic
-
-    def to_text(self) -> str:
-        return f'@{{{self.offset}|{self.nominal}}}'
-
-
-Reference = Union[EntityReference, SyntacticReference]
-
-
-def parse_reference(text: str) -> Optional[Reference]:
-    if len(text) < 4 or text[-1] != '}' or text[0:2] != '@{':
-        return None
-    blocks: list[str] = [block.strip() for block in text[2:-1].split('|')]
-    if len(blocks) != 2 or blocks[0] == '' or blocks[0][0] in '0':
-        return None
-    if blocks[0][0] in '-123456789':
-        if blocks[1] == '':
-            return None
-        try:
-            offset = int(blocks[0])
-            return SyntacticReference(offset, blocks[1])
-        except ValueError:
-            return None
-    else:
-        form = blocks[1].replace(' ', '')
-        return EntityReference(blocks[0], form)
diff --git a/import/cctext/resolver.py b/import/cctext/resolver.py
deleted file mode 100644
index e9df3df..0000000
--- a/import/cctext/resolver.py
+++ /dev/null
@@ -1,140 +0,0 @@
-''' Reference resolution API. '''
-import re
-from typing import cast, Optional
-from dataclasses import dataclass
-
-from .rumodel import split_grams
-
-from .conceptapi import inflect_dependant
-from .context import TermContext
-from .reference import EntityReference, SyntacticReference, parse_reference, Reference
-
-
-_REF_ENTITY_PATTERN = re.compile(r'@{([^0-9\-][^\}\|\{]*?)\|([^\}\|\{]*?)}')
-
-
-def extract_entities(text: str) -> list[str]:
-    ''' Extract list of entities that are referenced. '''
-    result: list[str] = []
-    for segment in re.finditer(_REF_ENTITY_PATTERN, text):
-        entity = segment.group(1)
-        if entity not in result:
-            result.append(entity)
-    return result
-
-
-def resolve_entity(ref: EntityReference, context: TermContext) -> str:
-    ''' Resolve entity reference. '''
-    alias = ref.entity
-    if alias not in context:
-        return f'!Неизвестная сущность: {alias}!'
-    grams = split_grams(ref.form)
-    resolved = context[alias].get_form(grams)
-    if resolved == '':
-        return f'!Отсутствует термин: {alias}!'
-    else:
-        return resolved
-
-
-def resolve_syntactic(ref: SyntacticReference, index: int, references: list['ResolvedReference']) -> str:
-    ''' Resolve syntactic reference. '''
-    offset = ref.offset
-    master: Optional['ResolvedReference'] = None
-    if offset > 0:
-        index += 1
-        while index < len(references):
-            if isinstance(references[index].ref, EntityReference):
-                if offset == 1:
-                    master = references[index]
-                else:
-                    offset -= 1
-            index += 1
-    else:
-        index -= 1
-        while index >= 0:
-            if isinstance(references[index].ref, EntityReference):
-                if offset == -1:
-                    master = references[index]
-                else:
-                    offset += 1
-            index -= 1
-    if master is None:
-        return f'!Некорректное смещение: {ref.offset}!'
-    return inflect_dependant(ref.nominal, master.resolved)
-
-
-@dataclass
-class Position:
-    ''' 0-indexed contiguous segment position in text. '''
-    start: int = 0
-    finish: int = 0
-
-    def __hash__(self) -> int:
-        return hash((self.start, self.finish))
-
-
-@dataclass
-class ResolvedReference:
-    ''' Resolved reference data '''
-    ref: Reference
-    resolved: str = ''
-    pos_input: Position = Position()
-    pos_output: Position = Position()
-
-    def __hash__(self) -> int:
-        return hash((self.resolved, self.pos_input, self.pos_output, self.ref.to_text()))
-
-
-class Resolver:
-    ''' Text reference resolver '''
-    REFERENCE_PATTERN = re.compile(r'@{[^\}\{]*?}')
-
-    def __init__(self, context: TermContext):
-        self.context = context
-        self.refs = cast(list[ResolvedReference], [])
-        self.input = ''
-        self.output = ''
-
-    def resolve(self, text: str) -> str:
-        ''' Resolve references in input text.
-        Note: data on references positions is accessed through class attributes '''
-        self._reset(text)
-        self._parse_refs()
-        if len(self.refs) == 0:
-            self.output = self.input
-            return self.output
-        else:
-            self._resolve_refs()
-            self._combine_output()
-            return self.output
-
-    def _reset(self, input_text: str):
-        self.refs = cast(list[ResolvedReference], [])
-        self.input = input_text
-        self.output = ''
-
-    def _parse_refs(self):
-        for segment in re.finditer(Resolver.REFERENCE_PATTERN, self.input):
-            parse = parse_reference(segment[0])
-            if parse is not None:
-                ref_info = ResolvedReference(ref=parse,
-                                             resolved='',
-                                             pos_input=Position(segment.start(0), segment.end(0)))
-                self.refs.append(ref_info)
-
-    def _resolve_refs(self):
-        for ref in self.refs:
-            if isinstance(ref.ref, EntityReference):
-                ref.resolved = resolve_entity(ref.ref, self.context)
-        for (index, ref) in enumerate(self.refs):
-            if isinstance(ref.ref, SyntacticReference):
-                ref.resolved = resolve_syntactic(ref.ref, index, self.refs)
-
-    def _combine_output(self):
-        pos_in = 0
-        for ref in self.refs:
-            self.output += self.input[pos_in : ref.pos_input.start]
-            self.output += ref.resolved
-            ref.pos_output = Position(len(self.output) - len(ref.resolved), len(self.output))
-            pos_in = ref.pos_input.finish
-        self.output += self.input[pos_in : len(self.input)]
diff --git a/import/cctext/rumodel.py b/import/cctext/rumodel.py
deleted file mode 100644
index 8f5b4cc..0000000
--- a/import/cctext/rumodel.py
+++ /dev/null
@@ -1,118 +0,0 @@
-''' Russian language models. '''
-from __future__ import annotations
-from enum import Enum, unique
-from typing import Iterable, Optional
-
-from pymorphy3 import MorphAnalyzer
-from pymorphy3.tagset import OpencorporaTag as WordTag
-
-# ''' Morphology parser. '''
-morpho = MorphAnalyzer()
-Grammemes = Iterable[str]
-
-
-def split_grams(text: str) -> list[str]:
-    ''' Split grammemes string into set of items. '''
-    return [tag.strip() for tag in filter(None, text.split(','))]
-
-
-def combine_grams(tags: Iterable[str]) -> str:
-    ''' Combine grammemes into string. '''
-    return ','.join(tags)
-
-
-@unique
-class SemanticRole(Enum):
-    ''' Enumerating semantic types for different parse patterns. '''
-    unknwn = 0
-    term = 1
-    action = 2
-    definition = 3
-
-    @staticmethod
-    def from_POS(pos: Optional[str]) -> SemanticRole:
-        ''' Production method: types from part of speech. '''
-        if pos in ['NOUN', 'NPRO']:
-            return SemanticRole.term
-        elif pos in ['VERB', 'INFN', 'PRTF', 'PRTS']:
-            return SemanticRole.action
-        elif pos in ['ADJF', 'ADJS']:
-            return SemanticRole.definition
-        return SemanticRole.unknwn
-
-
-class Morphology:
-    ''' Wrapper for OpencorporaTag expanding functionality for multiword.
-    Full morphology tags see http://opencorpora.org/dict.php?act=gram
-    '''
-    def __init__(self, tag: WordTag, semantic=SemanticRole.unknwn):
-        self.tag = tag
-        self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_POS(tag.POS)
-
-    _TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
-
-    _TAGS_NO_TENSE = frozenset(['NOUN', 'NPRO', 'ADJF', 'ADJS'])
-    _TAGS_NO_CASE = frozenset(['GRND', 'VERB', 'ADJS', 'PRTS'])
-    _TAGS_NO_NUMBER = frozenset(['GRND'])
-    _TAGS_NO_GENDER = frozenset(['GRND', 'NOUN', 'NPRO', 'plur'])
-    _TAGS_NO_PERSON = frozenset(['GRND', 'NOUN', 'ADJF', 'ADJS', 'PRTF', 'PRTS', 'past'])
-
-    @property
-    def can_coordinate(self) -> bool:
-        ''' Check if coordination can change text. '''
-        return self.tag.POS in ['NOUN', 'NPRO', 'NUMR', 'ADJF', 'ADJS', 'PRTF', 'PRTS']
-
-    @staticmethod
-    def is_dependable(pos: str):
-        ''' Check if this morphology can be dependant. '''
-        return pos in ['ADJF', 'ADJS', 'PRTF', 'PRTS']
-
-    @property
-    def effective_POS(self) -> Optional[str]:
-        ''' Access part of speech. Pronouns are considered as nouns '''
-        pos: Optional[str] = self.tag.POS
-        if pos and self.tag.POS == 'NPRO':
-            return 'NOUN'
-        return pos
-
-    def complete_grams(self, grams: Iterable[str]) -> set[str]:
-        ''' Add missing tags before inflection. '''
-        result = set(grams)
-        pos = self.tag.POS
-        if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
-            result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB')
-        if not result.isdisjoint(self._TAGS_IMMUTABLE):
-            return result
-        if self.tag.case and result.isdisjoint(WordTag.CASES) and result.isdisjoint(self._TAGS_NO_CASE):
-            result.add(self.tag.case)
-        if self.tag.tense and result.isdisjoint(WordTag.TENSES) and result.isdisjoint(self._TAGS_NO_TENSE):
-            if (self.tag.tense != 'past' or result.isdisjoint(WordTag.PERSONS)) \
-                    and (self.tag.tense != 'pres' or result.isdisjoint(WordTag.GENDERS)):
-                result.add(self.tag.tense)
-        if self.tag.number and result.isdisjoint(WordTag.NUMBERS) and result.isdisjoint(self._TAGS_NO_NUMBER):
-            if self.tag.number != 'plur' or result.isdisjoint(WordTag.GENDERS):
-                result.add(self.tag.number)
-        if self.tag.gender and result.isdisjoint(WordTag.GENDERS) and result.isdisjoint(self._TAGS_NO_GENDER):
-            if 'PRTF' in result or 'pres' not in result:
-                result.add(self.tag.gender)
-        if self.tag.person and result.isdisjoint(WordTag.PERSONS) and result.isdisjoint(self._TAGS_NO_PERSON):
-            result.add(self.tag.person)
-        if 'plur' in result and not result.isdisjoint(WordTag.GENDERS):
-            result = result.difference(WordTag.GENDERS)
-        return result
-
-    def coordination_grams(self) -> set[str]:
-        ''' Return set of grammemes for inflection to keep coordination . '''
-        result = set()
-        if self.tag.case:
-            result.add(self.tag.case)
-        if self.tag:
-            number = self.tag.number
-            result.add(number)
-        if self.tag.gender and 'plur' not in result:
-            result.add(self.tag.gender)
-        return result
-
-    def to_text(self) -> str:
-        ''' Produce string of all grammemes. '''
-        return combine_grams(self.tag.grammemes)
diff --git a/import/cctext/ruparser.py b/import/cctext/ruparser.py
deleted file mode 100644
index 7b64cd2..0000000
--- a/import/cctext/ruparser.py
+++ /dev/null
@@ -1,486 +0,0 @@
-''' Parsing russian language using pymorphy3 library. '''
-from __future__ import annotations
-from typing import Optional
-
-from razdel.substring import Substring as Segment
-from pymorphy3.analyzer import Parse as WordParse
-
-from .syntax import RuSyntax, Capitalization
-from .rumodel import SemanticRole, Morphology, WordTag, morpho, Grammemes
-
-INDEX_NONE = -1
-NO_COORDINATION = -1
-WORD_NONE = -1
-
-
-class WordToken:
-    ''' Atomic text token. '''
-    def __init__(self, segment: Segment, parse: list[WordParse], main_parse: int = 0):
-        self.segment: Segment = segment
-        self.forms: list[WordParse] = parse
-        self.main: int = main_parse
-
-    def get_morpho(self) -> Morphology:
-        ''' Return morphology for current token. '''
-        return Morphology(self.get_parse().tag)
-
-    def get_parse(self) -> WordParse:
-        ''' Access main form. '''
-        return self.forms[self.main]
-
-    def inflect(self, inflection_grams: set[str]) -> Optional[WordParse]:
-        ''' Apply inflection to segment text. Does not modify forms '''
-        inflected = self.get_parse().inflect(inflection_grams)
-        if not inflected:
-            return None
-        self.segment.text = Capitalization.from_text(self.segment.text).apply_to(inflected.word)
-        return inflected
-
-
-class Collation:
-    ''' Parsed data for input coordinated text. '''
-    def __init__(self, text: str):
-        self.text = text
-        self.words: list[WordToken] = []
-        self.coordination: list[int] = []
-        self.main_word: int = WORD_NONE
-
-    def is_valid(self) -> bool:
-        ''' Check if data is parsed correctly '''
-        return self.main_word != WORD_NONE
-
-    def get_form(self) -> WordParse:
-        ''' Access WordParse. '''
-        return self.words[self.main_word].get_parse()
-
-    def get_morpho(self) -> Morphology:
-        ''' Access parsed main morphology. '''
-        return self.words[self.main_word].get_morpho()
-
-    def add_word(self, segment, forms: list, main_form: int, need_coordination: bool = True):
-        ''' Add word information. '''
-        self.words.append(WordToken(segment, forms, main_form))
-        self.coordination.append(NO_COORDINATION if not need_coordination else 0)
-
-    def inflect(self, target_grams: Grammemes) -> str:
-        ''' Inflect text to match required tags. '''
-        if self.is_valid():
-            origin = self.get_morpho()
-            if not origin.tag.grammemes.issuperset(target_grams):
-                if self._apply_inflection(origin, target_grams):
-                    return self._generate_text()
-        return self.text
-
-    def inflect_like(self, base_model: Collation) -> str:
-        ''' Create inflection to substitute base_model form. '''
-        if self.is_valid():
-            morph = base_model.get_morpho()
-            if morph.effective_POS:
-                tags = set()
-                tags.add(morph.effective_POS)
-                tags = morph.complete_grams(tags)
-                return self.inflect(tags)
-        return self.text
-
-    def inflect_dependant(self, master_model: Collation) -> str:
-        ''' Create inflection to coordinate with master_model form. '''
-        assert self.is_valid()
-        morph = master_model.get_morpho()
-        tags = morph.coordination_grams()
-        tags = self.get_morpho().complete_grams(tags)
-        return self.inflect(tags)
-
-    def normal_form(self) -> str:
-        ''' Generate normal form. '''
-        if self.is_valid():
-            main_form = self.get_form()
-            new_morpho = Morphology(main_form.normalized.tag)
-            new_grams = new_morpho.complete_grams(frozenset())
-            return self.inflect(new_grams)
-        return self.text
-
-    def _iterate_coordinated(self):
-        words_count = len(self.words)
-        current_word = self.coordination[words_count]
-        while current_word != words_count:
-            yield self.words[current_word]
-            current_word += self.coordination[current_word]
-
-    def _inflect_main_word(self, origin: Morphology, target_grams: Grammemes) -> Optional[Morphology]:
-        full_grams = origin.complete_grams(target_grams)
-        inflected = self.words[self.main_word].inflect(full_grams)
-        if not inflected:
-            return None
-        return Morphology(inflected.tag)
-
-    def _apply_inflection(self, origin: Morphology, target_grams: Grammemes) -> bool:
-        new_moprho = self._inflect_main_word(origin, target_grams)
-        if not new_moprho:
-            return False
-        inflection_grams = new_moprho.coordination_grams()
-        if len(inflection_grams) == 0:
-            return True
-        for word in self._iterate_coordinated():
-            word.inflect(inflection_grams)
-        return True
-
-    def _generate_text(self) -> str:
-        current_pos = 0
-        result = ''
-        for token in self.words:
-            if token.segment.start > current_pos:
-                result += self.text[current_pos: token.segment.start]
-            result += token.segment.text
-            current_pos = token.segment.stop
-        if current_pos + 1 < len(self.text):
-            result += self.text[current_pos:]
-        return result
-
-
-class PhraseParser:
-    ''' Russian grammar parser. '''
-    def __init__(self):
-        pass
-
-    def __del__(self):
-        pass
-
-    _FILTER_SCORE = 0.005
-    _SINGLE_SCORE_SEARCH = 0.2
-    _PRIORITY_NONE = NO_COORDINATION
-
-    _MAIN_WAIT_LIMIT = 10  # count words until fixing main
-    _MAIN_MAX_FOLLOWERS = 3  # count words after main as coordination candidates
-
-    def parse(self, text: str,
-              require_index: int = INDEX_NONE,
-              require_grams: Optional[Grammemes] = None) -> Optional[Collation]:
-        '''
-        Determine morpho tags for input text.
-        ::returns:: Morphology of a text or None if no suitable form is available
-        '''
-        segments = list(RuSyntax.tokenize(text))
-        if len(segments) == 0:
-            return None
-        elif len(segments) == 1:
-            return self._parse_single(segments[0], require_index, require_grams)
-        else:
-            return self._parse_multiword(text, segments, require_index, require_grams)
-
-    def normalize(self, text: str):
-        ''' Get normal form for target text. '''
-        processed = self.parse(text)
-        if processed:
-            return processed.normal_form()
-        return text
-
-    def find_substr(self, text: str, sub: str) -> tuple[int, int]:
-        ''' Search for substring position in text regardless of morphology. '''
-        if not text or not sub:
-            return (0, 0)
-        query = [self.normalize(elem.text) for elem in RuSyntax.tokenize(sub)]
-        query_len = len(query)
-        start = 0
-        current_index = 0
-        for token in RuSyntax.tokenize(text):
-            text_word = self.normalize(token.text)
-            if text_word != query[current_index]:
-                current_index = 0
-            else:
-                if current_index == 0:
-                    start = token.start
-                current_index += 1
-                if current_index == query_len:
-                    return (start, token.stop)
-        return (0, 0)
-
-    def inflect_context(self, text: str, before: str = '', after: str = '') -> str:
-        ''' Inflect text in accordance to context before and after. '''
-        target = self.parse(text)
-        if not target:
-            return text
-        target_morpho = target.get_morpho()
-        if not target_morpho or not target_morpho.can_coordinate:
-            return text
-
-        model_after = self.parse(after)
-        model_before = self.parse(before)
-        etalon = PhraseParser._choose_context_etalon(target_morpho, model_before, model_after)
-        if not etalon:
-            return text
-        etalon_moprho = etalon.get_morpho()
-        if not etalon_moprho.can_coordinate:
-            return text
-
-        new_form = PhraseParser._combine_morpho(target_morpho, etalon_moprho.tag)
-        return target.inflect(new_form)
-
-    def inflect_substitute(self, substitute_normal: str, original: str) -> str:
-        ''' Inflect substitute to match original form. '''
-        original_model = self.parse(original)
-        if not original_model:
-            return substitute_normal
-        substitute_model = self.parse(substitute_normal)
-        if not substitute_model:
-            return substitute_normal
-        return substitute_model.inflect_like(original_model)
-
-    def inflect_dependant(self, dependant_normal: str, master: str) -> str:
-        ''' Inflect dependant to coordinate with master text. '''
-        master_model = self.parse(master)
-        if not master_model:
-            return dependant_normal
-        dependant_model = self.parse(dependant_normal)
-        if not dependant_model:
-            return dependant_normal
-        return dependant_model.inflect_dependant(master_model)
-
-    def _parse_single(self, segment, require_index: int, require_grams: Optional[Grammemes]) -> Optional[Collation]:
-        forms = list(self._filtered_parse(segment.text))
-        parse_index = INDEX_NONE
-        if len(forms) == 0 or require_index >= len(forms):
-            return None
-
-        if require_index != INDEX_NONE:
-            tags = forms[require_index].tag
-            if require_grams and not tags.grammemes.issuperset(require_grams):
-                return None
-            parse_index = require_index
-        else:
-            current_score = 0
-            for (index, form) in enumerate(forms):
-                if not require_grams or form.tag.grammemes.issuperset(require_grams):
-                    if form.tag.case == 'nomn':
-                        parse_index = index
-                        break
-                    elif parse_index == INDEX_NONE:
-                        current_score = form.score
-                        parse_index = index
-                    elif form.score / current_score < self._SINGLE_SCORE_SEARCH:
-                        break
-
-        if parse_index == INDEX_NONE:
-            return None
-        result = Collation(segment.text)
-        result.add_word(segment, [forms[parse_index]], main_form=0, need_coordination=False)
-        result.coordination.append(len(result.words))
-        result.main_word = 0
-        return result
-
-    def _parse_multiword(self, text: str, segments: list, require_index: int,
-                         require_grams: Optional[Grammemes]) -> Optional[Collation]:
-        result = Collation(text)
-        priority_main: float = self._PRIORITY_NONE
-        segment_index = 0
-        main_wait = 0
-        word_index = 0
-        for segment in segments:
-            if main_wait > PhraseParser._MAIN_WAIT_LIMIT:
-                break
-            segment_index += 1
-            priority = self._parse_segment(result, segment, require_index, require_grams)
-            if priority is None:
-                continue  # skip non-parsable entities
-            main_wait += 1
-            if priority > priority_main:
-                result.main_word = word_index
-                priority_main = priority
-            word_index += 1
-        if result.main_word == INDEX_NONE:
-            return None
-        self._finalize_coordination(result)
-        if segment_index < len(segments):
-            pass  # finish to parse segments after main if needed
-        return result
-
-    def _parse_segment(self,
-                       output: Collation,
-                       segment: Segment,
-                       require_index: int,
-                       require_grams: Optional[Grammemes]) -> Optional[float]:
-        ''' Return priority for this can be a new main word '''
-        forms = list(self._filtered_parse(segment.text))
-        if len(forms) == 0:
-            return None
-        main_index: int = INDEX_NONE
-        segment_score: float = self._PRIORITY_NONE
-        needs_coordination = False
-        local_sum: float = 0
-        score_sum: float = 0
-        if require_index != INDEX_NONE:
-            form = forms[require_index]
-            if not require_grams or form.tag.grammemes.issuperset(require_grams):
-                (local_max, segment_score) = PhraseParser._get_priorities_for(form.tag)
-                main_index = require_index
-                needs_coordination = Morphology.is_dependable(form.tag.POS)
-        else:
-            local_max = self._PRIORITY_NONE
-            for (index, form) in enumerate(forms):
-                if require_grams and not form.tag.grammemes.issuperset(require_grams):
-                    continue
-                (local_priority, global_priority) = PhraseParser._get_priorities_for(form.tag)
-                needs_coordination = needs_coordination or Morphology.is_dependable(form.tag.POS)
-                local_sum += global_priority * form.score
-                score_sum += form.score
-                if local_priority > local_max:
-                    local_max = local_priority
-                    # segment_score = global_priority
-                    main_index = index
-            if score_sum == 0:
-                return None
-            segment_score = local_sum / score_sum
-        output.add_word(segment, forms, main_index, needs_coordination)
-        return segment_score
-        # Alternative: return segment_score
-        # penalty_suspicious = 0 if local_max == 0 else (1 - local_sum / local_max) * self._PRIORITY_PENALTY
-        # return segment_score - penalty_suspicious
-
-    @classmethod
-    def _finalize_coordination(cls, target: Collation):
-        main_morpho: Morphology = target.get_morpho()
-        main_coordinate = main_morpho.can_coordinate
-        target.coordination[target.main_word] = NO_COORDINATION
-        first_change = INDEX_NONE
-        current_len = 0
-        for (index, word) in enumerate(target.words):
-            if target.coordination[index] == NO_COORDINATION or index - target.main_word > cls._MAIN_MAX_FOLLOWERS:
-                needs_change = False
-                if index != target.main_word:
-                    word.main = INDEX_NONE
-            else:
-                word.main = PhraseParser._find_coordination(word.forms, main_morpho.tag, index < target.main_word)
-                needs_change = word.main != INDEX_NONE
-                if not needs_change or not main_coordinate:
-                    target.coordination[index] = NO_COORDINATION
-            current_len += 1
-            if needs_change and main_coordinate:
-                target.coordination[index] = current_len
-                current_len = 0
-                if first_change == INDEX_NONE:
-                    first_change = index
-        if first_change == INDEX_NONE:
-            target.coordination.append(len(target.words))
-            return
-        previous_reference = first_change
-        current_word = len(target.words)
-        target.coordination.append(current_len + 1)
-        while target.coordination[current_word] != INDEX_NONE:
-            previous_word = current_word - target.coordination[current_word]
-            target.coordination[current_word] = previous_reference
-            previous_reference = current_word - previous_word
-            current_word = previous_word
-            if previous_reference == 0 or current_word < 0:
-                break
-
-    @staticmethod
-    def _find_coordination(forms: list, main_tag: WordTag, before_main: bool) -> int:
-        for (index, form) in enumerate(forms):
-            pos = form.tag.POS
-            case = form.tag.case
-            if pos not in ['ADJF', 'ADJS', 'PRTF', 'PRTS']:
-                continue
-            if SemanticRole.from_POS(pos) == SemanticRole.term and case == 'gent':
-                if before_main:
-                    continue
-                else:
-                    return INDEX_NONE
-            if case == main_tag.case:
-                return index
-            elif main_tag.case in ['accs', 'gent'] and case in ['accs', 'gent']:
-                return index
-        return INDEX_NONE
-
-    @staticmethod
-    def _filtered_parse(text: str):
-        capital = Capitalization.from_text(text)
-        score_filter = PhraseParser._filter_score(morpho.parse(text))
-        yield from PhraseParser._filter_capital(score_filter, capital)
-
-    @staticmethod
-    def _filter_score(generator):
-        for form in generator:
-            if form.score < PhraseParser._FILTER_SCORE:
-                break
-            yield form
-
-    @staticmethod
-    def _filter_capital(generator, capital: Capitalization):
-        if capital in [Capitalization.upper_case, Capitalization.mixed]:
-            for form in generator:
-                if 'Abbr' not in form.tag.grammemes:
-                    continue
-                yield form
-        else:
-            yield from generator
-
-    @staticmethod
-    def _parse_word(text: str, require_index: int = INDEX_NONE,
-                    require_grams: Optional[Grammemes] = None) -> Optional[Morphology]:
-        parsed_variants = morpho.parse(text)
-        if not parsed_variants or require_index >= len(parsed_variants):
-            return None
-        if require_index != INDEX_NONE:
-            tags = parsed_variants[require_index].tag
-            if not require_grams or tags.grammemes.issuperset(require_grams):
-                return Morphology(tags)
-            else:
-                return None
-        else:
-            for variant in parsed_variants:
-                tags = variant.tag
-                if not require_grams or tags.grammemes.issuperset(require_grams):
-                    return Morphology(tags)
-        return None
-
-    @staticmethod
-    def _get_priorities_for(tag: WordTag) -> tuple[float, float]:
-        ''' Return pair of local and global priorities. '''
-        if tag.POS in ['VERB', 'INFN']:
-            return (9, 10)
-        if tag.POS in ['NOUN', 'NPRO']:
-            return (10, 9) if 'nomn' in tag.grammemes and 'Fixd' not in tag.grammemes else (8, 8)
-        if tag.POS in ['PRTF', 'PRTS']:
-            return (6, 6)
-        if tag.POS in ['ADJF', 'ADJS']:
-            return (5, 5)
-        if tag.POS == 'ADVB':
-            return (7, 4)
-        return (0, 0)
-
-    @staticmethod
-    def _choose_context_etalon(target: Morphology,
-                               before: Optional[Collation],
-                               after: Optional[Collation]) -> Optional[Collation]:
-        if not before or not before.get_morpho().can_coordinate:
-            return after
-        if not after or not after.get_morpho().can_coordinate:
-            return before
-
-        before_semantic = before.get_morpho().semantic
-        after_semantic = after.get_morpho().semantic
-        if target.semantic == SemanticRole.definition:
-            if after_semantic == SemanticRole.term:
-                return after
-            if before_semantic == SemanticRole.term:
-                return before
-            if before_semantic == SemanticRole.definition:
-                return before
-            return after
-
-        if target.semantic == SemanticRole.term:
-            if before_semantic == SemanticRole.definition:
-                return before
-            if after_semantic == SemanticRole.definition:
-                return after
-
-        return before
-
-    @staticmethod
-    def _combine_morpho(target: Morphology, etalon: WordTag) -> frozenset[str]:
-        part_of_speech = target.tag.POS
-        number = etalon.number
-        if number == 'plur':
-            return frozenset([part_of_speech, number, etalon.case])
-        else:
-            gender = etalon.gender if target.semantic != SemanticRole.term else target.tag.gender
-            return frozenset([part_of_speech, number, gender, etalon.case])
diff --git a/import/cctext/syntax.py b/import/cctext/syntax.py
deleted file mode 100644
index 012b396..0000000
--- a/import/cctext/syntax.py
+++ /dev/null
@@ -1,87 +0,0 @@
-''' Russian language syntax incapsulation. '''
-from __future__ import annotations
-from enum import Enum, unique
-
-from razdel import tokenize
-
-
-@unique
-class Capitalization(Enum):
-    ''' Enumerating capitalization types. '''
-    unknwn = 0
-    lower_case = 1
-    upper_case = 2
-    first_capital = 3
-    mixed = 4
-
-    @staticmethod
-    def from_text(text: str) -> Capitalization:
-        ''' Fabric method to identify capitalization in text. '''
-        if len(text) == 0:
-            return Capitalization.unknwn
-        first_capital = Capitalization._is_capital(text[0])
-        has_mid_capital = False
-        has_lower = not first_capital
-        for symbol in text[1:]:
-            if Capitalization._is_capital(symbol):
-                if has_lower:
-                    return Capitalization.mixed
-                has_mid_capital = True
-            else:
-                if has_mid_capital:
-                    return Capitalization.mixed
-                else:
-                    has_lower = True
-        if has_mid_capital:
-            return Capitalization.upper_case
-        elif first_capital:
-            return Capitalization.first_capital
-        else:
-            return Capitalization.lower_case
-
-    def apply_to(self, text: str) -> str:
-        ''' Apply capitalization to text. '''
-        if not text or self in [Capitalization.unknwn, Capitalization.mixed]:
-            return text
-        elif self == Capitalization.lower_case:
-            return text.lower()
-        elif self == Capitalization.upper_case:
-            return text.upper()
-        else:
-            return text[0].upper() + text[1:]
-
-    @staticmethod
-    def _is_capital(symbol: str) -> bool:
-        return 'А' <= symbol <= 'Я' or 'A' <= symbol <= 'Z'
-
-
-class RuSyntax:
-    ''' Russian language syntax parser. '''
-    def __init__(self):
-        pass
-
-    def __del__(self):
-        pass
-
-    @staticmethod
-    def is_single_word(text: str) -> bool:
-        ''' Test if text is a single word. '''
-        try:
-            gen = tokenize(text)
-            if next(gen) == '':
-                return True
-            if next(gen) == '':
-                return True
-            return False
-        except StopIteration:
-            return True
-
-    @staticmethod
-    def tokenize(text: str):
-        ''' Split text into words. Returns list[(start, stop, text)]. '''
-        return tokenize(text)
-
-    @staticmethod
-    def split_words(text: str) -> list[str]:
-        ''' Split text into words. '''
-        return [elem.text for elem in tokenize(text)]
diff --git a/script/ExteorSetup_x64.iss b/script/ExteorSetup_x64.iss
index 6eeacf8..d8a2385 100644
--- a/script/ExteorSetup_x64.iss
+++ b/script/ExteorSetup_x64.iss
@@ -60,11 +60,10 @@ Name: quicklaunchicon; Description: "{cm:CreateQuickLaunchIcon}"; GroupDescripti
 [Files]
 Source: "..\bin\x64\Exteor.exe"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\import\cctext\*"; DestDir: "{app}\cctext"; Flags: ignoreversion
 Source: "app\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs
 Source: "..\distr\app\DejaVu Sans.ttf"; DestDir: "{app}"; Flags: ignoreversion
 
-Source: "..\script\PymorphyInstall.bat"; DestDir: {tmp}; Flags: deleteafterinstall
+Source: "..\script\installDependencies.bat"; DestDir: {tmp}; Flags: deleteafterinstall
 
 [Icons]
 Name: "{group}\{#ExteorName}"; Filename: "{app}\{#ExteorEXE}"
@@ -101,7 +100,7 @@ Filename: "{app}\{#ExteorEXE}"; Description: "{cm:LaunchProgram,{#StringChange(E
 Filename: "{app}\Документация\README.rtf"; Description: "{cm:ReadMe}"; Flags: nowait postinstall skipifsilent unchecked shellexec
 Filename: "{tmp}\{#VSRedist}"; Parameters: "/install /quiet /NORESTART"; StatusMsg: {cm:CppRedist}; Check: VCRedistNeedsInstall()
 Filename: "{tmp}\{#PythonRedist}"; Parameters: "/quiet InstallAllUsers=1 PrependPath=1 Include_test=0"; StatusMsg: {cm:PythonInstall}; Check: PythonNeedsInstall()
-Filename: "{tmp}\PymorphyInstall.bat"; Parameters: """{code:PythonPath}"""
+Filename: "{tmp}\installDependencies.bat"; Parameters: """{code:PythonPath}"""
 
 [Code]
 function IsPythonMissing: Boolean; forward;
diff --git a/script/ExteorSetup_x86.iss b/script/ExteorSetup_x86.iss
index 82aabbd..efc10be 100644
--- a/script/ExteorSetup_x86.iss
+++ b/script/ExteorSetup_x86.iss
@@ -60,11 +60,10 @@ Name: quicklaunchicon; Description: "{cm:CreateQuickLaunchIcon}"; GroupDescripti
 [Files]
 Source: "..\bin\x64\Exteor.exe"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\import\cctext\*"; DestDir: "{app}\cctext"; Flags: ignoreversion
 Source: "app\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs
 Source: "..\distr\app\DejaVu Sans.ttf"; DestDir: "{app}"; Flags: ignoreversion
 
-Source: "..\script\PymorphyInstall.bat"; DestDir: {tmp}; Flags: deleteafterinstall
+Source: "..\script\installDependencies.bat"; DestDir: {tmp}; Flags: deleteafterinstall
 
 [Icons]
 Name: "{group}\{#ExteorName}"; Filename: "{app}\{#ExteorEXE}"
@@ -101,7 +100,7 @@ Filename: "{app}\{#ExteorEXE}"; Description: "{cm:LaunchProgram,{#StringChange(E
 Filename: "{app}\Документация\README.rtf"; Description: "{cm:ReadMe}"; Flags: nowait postinstall skipifsilent unchecked shellexec
 Filename: "{tmp}\{#VSRedist}"; Parameters: "/install /quiet /NORESTART"; StatusMsg: {cm:CppRedist}; Check: VCRedistNeedsInstall()
 Filename: "{tmp}\{#PythonRedist}"; Parameters: "/quiet InstallAllUsers=1 PrependPath=1 Include_test=0"; StatusMsg: {cm:PythonInstall}; Check: PythonNeedsInstall()
-Filename: "{tmp}\PymorphyInstall.bat"; Parameters: """{code:PythonPath}"""
+Filename: "{tmp}\installDependencies.bat"; Parameters: """{code:PythonPath}"""
 
 [Code]
 function IsPythonMissing: Boolean; forward;
diff --git a/script/PymorphyInstall.bat b/script/installDependencies.bat
similarity index 66%
rename from script/PymorphyInstall.bat
rename to script/installDependencies.bat
index f14f9c2..55932f1 100644
--- a/script/PymorphyInstall.bat
+++ b/script/installDependencies.bat
@@ -7,8 +7,7 @@ PING -n 1 www.google.com > nul && (goto :SUCCESS) || (goto :FAILURE)
 
 :SUCCESS
 @echo off
-"%python3Path%Scripts\pip.exe" install razdel || (goto :FAILURE)
-"%python3Path%Scripts\pip.exe" install pymorphy3 || (goto :FAILURE)
+"%python3Path%Scripts\pip.exe" install cctext || (goto :FAILURE)
 goto :END
 
 :FAILURE
diff --git a/test/XTRCoreTest.vcxproj b/test/XTRCoreTest.vcxproj
index 4a46958..5dcf90b 100644
--- a/test/XTRCoreTest.vcxproj
+++ b/test/XTRCoreTest.vcxproj
@@ -154,7 +154,7 @@
       Level4
      true
       $(IntDir)obj\
-      Mock;./;..\include;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+      Mock;./;..\include;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
       stdcpplatest
       false
@@ -163,12 +163,11 @@
       Console
       ConceptCoreLibraryd.lib;RSlangd.lib;cclGraphd.lib;cclLangd.lib;iphlpapi.lib;%(AdditionalDependencies)
       false
-      C:\Program Files (x86)\Python312-32\libs;..\..\ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
+      C:\Program Files (x86)\Python312-32\libs;..\..\GH-ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
       /ignore:4099 %(AdditionalOptions)
-      xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+      copy "Data\sample_module.py" "$(OutDir)sample_module.py"
 copy "C:\Program Files (x86)\Python312-32\Python312_d.dll" "$(OutDir)"
 copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
@@ -184,7 +183,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
       Level4
       true
       $(IntDir)obj\
-      Mock;./;..\include;..\header;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+      Mock;./;..\include;..\header;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
       stdcpplatest
       false
@@ -193,12 +192,11 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
       Console
       ConceptCoreLibraryd.lib;RSlangd.lib;cclGraphd.lib;cclLangd.lib;iphlpapi.lib;%(AdditionalDependencies)
       false
-      C:\Program Files\Python312\libs;..\..\ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
+      C:\Program Files\Python312\libs;..\..\GH-ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
       /ignore:4099 %(AdditionalOptions)
-      xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+      copy "Data\sample_module.py" "$(OutDir)sample_module.py"
@@ -212,7 +210,7 @@ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
       false
       true
       $(IntDir)obj\
-      Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+      Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
       stdcpplatest
       false
@@ -223,12 +221,11 @@ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
       true
       ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;iphlpapi.lib;%(AdditionalDependencies)
       UseLinkTimeCodeGeneration
-      C:\Program Files (x86)\Python312-32\libs;..\..\ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
+      C:\Program Files (x86)\Python312-32\libs;..\..\GH-ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
       /ignore:4099 %(AdditionalOptions)
-      xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+      copy "Data\sample_module.py" "$(OutDir)sample_module.py"
 copy "C:\Program Files (x86)\Python312-32\Python312.dll" "$(OutDir)"
 copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
@@ -244,7 +241,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
       false
       true
       $(IntDir)obj\
-      Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+      Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
       stdcpplatest
       false
@@ -256,12 +253,11 @@ copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
       ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;iphlpapi.lib;%(AdditionalDependencies)
       UseLinkTimeCodeGeneration
       false
-      C:\Program Files\Python312\libs;..\..\ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
+      C:\Program Files\Python312\libs;..\..\GH-ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
      /ignore:4099 %(AdditionalOptions)
-      xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+      copy "Data\sample_module.py" "$(OutDir)sample_module.py"
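
Note on the new dependency flow: the bundled import/cctext sources are no longer copied next to the executable; installDependencies.bat now runs "pip install cctext" instead of installing razdel and pymorphy3 separately. A minimal usage sketch follows, assuming the published cctext distribution exports the same top-level API as the removed conceptapi.py (parse, normalize, inflect); the sample phrases and expected outputs are illustrative only, not taken from the package documentation:

    # Sketch only: assumes `pip install cctext` has already run
    # (this is what installDependencies.bat now does on the target machine).
    import cctext as ct

    ct.parse('синий слон')             # grammeme tags as a comma-separated string, per the removed parse()
    ct.normalize('синих слонов')       # normal form of the phrase, e.g. 'синий слон'
    ct.inflect('синий слон', 'datv')   # phrase inflected into dative case, or the input on failure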