From a210de14ff2ee01f385f1f72882ad17cbf0e508a Mon Sep 17 00:00:00 2001
From: IRBorisov <8611739+IRBorisov@users.noreply.github.com>
Date: Fri, 14 Jun 2024 18:33:34 +0300
Subject: [PATCH] Update build process and remove cctext inlining
---
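Notes: the inlined import/cctext sources are removed; installDependencies.bat now installs the cctext package from the package index, so the interpreter embedded by Exteor imports it from site-packages instead of an xcopy'd folder next to the executable. A quick smoke test for the new setup (a sketch; assumes the published cctext package exposes the same top-level API as the deleted import/cctext module):

    # smoke_test_cctext.py -- run with the interpreter Exteor embeds
    import cctext  # pip-installed package, replacing the bundled copy

    # parse() returns comma-separated grammeme tags, '' when no parse is found
    print(cctext.parse('книга'))      # e.g. 'NOUN,inan,femn,sing,nomn'
    # normalize() falls back to the input text when no parse is available
    print(cctext.normalize('книги'))  # expected: 'книга'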
Exteor.vcxproj | 30 +-
ExteorWithCCL.sln | 18 +-
import/cctext/__init__.py | 16 -
import/cctext/conceptapi.py | 90 ----
import/cctext/context.py | 84 ---
import/cctext/reference.py | 60 ---
import/cctext/resolver.py | 140 -----
import/cctext/rumodel.py | 118 -----
import/cctext/ruparser.py | 486 ------------------
import/cctext/syntax.py | 87 ----
script/ExteorSetup_x64.iss | 5 +-
script/ExteorSetup_x86.iss | 5 +-
...phyInstall.bat => installDependencies.bat} | 3 +-
test/XTRCoreTest.vcxproj | 28 +-
14 files changed, 39 insertions(+), 1131 deletions(-)
delete mode 100644 import/cctext/__init__.py
delete mode 100644 import/cctext/conceptapi.py
delete mode 100644 import/cctext/context.py
delete mode 100644 import/cctext/reference.py
delete mode 100644 import/cctext/resolver.py
delete mode 100644 import/cctext/rumodel.py
delete mode 100644 import/cctext/ruparser.py
delete mode 100644 import/cctext/syntax.py
rename script/{PymorphyInstall.bat => installDependencies.bat} (66%)
diff --git a/Exteor.vcxproj b/Exteor.vcxproj
index 846e0e0..6e57d57 100644
--- a/Exteor.vcxproj
+++ b/Exteor.vcxproj
@@ -84,7 +84,7 @@
- include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+ include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
POCO_STATIC; NDEBUG;_WINDOWS;%(PreprocessorDefinitions)
Use
Level4
@@ -101,7 +101,7 @@
false
- lib\x86;..\ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
+ lib\x86;..\GH-ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
Version.lib;OfficeOLE.lib;ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;oldnames.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
Windows
true
@@ -121,8 +121,7 @@
$(IntDir);%(AdditionalIncludeDirectories)
- xcopy /y /s /q /i "import\cctext\" "$(OutDir)\cctext"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+ xcopy /y /s /q /i "distr\app" "$(OutDir)"
copy "C:\Program Files (x86)\Python312-32\Python312.dll" "$(OutDir)"
copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
if not exist bin\x86 mkdir bin\x86
@@ -131,7 +130,7 @@ copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
- include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+ include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
POCO_STATIC;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)
Use
Level4
@@ -148,7 +147,7 @@ copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
false
- lib\x64;..\ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
+ lib\x64;..\GH-ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
Version.lib;OfficeOLE.lib;ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;oldnames.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
Windows
true
@@ -168,8 +167,7 @@ copy "$(OutDir)Exteor.exe" "bin\x86\Exteor.exe"
$(IntDir);%(AdditionalIncludeDirectories)
- xcopy /y /s /q /i "import\cctext" "$(OutDir)\cctext\"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+ xcopy /y /s /q /i "distr\app" "$(OutDir)"
if not exist bin\x64 mkdir bin\x64
copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
@@ -182,7 +180,7 @@ copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
true
stdcpplatest
true
- include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+ include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
POCO_STATIC;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)
$(IntDir)
$(IntDir)obj\
@@ -191,7 +189,7 @@ copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
Windows
- lib\x86;..\ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
+ lib\x86;..\GH-ConceptCore\output\lib\x86;..\OfficeOLE\output\lib\x86;C:\Program Files (x86)\Python312-32\libs;%(AdditionalLibraryDirectories)
Version.lib;OfficeOLEd.lib;ConceptCoreLibraryd.lib;cclLangd.lib;RSlangd.lib;cclGraphd.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
$(OutDir)$(TargetName)$(TargetExt)
$(OutDir)$(ProjectName).pdb
@@ -211,8 +209,7 @@ copy "$(OutDir)Exteor.exe" "bin\x64\Exteor.exe"
$(IntDir);%(AdditionalIncludeDirectories)
- xcopy /y /s /q /i "import\cctext" "$(OutDir)\cctext\"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+ xcopy /y /s /q /i "distr\app" "$(OutDir)"
copy "C:\Program Files (x86)\Python312-32\Python312_d.dll" "$(OutDir)"
copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
@@ -229,7 +226,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
true
stdcpplatest
true
- include;header;src\pch;..\ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+ include;header;src\pch;..\GH-ConceptCore\output\include;..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
POCO_STATIC; _DEBUG;_WINDOWS;%(PreprocessorDefinitions)
$(IntDir)
$(IntDir)obj\
@@ -238,7 +235,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
Windows
- lib\x64;..\ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
+ lib\x64;..\GH-ConceptCore\output\lib\x64;..\OfficeOLE\output\lib\x64;C:\Program Files\Python312\libs;%(AdditionalLibraryDirectories)
Version.lib;OfficeOLEd.lib;ConceptCoreLibraryd.lib;cclLangd.lib;RSlangd.lib;cclGraphd.lib;Htmlhelp.Lib;iphlpapi.lib;%(AdditionalDependencies)
$(OutDir)$(TargetName)$(TargetExt)
$(OutDir)$(ProjectName).pdb
@@ -257,8 +254,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
$(IntDir);%(AdditionalIncludeDirectories)
- xcopy /y /s /q /i "import\cctext" "$(OutDir)\cctext\"
-xcopy /y /s /q /i "distr\app" "$(OutDir)"
+ xcopy /y /s /q /i "distr\app" "$(OutDir)"
@@ -437,7 +433,7 @@ xcopy /y /s /q /i "distr\app" "$(OutDir)"
-    <ProjectReference Include="..\ConceptCore\ccl\core\ConceptLibrary.vcxproj">
+    <ProjectReference Include="..\GH-ConceptCore\ccl\core\ConceptLibrary.vcxproj">
{b0aba27b-9d39-4b48-9977-aff20925b309}
diff --git a/ExteorWithCCL.sln b/ExteorWithCCL.sln
index e2a71bf..d0b8b76 100644
--- a/ExteorWithCCL.sln
+++ b/ExteorWithCCL.sln
@@ -8,9 +8,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Exteor", "Exteor.vcxproj",
{B0ABA27B-9D39-4B48-9977-AFF20925B309} = {B0ABA27B-9D39-4B48-9977-AFF20925B309}
EndProjectSection
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ConceptCore", "..\ConceptCore\ccl\core\ConceptLibrary.vcxproj", "{B0ABA27B-9D39-4B48-9977-AFF20925B309}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ConceptCore", "..\GH-ConceptCore\ccl\core\ConceptLibrary.vcxproj", "{B0ABA27B-9D39-4B48-9977-AFF20925B309}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclTest", "..\ConceptCore\ccl\core\test\cclTest.vcxproj", "{F87048D4-952A-460E-96E8-1E2E1EAE34FC}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclTest", "..\GH-ConceptCore\ccl\core\test\cclTest.vcxproj", "{F87048D4-952A-460E-96E8-1E2E1EAE34FC}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "XTRCoreTest", "test\XTRCoreTest.vcxproj", "{576D16B8-96BF-4B5A-8B09-E1916375E34F}"
ProjectSection(ProjectDependencies) = postProject
@@ -18,19 +18,19 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "XTRCoreTest", "test\XTRCore
{F87048D4-952A-460E-96E8-1E2E1EAE34FC} = {F87048D4-952A-460E-96E8-1E2E1EAE34FC}
EndProjectSection
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSlang", "..\ConceptCore\ccl\rslang\RSlang.vcxproj", "{A8529C63-42F5-43E6-97B8-2EC83F23E1F9}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSlang", "..\GH-ConceptCore\ccl\rslang\RSlang.vcxproj", "{A8529C63-42F5-43E6-97B8-2EC83F23E1F9}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSLangTest", "..\ConceptCore\ccl\rslang\test\rslTest.vcxproj", "{32469CE1-303B-4DB4-8E03-B7EBED5851EB}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RSLangTest", "..\GH-ConceptCore\ccl\rslang\test\rslTest.vcxproj", "{32469CE1-303B-4DB4-8E03-B7EBED5851EB}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraph", "..\ConceptCore\ccl\cclGraph\cclGraph.vcxproj", "{7E1D5338-F819-4C96-B461-9EAAB8D02E1D}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraph", "..\GH-ConceptCore\ccl\cclGraph\cclGraph.vcxproj", "{7E1D5338-F819-4C96-B461-9EAAB8D02E1D}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraphTest", "..\ConceptCore\ccl\cclGraph\test\cclGraphTest.vcxproj", "{5A2501C1-FEFB-4B14-A94D-E8F19ADEA239}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclGraphTest", "..\GH-ConceptCore\ccl\cclGraph\test\cclGraphTest.vcxproj", "{5A2501C1-FEFB-4B14-A94D-E8F19ADEA239}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclCommonsTest", "..\ConceptCore\ccl\cclCommons\test\cclCommonsTest.vcxproj", "{53A380CF-B599-4170-89B1-642F1C3772E1}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclCommonsTest", "..\GH-ConceptCore\ccl\cclCommons\test\cclCommonsTest.vcxproj", "{53A380CF-B599-4170-89B1-642F1C3772E1}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLang", "..\ConceptCore\ccl\cclLang\cclLang.vcxproj", "{76B03803-56CC-47C2-A8F0-2241FCAF2898}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLang", "..\GH-ConceptCore\ccl\cclLang\cclLang.vcxproj", "{76B03803-56CC-47C2-A8F0-2241FCAF2898}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLangTest", "..\ConceptCore\ccl\cclLang\test\cclLangTest.vcxproj", "{4754356B-DC01-4564-A035-270FFB72F6A0}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cclLangTest", "..\GH-ConceptCore\ccl\cclLang\test\cclLangTest.vcxproj", "{4754356B-DC01-4564-A035-270FFB72F6A0}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "libs", "libs", "{DC591058-8A8A-460E-93D0-B57C848DF12B}"
EndProject
diff --git a/import/cctext/__init__.py b/import/cctext/__init__.py
deleted file mode 100644
index 35bcebe..0000000
--- a/import/cctext/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-''' Concept core text processing library. '''
-# pylint: skip-file
-from .syntax import RuSyntax, Capitalization
-from .rumodel import Morphology, SemanticRole, WordTag, morpho, split_grams, combine_grams
-from .ruparser import PhraseParser, WordToken, Collation
-from .reference import EntityReference, ReferenceType, SyntacticReference, parse_reference
-from .context import TermForm, Entity, TermContext
-from .resolver import Reference, Position, Resolver, ResolvedReference, resolve_entity, resolve_syntactic, extract_entities
-
-from .conceptapi import (
-    parse, normalize,
-    generate_lexeme, inflect, inflect_context, inflect_substitute, inflect_dependant,
-    match_all_morpho, find_substr
-)
-
-# TODO: implement Part of speech transition for VERB <-> NOUN
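The deleted __init__.py defined the public surface consumed from C++ through the embedded interpreter. Equivalent imports against the pip-installed package (a sketch, assuming the published package keeps the same exports):

    from cctext import (
        # high-level API from conceptapi
        parse, normalize, generate_lexeme, inflect,
        inflect_context, inflect_substitute, inflect_dependant,
        match_all_morpho, find_substr,
        # reference resolution
        Resolver, TermContext, Entity, extract_entities,
    )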
diff --git a/import/cctext/conceptapi.py b/import/cctext/conceptapi.py
deleted file mode 100644
index f6f8e2d..0000000
--- a/import/cctext/conceptapi.py
+++ /dev/null
@@ -1,90 +0,0 @@
-'''
-Concept API Python functions.
-
-::guarantee:: doesn't raise exceptions and returns workable outputs
-'''
-from .rumodel import Morphology
-from .syntax import RuSyntax
-from .ruparser import PhraseParser
-from .rumodel import split_grams
-
-parser = PhraseParser()
-
-
-def parse(text: str, require_grams: str = '') -> str:
-    ''' Determine morpho tags for input text.
-    ::returns:: string of comma separated grammar tags or empty string '''
-    model = parser.parse(text, require_grams=split_grams(require_grams))
-    if model is None:
-        return ''
-    result = model.get_morpho().to_text()
-    return result if result != 'UNKN' else ''
-
-
-# def parse_variants(text: str, require_grams: str = '') -> list[tuple[str, str]]:
-#     ''' Get all variants of a parse.
-#     ::returns:: string of comma separated grammar tags or empty string '''
-
-
-def generate_lexeme(text_normal: str) -> list[tuple[str, str]]:
-    ''' Get all inflected forms belonging to same Lexeme. '''
-    model = parser.parse(text_normal)
-    if not model:
-        return []
-    result = []
-    for form in model.get_form().lexeme:
-        result.append((model.inflect(form.tag.grammemes), Morphology(form.tag).to_text()))
-    return result
-
-
-def normalize(text: str) -> str:
-    ''' Generate normal form.
-    ::returns:: normal form of input text or text itself if no parse is available '''
-    model = parser.parse(text)
-    if model is None:
-        return text
-    return model.normal_form()
-
-
-def inflect(text: str, target_grams: str) -> str:
-    ''' Inflect text to match required tags.
-    ::returns:: inflected text or initial text if inflection failed '''
-    target_set = split_grams(target_grams)
-    model = parser.parse(text)
-    if model is None:
-        return text
-    return model.inflect(target_set)
-
-
-def inflect_context(target: str, before: str = '', after: str = '') -> str:
-    ''' Inflect text in accordance with context before and after. '''
-    return parser.inflect_context(target, before, after)
-
-
-def inflect_substitute(substitute_normal: str, original: str) -> str:
-    ''' Inflect substitute to match original form. '''
-    return parser.inflect_substitute(substitute_normal, original)
-
-
-def inflect_dependant(dependant_normal: str, master: str) -> str:
-    ''' Inflect dependant to coordinate with master text. '''
-    return parser.inflect_dependant(dependant_normal, master)
-
-
-def match_all_morpho(text: str, filter_grams: str) -> list[list[int]]:
-    ''' Search for all words corresponding to tags. '''
-    target_set = split_grams(filter_grams)
-    if len(target_set) == 0:
-        return []
-
-    result = []
-    for elem in RuSyntax.tokenize(text):
-        model = parser.parse(elem.text, require_grams=target_set)
-        if model:
-            result.append([elem.start, elem.stop])
-    return result
-
-
-def find_substr(text: str, sub: str) -> tuple[int, int]:
-    ''' Search for substring position in text regardless of morphology. '''
-    return parser.find_substr(text, sub)
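For reference, these high-level functions never raise and degrade to the input text on failure; a usage sketch against the pip-installed package (sample words and grammemes are illustrative):

    import cctext as ct

    ct.inflect('книга', 'plur,datv')              # expected: 'книгам' (dative plural)
    ct.inflect('книга', 'bogus')                  # 'книга' -- failed inflection returns input
    ct.match_all_morpho('красная книга', 'NOUN')  # expected: [[8, 13]] -- spans of matching words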
diff --git a/import/cctext/context.py b/import/cctext/context.py
deleted file mode 100644
index de487cd..0000000
--- a/import/cctext/context.py
+++ /dev/null
@@ -1,84 +0,0 @@
-''' Term context for reference resolution. '''
-from typing import Iterable, Optional, TypedDict
-
-from .ruparser import PhraseParser
-from .rumodel import WordTag
-
-
-parser = PhraseParser()
-
-
-class TermForm(TypedDict):
-    ''' Represents term in a specific form. '''
-    text: str
-    grams: Iterable[str]
-
-
-def _match_grams(query: Iterable[str], test: Iterable[str]) -> bool:
-    ''' Check if grams from test fit query. '''
-    for gram in test:
-        if not gram in query:
-            if not gram in WordTag.PARTS_OF_SPEECH:
-                return False
-            for pos in WordTag.PARTS_OF_SPEECH:
-                if pos in query:
-                    return False
-    return True
-
-
-def _search_form(query: Iterable[str], data: Iterable[TermForm]) -> Optional[str]:
-    for form in data:
-        if _match_grams(query, form['grams']):
-            return form['text']
-    return None
-
-
-class Entity:
-    ''' Represents text entity. '''
-    def __init__(self, alias: str, nominal: str, manual_forms: Optional[Iterable[TermForm]] = None):
-        if manual_forms is None:
-            self.manual = []
-        else:
-            self.manual = list(manual_forms)
-        self.alias = alias
-        self._nominal = nominal
-        self._cached: list[TermForm] = []
-
-    def get_nominal(self) -> str:
-        ''' Getter for _nominal. '''
-        return self._nominal
-
-    def set_nominal(self, new_text: str):
-        ''' Setter for _nominal.
-        Note: clears manual and cached forms. '''
-        if self._nominal == new_text:
-            return
-        self._nominal = new_text
-        self.manual = []
-        self._cached = []
-
-    def get_form(self, grams: Iterable[str]) -> str:
-        ''' Get specific term form. '''
-        if all(False for _ in grams):
-            return self._nominal
-        text = _search_form(grams, self.manual)
-        if text is not None:
-            return text
-        text = _search_form(grams, self._cached)
-        if text is not None:
-            return text
-
-        model = parser.parse(self._nominal)
-        if model is None:
-            text = self._nominal
-        else:
-            try:
-                text = model.inflect(grams)
-            except ValueError as error:
-                text = f'!{error}!'.replace('Unknown grammeme', 'Неизвестная граммема')
-        self._cached.append({'text': text, 'grams': grams})
-        return text
-
-
-# Represents term context for resolving entity references.
-TermContext = dict[str, Entity]
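Entity caches inflected forms and prefers manual forms over generated ones; a minimal sketch of building a TermContext (the alias X1 and the sample term are illustrative):

    from cctext import Entity, TermContext

    context: TermContext = {
        'X1': Entity(alias='X1', nominal='книга',
                     manual_forms=[{'text': 'книги', 'grams': ['plur', 'nomn']}])
    }
    # a matching manual form wins; otherwise the form is inflected from the nominal and cached
    context['X1'].get_form(['plur', 'nomn'])  # 'книги'
    context['X1'].get_form([])                # 'книга' -- empty grams return the nominal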
diff --git a/import/cctext/reference.py b/import/cctext/reference.py
deleted file mode 100644
index c2733bb..0000000
--- a/import/cctext/reference.py
+++ /dev/null
@@ -1,60 +0,0 @@
-''' Text reference API. '''
-from enum import Enum, unique
-from typing import Optional, Union
-
-
-@unique
-class ReferenceType(Enum):
-    ''' Text reference types. '''
-    entity = 'entity'
-    syntactic = 'syntax'
-
-
-class EntityReference:
-    ''' Reference to entity. '''
-
-    def __init__(self, identifier: str, form: str):
-        self.entity = identifier
-        self.form = form
-
-    def get_type(self) -> ReferenceType:
-        return ReferenceType.entity
-
-    def to_text(self) -> str:
-        return f'@{{{self.entity}|{self.form}}}'
-
-
-class SyntacticReference:
-    ''' Reference to syntactic dependency on EntityReference. '''
-
-    def __init__(self, referral_offset: int, text: str):
-        self.nominal = text
-        self.offset = referral_offset
-
-    def get_type(self) -> ReferenceType:
-        return ReferenceType.syntactic
-
-    def to_text(self) -> str:
-        return f'@{{{self.offset}|{self.nominal}}}'
-
-
-Reference = Union[EntityReference, SyntacticReference]
-
-
-def parse_reference(text: str) -> Optional[Reference]:
-    if len(text) < 4 or text[-1] != '}' or text[0:2] != '@{':
-        return None
-    blocks: list[str] = [block.strip() for block in text[2:-1].split('|')]
-    if len(blocks) != 2 or blocks[0] == '' or blocks[0][0] in '0':
-        return None
-    if blocks[0][0] in '-123456789':
-        if blocks[1] == '':
-            return None
-        try:
-            offset = int(blocks[0])
-            return SyntacticReference(offset, blocks[1])
-        except ValueError:
-            return None
-    else:
-        form = blocks[1].replace(' ', '')
-        return EntityReference(blocks[0], form)
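parse_reference() dispatches on the first block: a leading digit or minus sign makes it a syntactic reference, anything else an entity reference; a short sketch (sample inputs are illustrative):

    from cctext import parse_reference

    ref = parse_reference('@{X1|sing,nomn}')  # EntityReference: entity='X1', form='sing,nomn'
    dep = parse_reference('@{-1|красный}')    # SyntacticReference: offset=-1, nominal='красный'
    parse_reference('@{0|text}')              # None -- a leading '0' is rejected
    ref.to_text()                             # '@{X1|sing,nomn}'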
diff --git a/import/cctext/resolver.py b/import/cctext/resolver.py
deleted file mode 100644
index e9df3df..0000000
--- a/import/cctext/resolver.py
+++ /dev/null
@@ -1,140 +0,0 @@
-''' Reference resolution API. '''
-import re
-from typing import cast, Optional
-from dataclasses import dataclass
-
-from .rumodel import split_grams
-
-from .conceptapi import inflect_dependant
-from .context import TermContext
-from .reference import EntityReference, SyntacticReference, parse_reference, Reference
-
-
-_REF_ENTITY_PATTERN = re.compile(r'@{([^0-9\-][^\}\|\{]*?)\|([^\}\|\{]*?)}')
-
-
-def extract_entities(text: str) -> list[str]:
-    ''' Extract list of entities that are referenced. '''
-    result: list[str] = []
-    for segment in re.finditer(_REF_ENTITY_PATTERN, text):
-        entity = segment.group(1)
-        if entity not in result:
-            result.append(entity)
-    return result
-
-
-def resolve_entity(ref: EntityReference, context: TermContext) -> str:
-    ''' Resolve entity reference. '''
-    alias = ref.entity
-    if alias not in context:
-        return f'!Неизвестная сущность: {alias}!'
-    grams = split_grams(ref.form)
-    resolved = context[alias].get_form(grams)
-    if resolved == '':
-        return f'!Отсутствует термин: {alias}!'
-    else:
-        return resolved
-
-
-def resolve_syntactic(ref: SyntacticReference, index: int, references: list['ResolvedReference']) -> str:
-    ''' Resolve syntactic reference. '''
-    offset = ref.offset
-    master: Optional['ResolvedReference'] = None
-    if offset > 0:
-        index += 1
-        while index < len(references):
-            if isinstance(references[index].ref, EntityReference):
-                if offset == 1:
-                    master = references[index]
-                    break
-                offset -= 1
-            index += 1
-    else:
-        index -= 1
-        while index >= 0:
-            if isinstance(references[index].ref, EntityReference):
-                if offset == -1:
-                    master = references[index]
-                    break
-                offset += 1
-            index -= 1
-    if master is None:
-        return f'!Некорректное смещение: {ref.offset}!'
-    return inflect_dependant(ref.nominal, master.resolved)
-
-
-@dataclass
-class Position:
-    ''' 0-indexed contiguous segment position in text. '''
-    start: int = 0
-    finish: int = 0
-
-    def __hash__(self) -> int:
-        return hash((self.start, self.finish))
-
-
-@dataclass
-class ResolvedReference:
-    ''' Resolved reference data. '''
-    ref: Reference
-    resolved: str = ''
-    pos_input: Position = Position()
-    pos_output: Position = Position()
-
-    def __hash__(self) -> int:
-        return hash((self.resolved, self.pos_input, self.pos_output, self.ref.to_text()))
-
-
-class Resolver:
-    ''' Text reference resolver. '''
-    REFERENCE_PATTERN = re.compile(r'@{[^\}\{]*?}')
-
-    def __init__(self, context: TermContext):
-        self.context = context
-        self.refs = cast(list[ResolvedReference], [])
-        self.input = ''
-        self.output = ''
-
-    def resolve(self, text: str) -> str:
-        ''' Resolve references in input text.
-        Note: data on reference positions is accessed through class attributes. '''
-        self._reset(text)
-        self._parse_refs()
-        if len(self.refs) == 0:
-            self.output = self.input
-            return self.output
-        else:
-            self._resolve_refs()
-            self._combine_output()
-            return self.output
-
-    def _reset(self, input_text: str):
-        self.refs = cast(list[ResolvedReference], [])
-        self.input = input_text
-        self.output = ''
-
-    def _parse_refs(self):
-        for segment in re.finditer(Resolver.REFERENCE_PATTERN, self.input):
-            parse = parse_reference(segment[0])
-            if parse is not None:
-                ref_info = ResolvedReference(ref=parse,
-                                             resolved='',
-                                             pos_input=Position(segment.start(0), segment.end(0)))
-                self.refs.append(ref_info)
-
-    def _resolve_refs(self):
-        for ref in self.refs:
-            if isinstance(ref.ref, EntityReference):
-                ref.resolved = resolve_entity(ref.ref, self.context)
-        for (index, ref) in enumerate(self.refs):
-            if isinstance(ref.ref, SyntacticReference):
-                ref.resolved = resolve_syntactic(ref.ref, index, self.refs)
-
-    def _combine_output(self):
-        pos_in = 0
-        for ref in self.refs:
-            self.output += self.input[pos_in : ref.pos_input.start]
-            self.output += ref.resolved
-            ref.pos_output = Position(len(self.output) - len(ref.resolved), len(self.output))
-            pos_in = ref.pos_input.finish
-        self.output += self.input[pos_in : len(self.input)]
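Resolver substitutes each @{...} occurrence and records input/output positions on its ResolvedReference entries; a minimal sketch (the sample text and entity are illustrative):

    from cctext import Entity, Resolver

    resolver = Resolver({'X1': Entity('X1', 'книга')})
    resolver.resolve('Она читает @{X1|sing,accs}.')  # expected: 'Она читает книгу.'
    for ref in resolver.refs:
        # pos_input spans the @{...} markup, pos_output the substituted text
        print(ref.resolved, ref.pos_input, ref.pos_output)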
diff --git a/import/cctext/rumodel.py b/import/cctext/rumodel.py
deleted file mode 100644
index 8f5b4cc..0000000
--- a/import/cctext/rumodel.py
+++ /dev/null
@@ -1,118 +0,0 @@
-''' Russian language models. '''
-from __future__ import annotations
-from enum import Enum, unique
-from typing import Iterable, Optional
-
-from pymorphy3 import MorphAnalyzer
-from pymorphy3.tagset import OpencorporaTag as WordTag
-
-# ''' Morphology parser. '''
-morpho = MorphAnalyzer()
-Grammemes = Iterable[str]
-
-
-def split_grams(text: str) -> list[str]:
-    ''' Split grammemes string into a list of items. '''
-    return [tag.strip() for tag in filter(None, text.split(','))]
-
-
-def combine_grams(tags: Iterable[str]) -> str:
-    ''' Combine grammemes into string. '''
-    return ','.join(tags)
-
-
-@unique
-class SemanticRole(Enum):
-    ''' Enumerating semantic types for different parse patterns. '''
-    unknwn = 0
-    term = 1
-    action = 2
-    definition = 3
-
-    @staticmethod
-    def from_POS(pos: Optional[str]) -> SemanticRole:
-        ''' Production method: types from part of speech. '''
-        if pos in ['NOUN', 'NPRO']:
-            return SemanticRole.term
-        elif pos in ['VERB', 'INFN', 'PRTF', 'PRTS']:
-            return SemanticRole.action
-        elif pos in ['ADJF', 'ADJS']:
-            return SemanticRole.definition
-        return SemanticRole.unknwn
-
-
-class Morphology:
-    ''' Wrapper for OpencorporaTag expanding functionality for multiword.
-    For full morphology tags see http://opencorpora.org/dict.php?act=gram
-    '''
-    def __init__(self, tag: WordTag, semantic=SemanticRole.unknwn):
-        self.tag = tag
-        self.semantic = semantic if semantic != SemanticRole.unknwn else SemanticRole.from_POS(tag.POS)
-
-    _TAGS_IMMUTABLE = frozenset(['INFN', 'ADVB', 'COMP', 'PNCT', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
-
-    _TAGS_NO_TENSE = frozenset(['NOUN', 'NPRO', 'ADJF', 'ADJS'])
-    _TAGS_NO_CASE = frozenset(['GRND', 'VERB', 'ADJS', 'PRTS'])
-    _TAGS_NO_NUMBER = frozenset(['GRND'])
-    _TAGS_NO_GENDER = frozenset(['GRND', 'NOUN', 'NPRO', 'plur'])
-    _TAGS_NO_PERSON = frozenset(['GRND', 'NOUN', 'ADJF', 'ADJS', 'PRTF', 'PRTS', 'past'])
-
-    @property
-    def can_coordinate(self) -> bool:
-        ''' Check if coordination can change text. '''
-        return self.tag.POS in ['NOUN', 'NPRO', 'NUMR', 'ADJF', 'ADJS', 'PRTF', 'PRTS']
-
-    @staticmethod
-    def is_dependable(pos: str):
-        ''' Check if this morphology can be dependant. '''
-        return pos in ['ADJF', 'ADJS', 'PRTF', 'PRTS']
-
-    @property
-    def effective_POS(self) -> Optional[str]:
-        ''' Access part of speech. Pronouns are considered nouns. '''
-        pos: Optional[str] = self.tag.POS
-        if pos and self.tag.POS == 'NPRO':
-            return 'NOUN'
-        return pos
-
-    def complete_grams(self, grams: Iterable[str]) -> set[str]:
-        ''' Add missing tags before inflection. '''
-        result = set(grams)
-        pos = self.tag.POS
-        if pos and result.isdisjoint(WordTag.PARTS_OF_SPEECH):
-            result.add(pos if pos != 'INFN' or len(result) == 0 else 'VERB')
-        if not result.isdisjoint(self._TAGS_IMMUTABLE):
-            return result
-        if self.tag.case and result.isdisjoint(WordTag.CASES) and result.isdisjoint(self._TAGS_NO_CASE):
-            result.add(self.tag.case)
-        if self.tag.tense and result.isdisjoint(WordTag.TENSES) and result.isdisjoint(self._TAGS_NO_TENSE):
-            if (self.tag.tense != 'past' or result.isdisjoint(WordTag.PERSONS)) \
-                    and (self.tag.tense != 'pres' or result.isdisjoint(WordTag.GENDERS)):
-                result.add(self.tag.tense)
-        if self.tag.number and result.isdisjoint(WordTag.NUMBERS) and result.isdisjoint(self._TAGS_NO_NUMBER):
-            if self.tag.number != 'plur' or result.isdisjoint(WordTag.GENDERS):
-                result.add(self.tag.number)
-        if self.tag.gender and result.isdisjoint(WordTag.GENDERS) and result.isdisjoint(self._TAGS_NO_GENDER):
-            if 'PRTF' in result or 'pres' not in result:
-                result.add(self.tag.gender)
-        if self.tag.person and result.isdisjoint(WordTag.PERSONS) and result.isdisjoint(self._TAGS_NO_PERSON):
-            result.add(self.tag.person)
-        if 'plur' in result and not result.isdisjoint(WordTag.GENDERS):
-            result = result.difference(WordTag.GENDERS)
-        return result
-
-    def coordination_grams(self) -> set[str]:
-        ''' Return set of grammemes for inflection to keep coordination. '''
-        result = set()
-        if self.tag.case:
-            result.add(self.tag.case)
-        if self.tag.number:
-            result.add(self.tag.number)
-        if self.tag.gender and 'plur' not in result:
-            result.add(self.tag.gender)
-        return result
-
-    def to_text(self) -> str:
-        ''' Produce string of all grammemes. '''
-        return combine_grams(self.tag.grammemes)
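split_grams()/combine_grams() are the round-trip between the comma-separated form used across the API and individual OpenCorpora grammemes; a short sketch:

    from cctext import split_grams, combine_grams

    grams = split_grams(' sing, nomn ,')  # ['sing', 'nomn'] -- blanks stripped, empties dropped
    combine_grams(grams)                  # 'sing,nomn'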
diff --git a/import/cctext/ruparser.py b/import/cctext/ruparser.py
deleted file mode 100644
index 7b64cd2..0000000
--- a/import/cctext/ruparser.py
+++ /dev/null
@@ -1,486 +0,0 @@
-''' Parsing Russian language using pymorphy3 library. '''
-from __future__ import annotations
-from typing import Optional
-
-from razdel.substring import Substring as Segment
-from pymorphy3.analyzer import Parse as WordParse
-
-from .syntax import RuSyntax, Capitalization
-from .rumodel import SemanticRole, Morphology, WordTag, morpho, Grammemes
-
-INDEX_NONE = -1
-NO_COORDINATION = -1
-WORD_NONE = -1
-
-
-class WordToken:
-    ''' Atomic text token. '''
-    def __init__(self, segment: Segment, parse: list[WordParse], main_parse: int = 0):
-        self.segment: Segment = segment
-        self.forms: list[WordParse] = parse
-        self.main: int = main_parse
-
-    def get_morpho(self) -> Morphology:
-        ''' Return morphology for current token. '''
-        return Morphology(self.get_parse().tag)
-
-    def get_parse(self) -> WordParse:
-        ''' Access main form. '''
-        return self.forms[self.main]
-
-    def inflect(self, inflection_grams: set[str]) -> Optional[WordParse]:
-        ''' Apply inflection to segment text. Does not modify forms. '''
-        inflected = self.get_parse().inflect(inflection_grams)
-        if not inflected:
-            return None
-        self.segment.text = Capitalization.from_text(self.segment.text).apply_to(inflected.word)
-        return inflected
-
-
-class Collation:
-    ''' Parsed data for input coordinated text. '''
-    def __init__(self, text: str):
-        self.text = text
-        self.words: list[WordToken] = []
-        self.coordination: list[int] = []
-        self.main_word: int = WORD_NONE
-
-    def is_valid(self) -> bool:
-        ''' Check if data is parsed correctly. '''
-        return self.main_word != WORD_NONE
-
-    def get_form(self) -> WordParse:
-        ''' Access WordParse. '''
-        return self.words[self.main_word].get_parse()
-
-    def get_morpho(self) -> Morphology:
-        ''' Access parsed main morphology. '''
-        return self.words[self.main_word].get_morpho()
-
-    def add_word(self, segment, forms: list, main_form: int, need_coordination: bool = True):
-        ''' Add word information. '''
-        self.words.append(WordToken(segment, forms, main_form))
-        self.coordination.append(NO_COORDINATION if not need_coordination else 0)
-
-    def inflect(self, target_grams: Grammemes) -> str:
-        ''' Inflect text to match required tags. '''
-        if self.is_valid():
-            origin = self.get_morpho()
-            if not origin.tag.grammemes.issuperset(target_grams):
-                if self._apply_inflection(origin, target_grams):
-                    return self._generate_text()
-        return self.text
-
-    def inflect_like(self, base_model: Collation) -> str:
-        ''' Create inflection to substitute base_model form. '''
-        if self.is_valid():
-            morph = base_model.get_morpho()
-            if morph.effective_POS:
-                tags = set()
-                tags.add(morph.effective_POS)
-                tags = morph.complete_grams(tags)
-                return self.inflect(tags)
-        return self.text
-
-    def inflect_dependant(self, master_model: Collation) -> str:
-        ''' Create inflection to coordinate with master_model form. '''
-        assert self.is_valid()
-        morph = master_model.get_morpho()
-        tags = morph.coordination_grams()
-        tags = self.get_morpho().complete_grams(tags)
-        return self.inflect(tags)
-
-    def normal_form(self) -> str:
-        ''' Generate normal form. '''
-        if self.is_valid():
-            main_form = self.get_form()
-            new_morpho = Morphology(main_form.normalized.tag)
-            new_grams = new_morpho.complete_grams(frozenset())
-            return self.inflect(new_grams)
-        return self.text
-
-    def _iterate_coordinated(self):
-        words_count = len(self.words)
-        current_word = self.coordination[words_count]
-        while current_word != words_count:
-            yield self.words[current_word]
-            current_word += self.coordination[current_word]
-
-    def _inflect_main_word(self, origin: Morphology, target_grams: Grammemes) -> Optional[Morphology]:
-        full_grams = origin.complete_grams(target_grams)
-        inflected = self.words[self.main_word].inflect(full_grams)
-        if not inflected:
-            return None
-        return Morphology(inflected.tag)
-
-    def _apply_inflection(self, origin: Morphology, target_grams: Grammemes) -> bool:
-        new_morpho = self._inflect_main_word(origin, target_grams)
-        if not new_morpho:
-            return False
-        inflection_grams = new_morpho.coordination_grams()
-        if len(inflection_grams) == 0:
-            return True
-        for word in self._iterate_coordinated():
-            word.inflect(inflection_grams)
-        return True
-
-    def _generate_text(self) -> str:
-        current_pos = 0
-        result = ''
-        for token in self.words:
-            if token.segment.start > current_pos:
-                result += self.text[current_pos: token.segment.start]
-            result += token.segment.text
-            current_pos = token.segment.stop
-        if current_pos + 1 < len(self.text):
-            result += self.text[current_pos:]
-        return result
-
-
-class PhraseParser:
-    ''' Russian grammar parser. '''
-    def __init__(self):
-        pass
-
-    def __del__(self):
-        pass
-
-    _FILTER_SCORE = 0.005
-    _SINGLE_SCORE_SEARCH = 0.2
-    _PRIORITY_NONE = NO_COORDINATION
-
-    _MAIN_WAIT_LIMIT = 10  # number of words to process before fixing the main word
-    _MAIN_MAX_FOLLOWERS = 3  # number of words after the main word considered as coordination candidates
-
-    def parse(self, text: str,
-              require_index: int = INDEX_NONE,
-              require_grams: Optional[Grammemes] = None) -> Optional[Collation]:
-        '''
-        Determine morpho tags for input text.
-        ::returns:: Collation for the text or None if no suitable form is available
-        '''
-        segments = list(RuSyntax.tokenize(text))
-        if len(segments) == 0:
-            return None
-        elif len(segments) == 1:
-            return self._parse_single(segments[0], require_index, require_grams)
-        else:
-            return self._parse_multiword(text, segments, require_index, require_grams)
-
-    def normalize(self, text: str):
-        ''' Get normal form for target text. '''
-        processed = self.parse(text)
-        if processed:
-            return processed.normal_form()
-        return text
-
-    def find_substr(self, text: str, sub: str) -> tuple[int, int]:
-        ''' Search for substring position in text regardless of morphology. '''
-        if not text or not sub:
-            return (0, 0)
-        query = [self.normalize(elem.text) for elem in RuSyntax.tokenize(sub)]
-        query_len = len(query)
-        start = 0
-        current_index = 0
-        for token in RuSyntax.tokenize(text):
-            text_word = self.normalize(token.text)
-            if text_word != query[current_index]:
-                current_index = 0
-            else:
-                if current_index == 0:
-                    start = token.start
-                current_index += 1
-                if current_index == query_len:
-                    return (start, token.stop)
-        return (0, 0)
-
-    def inflect_context(self, text: str, before: str = '', after: str = '') -> str:
-        ''' Inflect text in accordance with context before and after. '''
-        target = self.parse(text)
-        if not target:
-            return text
-        target_morpho = target.get_morpho()
-        if not target_morpho or not target_morpho.can_coordinate:
-            return text
-
-        model_after = self.parse(after)
-        model_before = self.parse(before)
-        etalon = PhraseParser._choose_context_etalon(target_morpho, model_before, model_after)
-        if not etalon:
-            return text
-        etalon_morpho = etalon.get_morpho()
-        if not etalon_morpho.can_coordinate:
-            return text
-
-        new_form = PhraseParser._combine_morpho(target_morpho, etalon_morpho.tag)
-        return target.inflect(new_form)
-
-    def inflect_substitute(self, substitute_normal: str, original: str) -> str:
-        ''' Inflect substitute to match original form. '''
-        original_model = self.parse(original)
-        if not original_model:
-            return substitute_normal
-        substitute_model = self.parse(substitute_normal)
-        if not substitute_model:
-            return substitute_normal
-        return substitute_model.inflect_like(original_model)
-
-    def inflect_dependant(self, dependant_normal: str, master: str) -> str:
-        ''' Inflect dependant to coordinate with master text. '''
-        master_model = self.parse(master)
-        if not master_model:
-            return dependant_normal
-        dependant_model = self.parse(dependant_normal)
-        if not dependant_model:
-            return dependant_normal
-        return dependant_model.inflect_dependant(master_model)
-
-    def _parse_single(self, segment, require_index: int, require_grams: Optional[Grammemes]) -> Optional[Collation]:
-        forms = list(self._filtered_parse(segment.text))
-        parse_index = INDEX_NONE
-        if len(forms) == 0 or require_index >= len(forms):
-            return None
-
-        if require_index != INDEX_NONE:
-            tags = forms[require_index].tag
-            if require_grams and not tags.grammemes.issuperset(require_grams):
-                return None
-            parse_index = require_index
-        else:
-            current_score = 0
-            for (index, form) in enumerate(forms):
-                if not require_grams or form.tag.grammemes.issuperset(require_grams):
-                    if form.tag.case == 'nomn':
-                        parse_index = index
-                        break
-                    elif parse_index == INDEX_NONE:
-                        current_score = form.score
-                        parse_index = index
-                    elif form.score / current_score < self._SINGLE_SCORE_SEARCH:
-                        break
-
-        if parse_index == INDEX_NONE:
-            return None
-        result = Collation(segment.text)
-        result.add_word(segment, [forms[parse_index]], main_form=0, need_coordination=False)
-        result.coordination.append(len(result.words))
-        result.main_word = 0
-        return result
-
-    def _parse_multiword(self, text: str, segments: list, require_index: int,
-                         require_grams: Optional[Grammemes]) -> Optional[Collation]:
-        result = Collation(text)
-        priority_main: float = self._PRIORITY_NONE
-        segment_index = 0
-        main_wait = 0
-        word_index = 0
-        for segment in segments:
-            if main_wait > PhraseParser._MAIN_WAIT_LIMIT:
-                break
-            segment_index += 1
-            priority = self._parse_segment(result, segment, require_index, require_grams)
-            if priority is None:
-                continue  # skip non-parsable entities
-            main_wait += 1
-            if priority > priority_main:
-                result.main_word = word_index
-                priority_main = priority
-            word_index += 1
-        if result.main_word == INDEX_NONE:
-            return None
-        self._finalize_coordination(result)
-        if segment_index < len(segments):
-            pass  # TODO: parse remaining segments after the main word if needed
-        return result
-
-    def _parse_segment(self,
-                       output: Collation,
-                       segment: Segment,
-                       require_index: int,
-                       require_grams: Optional[Grammemes]) -> Optional[float]:
-        ''' Return priority for this segment as a candidate for the main word. '''
-        forms = list(self._filtered_parse(segment.text))
-        if len(forms) == 0:
-            return None
-        main_index: int = INDEX_NONE
-        segment_score: float = self._PRIORITY_NONE
-        needs_coordination = False
-        local_sum: float = 0
-        score_sum: float = 0
-        if require_index != INDEX_NONE:
-            form = forms[require_index]
-            if not require_grams or form.tag.grammemes.issuperset(require_grams):
-                (local_max, segment_score) = PhraseParser._get_priorities_for(form.tag)
-                main_index = require_index
-                needs_coordination = Morphology.is_dependable(form.tag.POS)
-        else:
-            local_max = self._PRIORITY_NONE
-            for (index, form) in enumerate(forms):
-                if require_grams and not form.tag.grammemes.issuperset(require_grams):
-                    continue
-                (local_priority, global_priority) = PhraseParser._get_priorities_for(form.tag)
-                needs_coordination = needs_coordination or Morphology.is_dependable(form.tag.POS)
-                local_sum += global_priority * form.score
-                score_sum += form.score
-                if local_priority > local_max:
-                    local_max = local_priority
-                    # segment_score = global_priority
-                    main_index = index
-            if score_sum == 0:
-                return None
-            segment_score = local_sum / score_sum
-        output.add_word(segment, forms, main_index, needs_coordination)
-        return segment_score
-        # Alternative: return segment_score
-        # penalty_suspicious = 0 if local_max == 0 else (1 - local_sum / local_max) * self._PRIORITY_PENALTY
-        # return segment_score - penalty_suspicious
-
-    @classmethod
-    def _finalize_coordination(cls, target: Collation):
-        main_morpho: Morphology = target.get_morpho()
-        main_coordinate = main_morpho.can_coordinate
-        target.coordination[target.main_word] = NO_COORDINATION
-        first_change = INDEX_NONE
-        current_len = 0
-        for (index, word) in enumerate(target.words):
-            if target.coordination[index] == NO_COORDINATION or index - target.main_word > cls._MAIN_MAX_FOLLOWERS:
-                needs_change = False
-                if index != target.main_word:
-                    word.main = INDEX_NONE
-            else:
-                word.main = PhraseParser._find_coordination(word.forms, main_morpho.tag, index < target.main_word)
-                needs_change = word.main != INDEX_NONE
-            if not needs_change or not main_coordinate:
-                target.coordination[index] = NO_COORDINATION
-                current_len += 1
-            if needs_change and main_coordinate:
-                target.coordination[index] = current_len
-                current_len = 0
-                if first_change == INDEX_NONE:
-                    first_change = index
-        if first_change == INDEX_NONE:
-            target.coordination.append(len(target.words))
-            return
-        previous_reference = first_change
-        current_word = len(target.words)
-        target.coordination.append(current_len + 1)
-        while target.coordination[current_word] != INDEX_NONE:
-            previous_word = current_word - target.coordination[current_word]
-            target.coordination[current_word] = previous_reference
-            previous_reference = current_word - previous_word
-            current_word = previous_word
-            if previous_reference == 0 or current_word < 0:
-                break
-
-    @staticmethod
-    def _find_coordination(forms: list, main_tag: WordTag, before_main: bool) -> int:
-        for (index, form) in enumerate(forms):
-            pos = form.tag.POS
-            case = form.tag.case
-            if pos not in ['ADJF', 'ADJS', 'PRTF', 'PRTS']:
-                continue
-            if SemanticRole.from_POS(pos) == SemanticRole.term and case == 'gent':
-                if before_main:
-                    continue
-                else:
-                    return INDEX_NONE
-            if case == main_tag.case:
-                return index
-            elif main_tag.case in ['accs', 'gent'] and case in ['accs', 'gent']:
-                return index
-        return INDEX_NONE
-
-    @staticmethod
-    def _filtered_parse(text: str):
-        capital = Capitalization.from_text(text)
-        score_filter = PhraseParser._filter_score(morpho.parse(text))
-        yield from PhraseParser._filter_capital(score_filter, capital)
-
-    @staticmethod
-    def _filter_score(generator):
-        for form in generator:
-            if form.score < PhraseParser._FILTER_SCORE:
-                break
-            yield form
-
-    @staticmethod
-    def _filter_capital(generator, capital: Capitalization):
-        if capital in [Capitalization.upper_case, Capitalization.mixed]:
-            for form in generator:
-                if 'Abbr' not in form.tag.grammemes:
-                    continue
-                yield form
-        else:
-            yield from generator
-
-    @staticmethod
-    def _parse_word(text: str, require_index: int = INDEX_NONE,
-                    require_grams: Optional[Grammemes] = None) -> Optional[Morphology]:
-        parsed_variants = morpho.parse(text)
-        if not parsed_variants or require_index >= len(parsed_variants):
-            return None
-        if require_index != INDEX_NONE:
-            tags = parsed_variants[require_index].tag
-            if not require_grams or tags.grammemes.issuperset(require_grams):
-                return Morphology(tags)
-            else:
-                return None
-        else:
-            for variant in parsed_variants:
-                tags = variant.tag
-                if not require_grams or tags.grammemes.issuperset(require_grams):
-                    return Morphology(tags)
-        return None
-
-    @staticmethod
-    def _get_priorities_for(tag: WordTag) -> tuple[float, float]:
-        ''' Return pair of local and global priorities. '''
-        if tag.POS in ['VERB', 'INFN']:
-            return (9, 10)
-        if tag.POS in ['NOUN', 'NPRO']:
-            return (10, 9) if 'nomn' in tag.grammemes and 'Fixd' not in tag.grammemes else (8, 8)
-        if tag.POS in ['PRTF', 'PRTS']:
-            return (6, 6)
-        if tag.POS in ['ADJF', 'ADJS']:
-            return (5, 5)
-        if tag.POS == 'ADVB':
-            return (7, 4)
-        return (0, 0)
-
-    @staticmethod
-    def _choose_context_etalon(target: Morphology,
-                               before: Optional[Collation],
-                               after: Optional[Collation]) -> Optional[Collation]:
-        if not before or not before.get_morpho().can_coordinate:
-            return after
-        if not after or not after.get_morpho().can_coordinate:
-            return before
-
-        before_semantic = before.get_morpho().semantic
-        after_semantic = after.get_morpho().semantic
-        if target.semantic == SemanticRole.definition:
-            if after_semantic == SemanticRole.term:
-                return after
-            if before_semantic == SemanticRole.term:
-                return before
-            if before_semantic == SemanticRole.definition:
-                return before
-            return after
-
-        if target.semantic == SemanticRole.term:
-            if before_semantic == SemanticRole.definition:
-                return before
-            if after_semantic == SemanticRole.definition:
-                return after
-
-        return before
-
-    @staticmethod
-    def _combine_morpho(target: Morphology, etalon: WordTag) -> frozenset[str]:
-        part_of_speech = target.tag.POS
-        number = etalon.number
-        if number == 'plur':
-            return frozenset([part_of_speech, number, etalon.case])
-        else:
-            gender = etalon.gender if target.semantic != SemanticRole.term else target.tag.gender
-            return frozenset([part_of_speech, number, gender, etalon.case])
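PhraseParser picks a main word by part-of-speech priority and then coordinates dependant adjectives/participles with it; a usage sketch (sample phrases are illustrative):

    from cctext import PhraseParser

    parser = PhraseParser()
    # substitute 'синяя книга' for 'красной книгой', keeping the original's form
    parser.inflect_substitute('синяя книга', 'красной книгой')    # expected: 'синей книгой'
    parser.find_substr('речь о красных книгах', 'красная книга')  # expected: (7, 21) -- match ignores inflection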
diff --git a/import/cctext/syntax.py b/import/cctext/syntax.py
deleted file mode 100644
index 012b396..0000000
--- a/import/cctext/syntax.py
+++ /dev/null
@@ -1,87 +0,0 @@
-''' Russian language syntax encapsulation. '''
-from __future__ import annotations
-from enum import Enum, unique
-
-from razdel import tokenize
-
-
-@unique
-class Capitalization(Enum):
-    ''' Enumerating capitalization types. '''
-    unknwn = 0
-    lower_case = 1
-    upper_case = 2
-    first_capital = 3
-    mixed = 4
-
-    @staticmethod
-    def from_text(text: str) -> Capitalization:
-        ''' Factory method to identify capitalization in text. '''
-        if len(text) == 0:
-            return Capitalization.unknwn
-        first_capital = Capitalization._is_capital(text[0])
-        has_mid_capital = False
-        has_lower = not first_capital
-        for symbol in text[1:]:
-            if Capitalization._is_capital(symbol):
-                if has_lower:
-                    return Capitalization.mixed
-                has_mid_capital = True
-            else:
-                if has_mid_capital:
-                    return Capitalization.mixed
-                else:
-                    has_lower = True
-        if has_mid_capital:
-            return Capitalization.upper_case
-        elif first_capital:
-            return Capitalization.first_capital
-        else:
-            return Capitalization.lower_case
-
-    def apply_to(self, text: str) -> str:
-        ''' Apply capitalization to text. '''
-        if not text or self in [Capitalization.unknwn, Capitalization.mixed]:
-            return text
-        elif self == Capitalization.lower_case:
-            return text.lower()
-        elif self == Capitalization.upper_case:
-            return text.upper()
-        else:
-            return text[0].upper() + text[1:]
-
-    @staticmethod
-    def _is_capital(symbol: str) -> bool:
-        return 'А' <= symbol <= 'Я' or 'A' <= symbol <= 'Z'
-
-
-class RuSyntax:
-    ''' Russian language syntax parser. '''
-    def __init__(self):
-        pass
-
-    def __del__(self):
-        pass
-
-    @staticmethod
-    def is_single_word(text: str) -> bool:
-        ''' Test if text is a single word. '''
-        try:
-            gen = tokenize(text)
-            if next(gen) == '':
-                return True
-            if next(gen) == '':
-                return True
-            return False
-        except StopIteration:
-            return True
-
-    @staticmethod
-    def tokenize(text: str):
-        ''' Split text into tokens. Returns segments with (start, stop, text). '''
-        return tokenize(text)
-
-    @staticmethod
-    def split_words(text: str) -> list[str]:
-        ''' Split text into words. '''
-        return [elem.text for elem in tokenize(text)]
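Capitalization.from_text() classifies a token and apply_to() re-applies that style after inflection, which is how WordToken.inflect() preserves case; a short sketch:

    from cctext import Capitalization

    style = Capitalization.from_text('Книга')  # Capitalization.first_capital
    style.apply_to('книгам')                   # 'Книгам'
    Capitalization.from_text('ГОСТ')           # upper_case -- only 'Abbr' parses pass the filter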
diff --git a/script/ExteorSetup_x64.iss b/script/ExteorSetup_x64.iss
index 6eeacf8..d8a2385 100644
--- a/script/ExteorSetup_x64.iss
+++ b/script/ExteorSetup_x64.iss
@@ -60,11 +60,10 @@ Name: quicklaunchicon; Description: "{cm:CreateQuickLaunchIcon}"; GroupDescripti
[Files]
Source: "..\bin\x64\Exteor.exe"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\import\cctext\*"; DestDir: "{app}\cctext"; Flags: ignoreversion
Source: "app\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs
Source: "..\distr\app\DejaVu Sans.ttf"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\script\PymorphyInstall.bat"; DestDir: {tmp}; Flags: deleteafterinstall
+Source: "..\script\installDependencies.bat"; DestDir: {tmp}; Flags: deleteafterinstall
[Icons]
Name: "{group}\{#ExteorName}"; Filename: "{app}\{#ExteorEXE}"
@@ -101,7 +100,7 @@ Filename: "{app}\{#ExteorEXE}"; Description: "{cm:LaunchProgram,{#StringChange(E
Filename: "{app}\Документация\README.rtf"; Description: "{cm:ReadMe}"; Flags: nowait postinstall skipifsilent unchecked shellexec
Filename: "{tmp}\{#VSRedist}"; Parameters: "/install /quiet /NORESTART"; StatusMsg: {cm:CppRedist}; Check: VCRedistNeedsInstall()
Filename: "{tmp}\{#PythonRedist}"; Parameters: "/quiet InstallAllUsers=1 PrependPath=1 Include_test=0"; StatusMsg: {cm:PythonInstall}; Check: PythonNeedsInstall()
-Filename: "{tmp}\PymorphyInstall.bat"; Parameters: """{code:PythonPath}"""
+Filename: "{tmp}\installDependencies.bat"; Parameters: """{code:PythonPath}"""
[Code]
function IsPythonMissing: Boolean; forward;
diff --git a/script/ExteorSetup_x86.iss b/script/ExteorSetup_x86.iss
index 82aabbd..efc10be 100644
--- a/script/ExteorSetup_x86.iss
+++ b/script/ExteorSetup_x86.iss
@@ -60,11 +60,10 @@ Name: quicklaunchicon; Description: "{cm:CreateQuickLaunchIcon}"; GroupDescripti
[Files]
Source: "..\bin\x64\Exteor.exe"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\import\cctext\*"; DestDir: "{app}\cctext"; Flags: ignoreversion
Source: "app\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs
Source: "..\distr\app\DejaVu Sans.ttf"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\script\PymorphyInstall.bat"; DestDir: {tmp}; Flags: deleteafterinstall
+Source: "..\script\installDependencies.bat"; DestDir: {tmp}; Flags: deleteafterinstall
[Icons]
Name: "{group}\{#ExteorName}"; Filename: "{app}\{#ExteorEXE}"
@@ -101,7 +100,7 @@ Filename: "{app}\{#ExteorEXE}"; Description: "{cm:LaunchProgram,{#StringChange(E
Filename: "{app}\Документация\README.rtf"; Description: "{cm:ReadMe}"; Flags: nowait postinstall skipifsilent unchecked shellexec
Filename: "{tmp}\{#VSRedist}"; Parameters: "/install /quiet /NORESTART"; StatusMsg: {cm:CppRedist}; Check: VCRedistNeedsInstall()
Filename: "{tmp}\{#PythonRedist}"; Parameters: "/quiet InstallAllUsers=1 PrependPath=1 Include_test=0"; StatusMsg: {cm:PythonInstall}; Check: PythonNeedsInstall()
-Filename: "{tmp}\PymorphyInstall.bat"; Parameters: """{code:PythonPath}"""
+Filename: "{tmp}\installDependencies.bat"; Parameters: """{code:PythonPath}"""
[Code]
function IsPythonMissing: Boolean; forward;
diff --git a/script/PymorphyInstall.bat b/script/installDependencies.bat
similarity index 66%
rename from script/PymorphyInstall.bat
rename to script/installDependencies.bat
index f14f9c2..55932f1 100644
--- a/script/PymorphyInstall.bat
+++ b/script/installDependencies.bat
@@ -7,8 +7,7 @@ PING -n 1 www.google.com > nul && (goto :SUCCESS) || (goto :FAILURE)
:SUCCESS
@echo off
-"%python3Path%Scripts\pip.exe" install razdel || (goto :FAILURE)
-"%python3Path%Scripts\pip.exe" install pymorphy3 || (goto :FAILURE)
+"%python3Path%Scripts\pip.exe" install cctext || (goto :FAILURE)
goto :END
:FAILURE
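Note: the renamed script installs a single cctext distribution instead of razdel and pymorphy3 individually, so it relies on that package declaring them as dependencies. A quick check from the target interpreter (assumes the cctext distribution publishes dependency metadata):

    from importlib.metadata import requires, version

    print(version('cctext'))   # installed package version
    print(requires('cctext'))  # expected to list razdel and pymorphy3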
diff --git a/test/XTRCoreTest.vcxproj b/test/XTRCoreTest.vcxproj
index 4a46958..5dcf90b 100644
--- a/test/XTRCoreTest.vcxproj
+++ b/test/XTRCoreTest.vcxproj
@@ -154,7 +154,7 @@
Level4
true
$(IntDir)obj\
- Mock;./;..\include;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+ Mock;./;..\include;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
stdcpplatest
false
@@ -163,12 +163,11 @@
Console
ConceptCoreLibraryd.lib;RSlangd.lib;cclGraphd.lib;cclLangd.lib;iphlpapi.lib;%(AdditionalDependencies)
false
- C:\Program Files (x86)\Python312-32\libs;..\..\ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
+ C:\Program Files (x86)\Python312-32\libs;..\..\GH-ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
/ignore:4099 %(AdditionalOptions)
- xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
copy "C:\Program Files (x86)\Python312-32\Python312_d.dll" "$(OutDir)"
copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
@@ -184,7 +183,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
Level4
true
$(IntDir)obj\
- Mock;./;..\include;..\header;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+ Mock;./;..\include;..\header;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
stdcpplatest
false
@@ -193,12 +192,11 @@ copy "C:\Program Files (x86)\Python312-32\Python312_d.pdb" "$(OutDir)"
Console
ConceptCoreLibraryd.lib;RSlangd.lib;cclGraphd.lib;cclLangd.lib;iphlpapi.lib;%(AdditionalDependencies)
false
- C:\Program Files\Python312\libs;..\..\ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
+ C:\Program Files\Python312\libs;..\..\GH-ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
/ignore:4099 %(AdditionalOptions)
- xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
@@ -212,7 +210,7 @@ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
false
true
$(IntDir)obj\
- Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
+ Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files (x86)\Python312-32\include;%(AdditionalIncludeDirectories)
stdcpplatest
false
@@ -223,12 +221,11 @@ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
true
ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;iphlpapi.lib;%(AdditionalDependencies)
UseLinkTimeCodeGeneration
- C:\Program Files (x86)\Python312-32\libs;..\..\ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
+ C:\Program Files (x86)\Python312-32\libs;..\..\GH-ConceptCore\output\lib\x86;..\..\OfficeOLE\output\lib\x86;%(AdditionalLibraryDirectories)
/ignore:4099 %(AdditionalOptions)
- xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+ copy "Data\sample_module.py" "$(OutDir)sample_module.py"
copy "C:\Program Files (x86)\Python312-32\Python312.dll" "$(OutDir)"
copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
@@ -244,7 +241,7 @@ copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
false
true
$(IntDir)obj\
- Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
+ Mock;./;..\include;..\header;..\import\include;..\import\include;..\..\GH-ConceptCore\output\include;..\..\OfficeOLE\include;C:\Program Files\Python312\include;%(AdditionalIncludeDirectories)
stdcpplatest
false
@@ -256,12 +253,11 @@ copy "C:\Program Files (x86)\Python312-32\Python312.pdb" "$(OutDir)"
ConceptCoreLibrary.lib;RSlang.lib;cclGraph.lib;cclLang.lib;iphlpapi.lib;%(AdditionalDependencies)
UseLinkTimeCodeGeneration
false
- C:\Program Files\Python312\libs;..\..\ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
+ C:\Program Files\Python312\libs;..\..\GH-ConceptCore\output\lib\x64;..\..\OfficeOLE\output\lib\x64;%(AdditionalLibraryDirectories)
/ignore:4099 %(AdditionalOptions)
- xcopy /y /s /q /i "..\import\cctext" "$(OutDir)\cctext\"
-copy "Data\sample_module.py" "$(OutDir)sample_module.py"
+ copy "Data\sample_module.py" "$(OutDir)sample_module.py"