76 lines
2.7 KiB
Python
76 lines
2.7 KiB
Python
![]() |
'''Processing docx files'''
|
|||
|
import os
|
|||
|
import subprocess
|
|||
|
import docx
|
|||
|
from docx.shared import Pt
|
|||
|
|
|||
|
|
|||
|
def _strip_stress_(text: str) -> str:
|
|||
|
''' Удаляет с текста ударения text: текст '''
|
|||
|
a = bytes(text, encoding='utf8').split(b"\xcc\x81")
|
|||
|
res = ""
|
|||
|
for i in a:
|
|||
|
res += str(i, encoding='utf-8')
|
|||
|
return res
|
|||
|
|
|||
|
|
|||
|
class DocxTextProcessor:
    '''Document processor class.

    Opens a .docx file and splits its contents into three attributes:
    ``text`` (the body), ``authors`` (whatever precedes the document name
    near the start of the text) and ``bibliography``.
    '''

    # Code point removed by _strip_stress_ (COMBINING ACUTE ACCENT).
    _STRESS_MARK = '\u0301'

    def __init__(self, file_name=None) -> None:
        '''
        file_name: name of the file to read

        When a file name is given, the constructor opens the document and
        parses it into text right away, separating the bibliography and the
        authors (see process_document).
        '''
        self._docx = None
        self.text = ''
        self.authors = ''
        self.bibliography = ''
        self._name = ''
        if file_name is not None:
            self.process_document(file_name)

    def process_document(self, file_name: str) -> bool:
        '''Open ``file_name`` and parse it into ``self.text``.

        file_name: path to the .docx file; its base name up to the first
        ``.docx`` is stored as the document name.

        Returns True when the text was extracted and False when parsing
        failed (``text``, ``authors`` and ``bibliography`` are left untouched
        in that case). Errors raised while opening the file propagate to the
        caller.
        '''
        self._docx = docx.Document(file_name)
        # os.path.basename honours the platform's path separators, unlike a
        # plain split on '/'.
        self._name = os.path.basename(file_name).split('.docx')[0]
        try:
            self.text = self.get_text__()
        # Parsing stays best-effort, but KeyboardInterrupt / SystemExit are
        # no longer swallowed the way a bare ``except:`` would.
        except Exception:  # pylint: disable=broad-except
            return False
        return True

    @staticmethod
    def _unstripped_index(text: str, stripped_index: int) -> int:
        '''Map an index in stress-stripped ``text`` back to ``text`` itself.

        Returns the position in ``text`` of the character that sits at
        ``stripped_index`` once all stress marks are removed, or
        ``len(text)`` when there is no such character.
        '''
        kept = 0
        for position, char in enumerate(text):
            if char == DocxTextProcessor._STRESS_MARK:
                continue
            if kept == stripped_index:
                return position
            kept += 1
        return len(text)

    def get_text__(self):
        '''Extract the body text from the loaded document.

        Paragraphs whose first run has an explicit font size of 15pt or more
        are skipped, as are paragraphs without any runs. Within the kept
        paragraphs:

        * a run equal to the document name (stress marks ignored) is added
          with everything after its first character lower-cased;
        * from the first run containing "Лит" onwards, every run goes to the
          bibliography instead of the body;
        * every other run is added to the body unchanged.

        If the document name is found within the first 150 characters of
        the body (case and stress marks ignored) at offset 3 or later, the
        part before it becomes ``self.authors`` and the body starts at the
        name.

        Side effects: replaces ``self.bibliography`` and ``self.authors``
        with the values found in this document, so parsing the same
        processor twice does not accumulate stale data.

        Returns the body text, with a newline after each kept paragraph.
        '''
        name = self._name
        size_limit = Pt(15)
        body_parts = []
        bibliography_parts = []
        in_bibliography = False

        for paragraph in self._docx.paragraphs:
            runs = paragraph.runs
            if not runs:
                # Empty paragraph: there is no first run to inspect and no
                # text to copy.
                continue
            size = runs[0].font.size
            # font.size is None when the run has no explicit size of its
            # own; such paragraphs are treated as body text.
            # NOTE(review): confirm that inherited-size paragraphs are never
            # large headings in the documents this is used on.
            if size is not None and size >= size_limit:
                continue
            for run in runs:
                run_text = run.text
                if name == _strip_stress_(run_text):
                    body_parts.append(run_text[:1] + run_text[1:].lower())
                elif in_bibliography or "Лит" in run_text:
                    in_bibliography = True
                    bibliography_parts.append(run_text)
                else:
                    body_parts.append(run_text)
            body_parts.append('\n')

        text = ''.join(body_parts)
        authors = ''

        head = _strip_stress_(text[:150]).lower()
        # One needle for both the test and the slicing.
        needle = name.lower().split('/')[-1]
        found_at = head.find(needle)
        if found_at > 2:
            # ``found_at`` indexes the stress-stripped head; translate it
            # back so stress marks before the name do not shift the cut.
            start = self._unstripped_index(text, found_at)
            # The character right before the name is dropped from the
            # authors (presumably the separating newline).
            authors = text[:start - 1]
            text = text[start:]

        self.bibliography = ''.join(bibliography_parts)
        self.authors = authors
        return text

    def process_typograph(self):
        '''Run the text through the typograph script.

        Writes ``self.text`` (UTF-8) to the file ``test``, runs
        ``node process_typograph.js`` and replaces ``self.text`` with the
        contents of ``temp.log`` (presumably the script reads the former
        and writes the latter; verify against process_typograph.js).
        Both files are removed afterwards, even when a step fails.

        Raises subprocess.CalledProcessError when the script exits with a
        non-zero status, and OSError when node cannot be started or a file
        cannot be accessed.
        '''
        try:
            with open('test', 'wb') as file:
                file.write(self.text.encode('utf-8'))
            # Pass an argument list: without shell=True a command string is
            # taken as the name of a single program on POSIX.
            subprocess.run(
                ['node', 'process_typograph.js'],
                capture_output=True,
                check=True,
            )
            with open('temp.log', 'r', encoding='utf8') as file:
                self.text = file.read()
        finally:
            for leftover in ('test', 'temp.log'):
                try:
                    os.remove(leftover)
                except FileNotFoundError:
                    # Nothing to clean up if the step that creates the file
                    # never ran.
                    pass
|