76 lines
2.7 KiB
Python
76 lines
2.7 KiB
Python
'''Processing docx files'''
|
||
import os
|
||
import subprocess
|
||
import docx
|
||
from docx.shared import Pt
|
||
|
||
|
||
def _strip_stress_(text: str) -> str:
|
||
''' Удаляет с текста ударения text: текст '''
|
||
a = bytes(text, encoding='utf8').split(b"\xcc\x81")
|
||
res = ""
|
||
for i in a:
|
||
res += str(i, encoding='utf-8')
|
||
return res
|
||
|
||
|
||
class DocxTextProcessor:
    '''Extract body text, authors and bibliography from a .docx document.

    Attributes:
        text: body text produced by the last successful extraction.
        authors: text that precedes the document title, when the title is
            found near the beginning of the body.
        bibliography: text of every run starting from the first run that
            contains "Лит".
    '''

    # Combining acute accent (U+0301): the stress mark that _strip_stress_
    # removes before titles are compared.
    _STRESS_MARK = '\u0301'
    # Number of leading characters of the body searched for the title.
    _TITLE_SEARCH_WINDOW = 150

    def __init__(self, file_name=None) -> None:
        '''Create the processor and optionally process a document.

        file_name: path of the .docx file to read. When it is not None the
            document is loaded and parsed immediately; otherwise an empty
            processor is created.
        '''
        self._docx = None
        self.text = ''
        self.authors = ''
        self.bibliography = ''
        self._name = ''
        if file_name is not None:
            self.process_document(file_name)

    def process_document(self, file_name: str) -> bool:
        '''Load ``file_name`` and extract its text into ``self.text``.

        Returns True when extraction succeeded and False when it raised.
        Errors raised while opening the document are not caught here and
        propagate to the caller.
        '''
        self._docx = docx.Document(file_name)
        # basename understands the platform's path separators; the name is
        # then cut at the first ".docx", exactly as before.
        self._name = os.path.basename(file_name).split('.docx')[0]
        try:
            self.text = self.get_text__()
        except Exception:  # pylint: disable=broad-except
            # Best effort: report failure to the caller, but let
            # KeyboardInterrupt / SystemExit through.
            return False
        return True

    @classmethod
    def _original_index(cls, text: str, stripped_index: int) -> int:
        '''Map an index in stress-stripped ``text`` back to ``text`` itself.

        Returns the position in ``text`` of the character that sits at
        ``stripped_index`` once stress marks are removed, or ``len(text)``
        when there is no such character.
        '''
        seen = 0
        for position, char in enumerate(text):
            if char == cls._STRESS_MARK:
                continue
            if seen == stripped_index:
                return position
            seen += 1
        return len(text)

    def get_text__(self):
        '''Return the body text and fill ``authors`` and ``bibliography``.

        Paragraphs whose first run has an explicit font size of 15 pt or
        more are skipped. A run whose stress-stripped text equals the file
        name is treated as the title and re-cased (first character kept,
        the rest lower-cased). From the first run containing "Лит" onwards
        every run goes to ``self.bibliography`` instead of the body.
        If the title is found after position 2 within the first 150
        characters, the text before it (minus one separator character) is
        stored in ``self.authors`` and removed from the returned body.
        '''
        size_limit = Pt(15)
        name = self._name
        # Start clean so repeated calls do not accumulate old results.
        self.authors = ''
        self.bibliography = ''
        body_parts = []
        bibliography_parts = []
        in_bibliography = False
        for paragraph in self._docx.paragraphs:
            runs = paragraph.runs
            if not runs:
                # Empty paragraph: nothing to read and no font to inspect.
                continue
            size = runs[0].font.size
            # A size of None means the run inherits its size; such
            # paragraphs are kept as body text (assumption - previously
            # they aborted the whole extraction with a TypeError).
            if size is not None and size >= size_limit:
                continue
            for run in runs:
                if name == _strip_stress_(run.text):
                    body_parts.append(run.text[:1] + run.text[1:].lower())
                    continue
                if in_bibliography or "Лит" in run.text:
                    in_bibliography = True
                    bibliography_parts.append(run.text)
                    continue
                body_parts.append(run.text)
            body_parts.append('\n')
        self.bibliography = ''.join(bibliography_parts)

        res = ''.join(body_parts)
        head = res[:self._TITLE_SEARCH_WINDOW]
        probe = _strip_stress_(head).lower()
        title_pos = probe.find(name.lower())
        if title_pos > 2:
            # title_pos is an offset in the stress-stripped text; convert
            # it back before slicing the unstripped body. This assumes
            # lower() keeps the string length, which holds for Cyrillic
            # and basic Latin text.
            start = self._original_index(head, title_pos)
            self.authors = res[:start - 1]
            res = res[start:]
        return res

    def process_typograph(self):
        '''Run ``self.text`` through the external Node.js typograph script.

        The text is written to the file "test", ``node
        process_typograph.js`` is executed, and "temp.log" is read back
        into ``self.text`` (presumably the script reads "test" and writes
        "temp.log" - the script is not part of this file). Both files are
        removed afterwards, even on failure.

        Raises subprocess.CalledProcessError when the script exits with a
        non-zero status.
        '''
        try:
            with open('test', 'wb') as file:
                file.write(self.text.encode('utf-8'))
            # An argument list is required without shell=True: on POSIX a
            # single string would be taken as the executable's name.
            subprocess.run(['node', 'process_typograph.js'],
                           capture_output=True, check=True)
            with open('temp.log', 'r', encoding='utf8') as file:
                self.text = file.read()
        finally:
            for leftover in ('test', 'temp.log'):
                if os.path.exists(leftover):
                    os.remove(leftover)
|