'''Processing docx files'''
import os
import subprocess

import docx
from docx.shared import Pt

# Combining acute accent (U+0301); its UTF-8 encoding is b"\xcc\x81".
_STRESS_MARK = '\u0301'

# Fixed file names used around the ``node process_typograph.js`` call.
# NOTE(review): presumably the script reads the first and writes the second;
# confirm against process_typograph.js.
_TYPOGRAPH_INPUT = 'test'
_TYPOGRAPH_OUTPUT = 'temp.log'


def _strip_stress_(text: str) -> str:
    '''Return ``text`` with every stress mark (U+0301) removed.

    text: the text to clean
    '''
    return text.replace(_STRESS_MARK, '')


def _unstripped_index(text: str, stripped_index: int) -> int:
    '''Map an index in ``_strip_stress_(text)`` back to an index in ``text``.

    Returns the position in ``text`` of the character that ends up at
    ``stripped_index`` once the stress marks are removed, or ``len(text)``
    when ``stripped_index`` lies past the end of the stripped text.
    '''
    kept = 0
    for position, char in enumerate(text):
        if char == _STRESS_MARK:
            continue
        if kept == stripped_index:
            return position
        kept += 1
    return len(text)


class DocxTextProcessor:
    '''Document processor class'''

    def __init__(self, file_name=None) -> None:
        '''
        file_name: name of the file to read (optional)

        When ``file_name`` is given, the constructor immediately loads the
        document and parses it into text, separating the bibliography and
        the authors.
        '''
        self._docx = None        # loaded docx document, or None before loading
        self.text = ''           # extracted body text
        self.authors = ''        # text found before the title, if any
        self.bibliography = ''   # text collected from the bibliography part
        self._name = ''          # file name without directory and ".docx"
        if file_name is not None:
            self.process_document(file_name)

    def process_document(self, file_name: str) -> bool:
        '''Load ``file_name`` and extract its text into ``self.text``.

        Returns True when the text was extracted and False when extraction
        raised an error.  Errors from opening the document itself are not
        caught and propagate to the caller.
        '''
        self._docx = docx.Document(file_name)
        self._name = os.path.basename(file_name).split('.docx')[0]
        try:
            self.text = self.get_text__()
        except Exception:  # pylint: disable=broad-except
            # Best effort: any parsing problem is reported as a failure.
            return False
        return True

    def get_text__(self):
        '''Extract the text of the loaded document.

        Only paragraphs whose first run has a font size below 15pt are used;
        paragraphs without runs are skipped.  Within the used paragraphs:

        * a run equal to the document name (ignoring stress marks) is kept
          with everything after its first character lower-cased;
        * from the first run containing "Лит" onwards, every run goes to
          ``self.bibliography`` instead of the returned text;
        * every other run is appended to the returned text.

        If the document name occurs after the first few characters of the
        text, whatever precedes it is stored in ``self.authors`` and removed
        from the result.  ``self.authors`` and ``self.bibliography`` are
        reset on every call.

        Raises TypeError when the first run of a paragraph has no explicit
        font size (``font.size`` is None).
        '''
        name = self._name
        body_parts = []
        bib_parts = []
        is_bib_part = False

        for paragraph in self._docx.paragraphs:
            runs = paragraph.runs
            if not runs:
                # Nothing to inspect and no text to take from this paragraph.
                continue
            if not runs[0].font.size < Pt(15):
                continue
            for run in runs:
                run_text = run.text
                if name == _strip_stress_(run_text):
                    body_parts.append(run_text[:1] + run_text[1:].lower())
                elif "Лит" in run_text or is_bib_part:
                    is_bib_part = True
                    bib_parts.append(run_text)
                else:
                    body_parts.append(run_text)
            body_parts.append('\n')

        res = ''.join(body_parts)
        self.bibliography = ''.join(bib_parts)
        self.authors = ''

        # The title is searched for in the stress-free, lower-cased head of
        # the text; the hit is then mapped back to the unstripped text so
        # that stress marks before the title do not shift the cut.
        head = res[:150]
        needle = name.lower().split('/')[-1]
        found = _strip_stress_(head).lower().find(needle)
        if found > 2:
            cut = _unstripped_index(head, found)
            # The character right before the title is dropped from authors.
            self.authors = res[:cut - 1]
            res = res[cut:]
        return res

    def process_typograph(self):
        '''Run ``self.text`` through the external typograph script.

        Writes the text to the file "test" as UTF-8, runs
        ``node process_typograph.js`` in the current working directory and
        replaces ``self.text`` with the content of "temp.log".  Both files
        are removed afterwards, also when the script or the read fails.

        Raises subprocess.CalledProcessError when node exits with a non-zero
        status, and OSError subclasses when node or the files are missing.
        '''
        try:
            with open(_TYPOGRAPH_INPUT, 'wb') as file:
                file.write(self.text.encode('utf-8'))
            # An argument list works on every platform; a single command
            # string without shell=True is treated as one executable name
            # on POSIX.
            subprocess.run(
                ['node', 'process_typograph.js'],
                capture_output=True,
                check=True,
            )
            with open(_TYPOGRAPH_OUTPUT, 'r', encoding='utf8') as file:
                self.text = file.read()
        finally:
            for leftover in (_TYPOGRAPH_INPUT, _TYPOGRAPH_OUTPUT):
                if os.path.exists(leftover):
                    os.remove(leftover)