BRE/webapi/portal/document.py

76 lines
2.7 KiB
Python
Raw Normal View History

2024-06-07 19:50:21 +03:00
'''Processing docx files'''
import os
import subprocess
import docx
from docx.shared import Pt
def _strip_stress_(text: str) -> str:
''' Удаляет с текста ударения text: текст '''
a = bytes(text, encoding='utf8').split(b"\xcc\x81")
res = ""
for i in a:
res += str(i, encoding='utf-8')
return res
class DocxTextProcessor:
    '''Document processor class.

    Loads a .docx file and splits its contents into three strings:
    ``text`` (the body), ``authors`` and ``bibliography``.
    '''

    def __init__(self, file_name=None) -> None:
        '''Create the processor and optionally parse a document right away.

        file_name: name of the file to read. When given, the document is
            loaded and parsed immediately via ``process_document``; when
            ``None`` the processor starts with empty fields.
        '''
        self._docx = None  # object returned by docx.Document()
        self.text = ''
        self.authors = ''
        self.bibliography = ''
        self._name = ''  # file name without directories and '.docx'
        if file_name is not None:
            self.process_document(file_name)

    def process_document(self, file_name: str) -> bool:
        '''Load ``file_name`` and extract its text into this instance.

        file_name: path to the .docx file; the part after the last ``/``
            and before ``.docx`` is remembered as the document name.

        Returns True when extraction succeeded and False when
        ``get_text__`` raised (the previously stored fields are kept in
        that case). Errors raised by ``docx.Document`` while opening the
        file are not caught here.
        '''
        self._docx = docx.Document(file_name)
        self._name = file_name.split('/')[-1].split('.docx')[0]
        try:
            self.text = self.get_text__()
        # Best-effort by design, but do not swallow KeyboardInterrupt or
        # SystemExit the way a bare ``except`` would.
        except Exception:  # pylint: disable=broad-except
            return False
        return True

    def get_text__(self) -> str:
        '''Extract the body text; also fills ``authors`` and ``bibliography``.

        Heuristics applied to the loaded document:

        * a paragraph whose first run has a font size of 15pt or more is
          skipped;
        * a run whose stress-stripped text equals the document name is
          lower-cased except for its first character;
        * from the first run containing "Лит" onwards, runs are collected
          into the bibliography instead of the body;
        * if the document name is found (stress- and case-insensitively)
          past position 2 of the first 150 characters, everything before
          it becomes ``authors`` and is removed from the returned text.

        ``authors`` and ``bibliography`` are replaced, not appended to, so
        the instance can be reused for another document.
        '''
        name = self._name
        heading_size = Pt(15)
        body_parts = []
        bib_parts = []
        is_bib_part = False

        for paragraph in self._docx.paragraphs:
            runs = paragraph.runs
            if not runs:
                # Empty paragraph: there is no first run to inspect.
                continue
            size = runs[0].font.size
            # NOTE(review): a size of None (not set on the run) is treated
            # as body text instead of failing the comparison - confirm
            # this matches the documents being processed.
            if size is not None and size >= heading_size:
                continue
            for run in runs:
                run_text = run.text
                if name == _strip_stress_(run_text):
                    body_parts.append(run_text[:1] + run_text[1:].lower())
                elif is_bib_part or "Лит" in run_text:
                    is_bib_part = True
                    bib_parts.append(run_text)
                else:
                    body_parts.append(run_text)
            body_parts.append('\n')

        res = ''.join(body_parts)

        # NOTE(review): the offset is found in a stress-stripped,
        # lower-cased copy but applied to the unmodified text; the two only
        # line up when no stress marks occur before the name.
        probe = _strip_stress_(res[:150]).lower()
        name_pos = probe.find(name.lower())
        authors = ''
        if name_pos > 2:
            # Drop the single character that separates authors and name.
            authors = res[:name_pos - 1]
            res = res[name_pos:]

        self.authors = authors
        self.bibliography = ''.join(bib_parts)
        return res

    def process_typograph(self):
        '''Run ``self.text`` through the external typograph script.

        Writes ``self.text`` as UTF-8 to the file ``test`` in the current
        working directory, runs ``node process_typograph.js`` and replaces
        ``self.text`` with the contents of ``temp.log``. Presumably the
        script reads ``test`` and writes ``temp.log``; it is not part of
        this file. Both scratch files are removed afterwards, also when a
        step fails.

        Raises subprocess.CalledProcessError when the script exits with a
        non-zero status, and FileNotFoundError when ``node`` cannot be
        started or ``temp.log`` was not produced.
        '''
        try:
            with open('test', 'wb') as file:
                file.write(self.text.encode('utf-8'))
            # An argument list is required: without shell=True a single
            # string is taken as the program name on POSIX.
            cmd = ['node', 'process_typograph.js']
            subprocess.run(cmd, capture_output=True, check=True)
            with open('temp.log', 'r', encoding='utf8') as file:
                self.text = file.read()
        finally:
            for scratch in ('test', 'temp.log'):
                try:
                    os.remove(scratch)
                except FileNotFoundError:
                    pass