BRE/webapi/portal/document.py
2024-06-07 19:50:21 +03:00

76 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

'''Processing docx files'''
import os
import subprocess
import docx
from docx.shared import Pt
def _strip_stress_(text: str) -> str:
''' Удаляет с текста ударения text: текст '''
a = bytes(text, encoding='utf8').split(b"\xcc\x81")
res = ""
for i in a:
res += str(i, encoding='utf-8')
return res
class DocxTextProcessor:
    '''Extract article text, authors and bibliography from a .docx file.'''

    # Combining acute accent, the stress mark removed by _strip_stress_.
    _STRESS_MARK = "\u0301"

    def __init__(self, file_name=None) -> None:
        '''Create a processor and, when *file_name* is given, parse that file.

        file_name: path of the .docx file to read; with ``None`` nothing is
            loaded and all text attributes stay empty.
        '''
        self._docx = None
        self.text = ''
        self.authors = ''
        self.bibliography = ''
        self._name = ''
        if file_name is not None:
            self.process_document(file_name)

    def process_document(self, file_name: str) -> bool:
        '''Load *file_name* and store its extracted text in ``self.text``.

        file_name: path of the .docx file.

        Returns ``True`` on success and ``False`` when text extraction
        failed. Errors raised by ``docx.Document`` while opening the file
        are not caught here and propagate to the caller.
        '''
        self._docx = docx.Document(file_name)
        # The article name is the file name up to the first ".docx".
        self._name = os.path.basename(file_name).split('.docx')[0]
        try:
            self.text = self.get_text__()
        except Exception:  # pylint: disable=broad-except
            # Extraction is best-effort: any parsing problem is reported as
            # False, but KeyboardInterrupt/SystemExit are no longer swallowed.
            return False
        return True

    def get_text__(self):
        '''Build the article text from the loaded document.

        Only paragraphs whose first run has a font size below 15 pt are
        read. Within them, a run equal to the article name (ignoring stress
        marks) is kept with everything after its first character
        lower-cased; from the first run containing "Лит" onwards, runs go to
        ``self.bibliography`` instead of the text. If the article name is
        found past index 2 within the first 150 characters, the text before
        it is stored in ``self.authors`` and removed from the result.

        Returns the extracted text. ``self.authors`` and
        ``self.bibliography`` are reset on every call.
        '''
        title = self._name
        # Start from a clean state so repeated calls do not accumulate.
        self.authors = ''
        self.bibliography = ''
        text_parts = []
        bibliography_parts = []
        in_bibliography = False
        for paragraph in self._docx.paragraphs:
            runs = paragraph.runs
            if not runs:
                # Empty paragraphs carry no text and have no first run.
                continue
            # NOTE(review): font.size is None when the size is inherited
            # from a style; the comparison then raises TypeError, which
            # process_document reports as a failed extraction.
            if runs[0].font.size < Pt(15):
                for run in runs:
                    run_text = run.text
                    if title == _strip_stress_(run_text):
                        text_parts.append(run_text[:1] + run_text[1:].lower())
                    elif in_bibliography or "Лит" in run_text:
                        in_bibliography = True
                        bibliography_parts.append(run_text)
                    else:
                        text_parts.append(run_text)
                # NOTE(review): assumes one newline per accepted paragraph;
                # confirm against the original indentation.
                text_parts.append('\n')
        self.bibliography = ''.join(bibliography_parts)
        res = ''.join(text_parts)
        head = _strip_stress_(res[:150]).lower()
        stripped_pos = head.find(title.lower())
        if stripped_pos > 2:
            # The position was found in stress-stripped text; translate it
            # back so the cut lands on the same character in ``res``.
            cut = self._original_index(res, stripped_pos)
            # The single character right before the title is dropped
            # (presumably a separator between authors and title).
            self.authors = res[:cut - 1]
            res = res[cut:]
        return res

    @classmethod
    def _original_index(cls, text: str, stripped_index: int) -> int:
        '''Map an index in stress-stripped *text* back to an index in *text*.'''
        kept = 0
        for index, char in enumerate(text):
            if char == cls._STRESS_MARK:
                continue
            if kept == stripped_index:
                return index
            kept += 1
        return len(text)

    def process_typograph(self):
        '''Run the external typograph script over ``self.text``.

        Writes the text to the file ``test``, runs
        ``node process_typograph.js`` and replaces ``self.text`` with the
        contents of ``temp.log`` (presumably produced by that script; the
        script is not visible here). Both files are removed afterwards,
        even when a step fails.

        Raises ``subprocess.CalledProcessError`` when the script exits with
        a non-zero status.
        '''
        try:
            with open('test', 'wb') as file:
                file.write(self.text.encode('utf-8'))
            # Without shell=True the command must be an argument list; a
            # single string is taken as the program name on POSIX.
            subprocess.run(['node', 'process_typograph.js'],
                           capture_output=True, check=True)
            with open('temp.log', 'r', encoding='utf8') as file:
                self.text = file.read()
        finally:
            for path in ('test', 'temp.log'):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    # A failed step may not have created the file.
                    pass