# BRE/webapi/portal/document.py

'''Processing docx files'''
import os
import subprocess
import docx
from docx.shared import Pt


def _strip_stress_(text: str) -> str:
    ''' Удаляет с текста ударения text: текст '''
    a = bytes(text, encoding='utf8').split(b"\xcc\x81")
    res = ""
    for i in a:
        res += str(i, encoding='utf-8')
    return res


class DocxTextProcessor:
    '''Extract body text, authors and bibliography from a .docx document.

    Results are exposed as plain string attributes:
    text: body text, one line per processed paragraph
    authors: text found before the article title at the start of the body
    bibliography: text of every run from the first one containing "Лит" on
    '''

    def __init__(self, file_name=None) -> None:
        '''Create the processor and, if file_name is given, parse that file.

        file_name: path of the .docx file to read; None leaves the
            processor empty until process_document() is called
        '''
        self._docx = None
        self.text = ''
        self.authors = ''
        self.bibliography = ''
        self._name = ''
        if file_name is not None:
            self.process_document(file_name)

    def process_document(self, file_name: str) -> bool:
        '''Load file_name and fill text, authors and bibliography.

        file_name: path of the .docx file to read

        Returns True on success, False when text extraction fails.
        Errors raised while opening the document propagate to the caller.
        '''
        self._docx = docx.Document(file_name)
        # The file stem doubles as the article title that get_text__ looks for.
        # os.path.basename also copes with backslash separators on Windows.
        self._name = os.path.basename(file_name).split('.docx')[0]
        # Do not keep the previous document's text if extraction fails.
        self.text = ''
        try:
            self.text = self.get_text__()
        except Exception:  # pylint: disable=broad-except
            return False
        return True

    @staticmethod
    def _raw_index(raw: str, stripped_index: int) -> int:
        '''Map an index in the stress-stripped copy of raw to an index in raw.

        Returns len(raw) when stripped_index lies past the last
        non-stress character.
        '''
        seen = 0
        for index, char in enumerate(raw):
            if char == '\u0301':
                continue
            if seen == stripped_index:
                return index
            seen += 1
        return len(raw)

    def get_text__(self):
        '''Build the body text of the loaded document.

        Paragraphs whose first run is 15pt or larger are skipped, as are
        paragraphs without runs. A run equal to the document name (ignoring
        stress marks) is kept with everything after its first character
        lower-cased. From the first run containing "Лит" onwards, runs go to
        self.bibliography instead of the body. If the document name occurs
        in the first 150 characters at offset 3 or later, the text before it
        (minus one separator character) becomes self.authors and is cut
        from the returned body.

        Returns the body text; self.authors and self.bibliography are
        rebuilt on every call.
        '''
        name = self._name
        size_limit = Pt(15)
        body_parts = []
        bibliography_parts = []
        is_bib_part = False
        # Rebuild instead of appending so a reused processor does not mix
        # data from different documents.
        self.authors = ''
        self.bibliography = ''
        for paragraph in self._docx.paragraphs:
            runs = paragraph.runs
            if not runs:
                continue
            size = runs[0].font.size
            # NOTE(review): a size of None means the run sets no explicit
            # size; such paragraphs are treated as body text here - confirm
            # this matches the documents being processed.
            if size is not None and size >= size_limit:
                continue
            for run in runs:
                if name == _strip_stress_(run.text):
                    body_parts.append(run.text[:1] + run.text[1:].lower())
                elif is_bib_part or "Лит" in run.text:
                    is_bib_part = True
                    bibliography_parts.append(run.text)
                else:
                    body_parts.append(run.text)
            body_parts.append('\n')
        self.bibliography = ''.join(bibliography_parts)
        res = ''.join(body_parts)

        head = res[:150]
        position = _strip_stress_(head).lower().find(name.lower())
        if position > 2:
            # The search ran on stress-stripped text, so translate the hit
            # back to an offset in the unstripped text before slicing.
            # Assumes lower() keeps the character count unchanged.
            start = self._raw_index(head, position)
            self.authors = res[:start - 1]
            res = res[start:]
        return res

    def process_typograph(self):
        '''Run self.text through the external typograph script.

        Writes the text to the file "test" in the working directory, runs
        "node process_typograph.js" and replaces self.text with the content
        of "temp.log" (presumably written by the script - it is not visible
        here). Both files are removed afterwards, even on failure.

        Raises subprocess.CalledProcessError when node exits with a
        non-zero status, and OSError when node or a file is unavailable.
        '''
        try:
            with open('test', 'wb') as file:
                file.write(self.text.encode('utf-8'))
            # An argument list is required: without shell=True a command
            # string is treated as the executable name on POSIX.
            subprocess.run(['node', 'process_typograph.js'],
                           capture_output=True, check=True)
            with open('temp.log', 'r', encoding='utf8') as file:
                self.text = file.read()
        finally:
            for path in ('test', 'temp.log'):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass