BRE/webapi/portal/electron_loading.py

'''
Automated downloading from the site by means of selenium,
using the Chrome browser (a driver is required).
'''
import csv
import os

from selenium.webdriver.common.by import By

from .selenium_wrapper import WebBrowser

# Page opened by GBDownloader.login() to submit the credentials.
_LOGIN_PAGE = 'https://bigenc.ru/user/login'
# Base URL of an article page; the article id is appended as the last path segment.
_ARTICLE_PAGE = 'https://bigenc.ru/user/content/articles'


class GBDownloader:
    '''Scrape article metadata of the electronic BRE into four CSV files.

    The files are created inside ``save_path`` and are named
    ``!out_bd_<suffix>.csv`` (articles), ``!ref_bd_<suffix>.csv`` (links),
    ``!img_bd_<suffix>.csv`` (images) and ``!bib_bd_<suffix>.csv``
    (bibliography).  Call close() or use the instance as a context manager
    to release the files deterministically; __del__ is only a safety net.
    '''

    def __init__(self, browser: 'WebBrowser', save_path: str, append: bool, suffix_seed: int):
        '''Open the output files and, unless appending, write the header rows.

        browser     -- browser wrapper used for all page access
        save_path   -- directory that receives the CSV files
        append      -- True: append to existing files and write no headers;
                       False: truncate the files and write headers
        suffix_seed -- number turned into the zero-padded file name suffix

        Raises whatever open() raises; files opened before the failure are
        closed again.
        '''
        mode = 'a' if append else 'w'
        # Zero-pad to at least three digits.  Seeds of 1000 and above keep all
        # their digits instead of being cut to the last three, which made
        # e.g. 1000 collide with 0.
        suffix = str(suffix_seed).zfill(3)
        self.browser = browser
        self.save_path = save_path

        self._files = []
        try:
            self._articles = self._open_csv('out', suffix, mode)
            self._references = self._open_csv('ref', suffix, mode)
            self._images = self._open_csv('img', suffix, mode)
            self._bibliography = self._open_csv('bib', suffix, mode)

            self.writer_out = csv.writer(self._articles)
            self.writer_ref = csv.writer(self._references)
            self.writer_img = csv.writer(self._images)
            self.writer_bib = csv.writer(self._bibliography)
            if not append:
                self._create_headers()
        except Exception:
            # Do not leak the handles that were already opened.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        self.close()

    def close(self):
        '''Close all output files; safe to call repeatedly or on a half-built instance.'''
        for handle in getattr(self, '_files', ()):
            handle.close()

    def _open_csv(self, kind: str, suffix: str, mode: str):
        '''Open ``!<kind>_bd_<suffix>.csv`` in save_path and register it for close().'''
        path = os.path.join(self.save_path, '!{}_bd_{}.csv'.format(kind, suffix))
        # NOTE(review): no encoding is passed, so the locale default applies and
        # characters outside it raise UnicodeEncodeError - confirm whether an
        # explicit encoding is wanted before changing the output format.
        handle = open(path, mode, newline='')
        self._files.append(handle)
        return handle

    def login(self, login: str, password: str) -> bool:
        '''Log in to the electronic version of BRE.

        Returns False when the login form does not appear, otherwise the
        result of waiting for the 'userMenu' element after submitting.
        '''
        self.browser.driver.get(_LOGIN_PAGE)
        if not self.browser.wait_presence('page-login-form'):
            return False

        driver = self.browser.driver
        driver.find_element(By.NAME, '__login').send_keys(login)
        driver.find_element(By.NAME, '__password').send_keys(password)
        driver.find_element(By.TAG_NAME, 'button').click()
        return self.browser.wait_presence('userMenu')

    def scan(self, start: str, stop: str, max_items: int) -> str:
        '''Process articles one after another, following the "next" links.

        Starts at article id ``start`` and handles at most ``max_items``
        articles.  Stops early when no next id is available or when the next
        id equals ``stop`` (that article itself is not processed).

        Returns the id that would be processed next ('' if there is none).
        '''
        next_id = start
        for i in range(max_items):
            print('{}-{}'.format(i, next_id))
            next_id = self._process_article(next_id, i)
            # Flush after every article so an interrupted run keeps its data.
            self._flush()
            if next_id == '':
                print("Stopped with no next")
                break
            if next_id == stop:
                print("Stopped at " + stop + " and worked " + str(i + 1) + " articles")
                break
        return next_id

    def _flush(self):
        '''Flush all output files to disk.'''
        for handle in self._files:
            handle.flush()

    def _create_headers(self):
        '''Write the header row of each CSV file.'''
        self.writer_out.writerow([
            'Indx', 'ArticleId', 'BlackWord', 'subWord', 'Rubrika', 'Year', 'Slovnik', 'Author(s)',
            'KolZnak', 'Status', 'Version', 'Litr', 'Soch', 'KolImg', 'KolCnt'
        ])
        # Article word id + id of the referenced word's article + the other word + URL
        self.writer_ref.writerow(['WordId', 'RefWordId', 'RefWord', 'RefURL'])

        # Word id + id of the illustration link
        self.writer_img.writerow(['WordId', 'ImgId'])

        # Word id + bibliography entry id + bibliography type + bibliography text
        self.writer_bib.writerow(['WordId', 'BibId', 'BibType', 'BibText'])

    def _process_article(self, article_id: str, index: int) -> str:
        '''Open one article, record its data and return the id of the next one.

        Returns '' when the page does not load or no next link is available.
        '''
        self.browser.driver.get('{}/{}'.format(_ARTICLE_PAGE, article_id))
        if not self.browser.wait_presence('userMenu'):
            return ''

        self._download_rtf()

        image_count = self._extract_images(article_id)
        links_count = self._extract_links(article_id)
        literature, sochinenie = self._extract_biblio(article_id)
        self._extract_article_info(article_id, index, image_count, links_count, literature, sochinenie)

        next_link = self.browser.by_xpath('//a[.="следующая >>"]')
        # A missing link or a link without href means there is nothing to
        # follow; report that as '' so scan() stops cleanly instead of crashing.
        href = next_link.get_attribute('href') if next_link is not None else None
        if not href:
            return ''
        return href.split('/')[-1]

    def _download_rtf(self):
        '''Click the "Rich Text Format" link of the current article.'''
        self.browser.by_xpath('//a[@title="Rich Text Format"]').click()
        return True

    def _extract_article_info(self, article_id: str, index: int, images: int, links: int, literature: int, soch: int):
        '''Read the article fields from the current page and write one row to the articles file.'''
        by_xpath = self.browser.by_xpath
        black_word = by_xpath('//h1[@name="title"]').text
        sub_word = by_xpath('//h2[@name="subtitle"]').text
        section_name = by_xpath('//span[@name="section"]').text  # site section (rubric)
        markers = by_xpath('//span[@name="markers"]').text  # word list
        # Author: there may be several matches or none; only the first is kept.
        authors = self.browser.by_xpath_m('//p[@class="Автор type-"]')
        author = authors[0].text if authors else ''
        char_num = by_xpath('//span[@class="art_num_chars"]').text  # number of characters
        status = by_xpath('//span[@class="status-text"]').text  # status text
        status = status.replace('Статус: ', '')
        version = by_xpath('//td[@id="art-cat"]').text  # article version
        year = by_xpath('//span[@class="year"]').text
        self.writer_out.writerow([
            index, article_id, black_word, sub_word, section_name, year, markers, author,
            char_num, status, version, literature, soch, images, links
        ])

    def _extract_images(self, article_id: str) -> int:
        '''Write the src of every <img> on the page; return how many were found.'''
        images = self.browser.by_xpath_m('//img')
        for img in images:
            self.writer_img.writerow([article_id, img.get_attribute('src')])
        return len(images)

    def _extract_links(self, article_id: str) -> int:
        '''Write every "processed-link" anchor of the page; return how many were found.'''
        links = self.browser.by_xpath_m('//a[@class="processed-link"]')
        for link in links:
            self.writer_ref.writerow([
                article_id, link.get_attribute('data-art'),
                link.get_attribute('data-word'),
                link.get_attribute('href')
            ])
        return len(links)

    def _write_biblio(self, article_id: str, bib_type: str, text: str):
        '''Split ``text`` on '; ' and write one numbered bibliography row per part.'''
        for number, item in enumerate(text.split('; '), start=1):
            self.writer_bib.writerow([article_id, number, bib_type, item])

    def _extract_biblio(self, article_id: str):
        '''Write the bibliography blocks of the page, classified by their leading marker.

        Returns the tuple (litr, soch): the entry counts of the last
        'Лит.:' block and of the last 'Соч.:' block (0 when absent).
        '''
        # Soft hyphens ('&shy;', \xad) are removed from the text before parsing.
        biblios = self.browser.by_xpath_m('//div[@class="puretext type-biblio"]')
        litr = 0
        soch = 0
        for bib_item in biblios:
            biblio = bib_item.text.replace('\xad', '')
            # NOTE(review): the counts use every ';' while the rows are split
            # on '; ', so the two can differ - confirm which one is intended.
            if biblio.startswith('Соч.:'):
                soch = biblio.count(';') + 1
                self._write_biblio(article_id, 'Soch', biblio.replace('Соч.: ', ''))
            elif biblio.startswith('Лит.:'):
                litr = biblio.count(';') + 1
                self._write_biblio(article_id, 'Litr', biblio.replace('Лит.: ', ''))
            elif biblio.find('Общие труды.', 0, 13) != -1:
                # The 13-character window also accepts the 12-character marker
                # when it is preceded by one extra character.
                self._write_biblio(article_id, 'gnrl', biblio.replace('Общие труды. ', ''))
            else:
                self._write_biblio(article_id, 'inoe', biblio)
        return litr, soch