BRE/webapi/portal/electron_loading.py
2024-06-07 19:50:21 +03:00

172 lines
7.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

'''
Automated downloading from the site by means of selenium,
using the Chrome browser (a driver is required)
'''
import csv
import os

from selenium.webdriver.common.by import By

from .selenium_wrapper import WebBrowser
# Page opened by GBDownloader.login() before the credentials are typed in.
_LOGIN_PAGE = 'https://bigenc.ru/user/login'
# Base URL of an article page; the article id is appended as '<base>/<id>'.
_ARTICLE_PAGE = 'https://bigenc.ru/user/content/articles'
class GBDownloader:
    '''Walk through BRE articles in a browser and record them in CSV files.

    Four CSV files are created inside ``save_path``:

    * ``!out_bd_NNN.csv`` - one summary row per article;
    * ``!ref_bd_NNN.csv`` - links found in the article;
    * ``!img_bd_NNN.csv`` - image sources found on the page;
    * ``!bib_bd_NNN.csv`` - bibliography items.

    ``NNN`` is derived from ``suffix_seed``.  The object can be used as a
    context manager (the files are closed on exit); ``close()`` may also be
    called explicitly.
    '''

    # Known bibliography headings:
    # (heading, end of the search window, label stripped from the text, row type).
    # The windows reproduce the historical matching rules; note that the
    # 13-character window lets 'Общие труды.' match at offset 0 or 1.
    _BIBLIO_SECTIONS = (
        ('Соч.:', 5, 'Соч.: ', 'Soch'),
        ('Лит.:', 5, 'Лит.: ', 'Litr'),
        ('Общие труды.', 13, 'Общие труды. ', 'gnrl'),
    )

    def __init__(self, browser: 'WebBrowser', save_path: str, append: bool, suffix_seed: int,
                 encoding: 'str | None' = None):
        '''Open the output files and, for a fresh run, write their headers.

        Args:
            browser: wrapper around the selenium driver used for all page access.
            save_path: directory in which the CSV files are created.
            append: when true the files are opened in append mode and no
                header rows are written; otherwise they are truncated.
            suffix_seed: number used to build the three-character file suffix.
            encoding: text encoding of the CSV files; ``None`` (the default)
                keeps the platform default, exactly as before.

        Raises:
            OSError: if one of the files cannot be opened.  Files opened
                before the failure are closed again.
        '''
        self.browser = browser
        self.save_path = save_path
        self._files = []
        mode = 'a' if append else 'w'
        # Last three characters of the zero-prefixed seed: 7 -> '007';
        # seeds above 999 keep only their last three digits.
        suffix = ('00' + str(suffix_seed))[-3:]
        try:
            self._articles = self._open_output('out', suffix, mode, encoding)
            self._references = self._open_output('ref', suffix, mode, encoding)
            self._images = self._open_output('img', suffix, mode, encoding)
            self._bibliography = self._open_output('bib', suffix, mode, encoding)
            self.writer_out = csv.writer(self._articles)
            self.writer_ref = csv.writer(self._references)
            self.writer_img = csv.writer(self._images)
            self.writer_bib = csv.writer(self._bibliography)
            if not append:
                self._create_headers()
        except Exception:
            # Do not leak the handles that were opened before the failure.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        self.close()

    def close(self):
        '''Close every output file; safe to call more than once.

        Raises:
            OSError: the first error reported while closing, after all files
                have been given the chance to close.
        '''
        first_error = None
        # getattr: the attribute is missing when __init__ never ran.
        for handle in getattr(self, '_files', ()):
            try:
                handle.close()
            except OSError as err:
                if first_error is None:
                    first_error = err
        if first_error is not None:
            raise first_error

    def _open_output(self, prefix: str, suffix: str, mode: str, encoding):
        '''Open ``!<prefix>_bd_<suffix>.csv`` in ``save_path`` and track it for closing.'''
        name = '!{}_bd_{}.csv'.format(prefix, suffix)
        # newline='' is what the csv module requires for files it writes to.
        handle = open(os.path.join(self.save_path, name), mode, newline='', encoding=encoding)
        self._files.append(handle)
        return handle

    def login(self, login: str, password: str) -> bool:
        '''Login to the electronic version of BRE.

        Args:
            login: value typed into the ``__login`` field.
            password: value typed into the ``__password`` field.

        Returns:
            False when the login form does not appear; otherwise the result
            of waiting for the ``userMenu`` element after submitting.
        '''
        driver = self.browser.driver
        driver.get(_LOGIN_PAGE)
        if not self.browser.wait_presence('page-login-form'):
            return False
        driver.find_element(By.NAME, '__login').send_keys(login)
        driver.find_element(By.NAME, '__password').send_keys(password)
        driver.find_element(By.TAG_NAME, 'button').click()
        return self.browser.wait_presence('userMenu')

    def scan(self, start: str, stop: str, max_items: int) -> str:
        '''Process articles one after another, following the "next" links.

        Args:
            start: id of the first article to process.
            stop: id at which scanning ends; that article is not processed.
            max_items: upper bound on the number of processed articles.

        Returns:
            The id that would be processed next: ``stop`` when it was
            reached, ``''`` when no next article was found, otherwise the id
            following the last processed article.
        '''
        next_id = start
        for i in range(max_items):
            print('{}-{}'.format(i, next_id))
            next_id = self._process_article(next_id, i)
            # Flush after every article so an interrupted run keeps its data.
            self._flush()
            if next_id == '':
                print("Stopped with no next")
                break
            if next_id == stop:
                print('Stopped at {} and worked {} articles'.format(stop, i + 1))
                break
        return next_id

    def _flush(self):
        '''Flush all output files to disk.'''
        for handle in self._files:
            handle.flush()

    def _create_headers(self):
        '''Write the header row of each of the four CSV files.'''
        self.writer_out.writerow([
            'Indx', 'ArticleId', 'BlackWord', 'subWord', 'Rubrika', 'Year', 'Slovnik', 'Author(s)',
            'KolZnak', 'Status', 'Version', 'Litr', 'Soch', 'KolImg', 'KolCnt'
        ])
        # article id + id of the referenced article + referenced word + link
        self.writer_ref.writerow(['WordId', 'RefWordId', 'RefWord', 'RefURL'])
        # article id + image reference
        self.writer_img.writerow(['WordId', 'ImgId'])
        # article id + item number + bibliography type + bibliography text
        self.writer_bib.writerow(['WordId', 'BibId', 'BibType', 'BibText'])

    def _process_article(self, article_id: str, index: int) -> str:
        '''Open one article, record its data and return the id of the next one.

        Returns:
            The last path component of the "next" link, or ``''`` when the
            page did not load or no usable "next" link is present.
        '''
        self.browser.driver.get('{}/{}'.format(_ARTICLE_PAGE, article_id))
        if not self.browser.wait_presence('userMenu'):
            return ''
        self._download_rtf()
        image_count = self._extract_images(article_id)
        links_count = self._extract_links(article_id)
        literature, sochinenie = self._extract_biblio(article_id)
        self._extract_article_info(article_id, index, image_count, links_count, literature, sochinenie)
        next_link = self.browser.by_xpath('//a[.="следующая >>"]')
        # NOTE(review): assumes by_xpath may return None when nothing matches;
        # if it raises instead, that exception still propagates - confirm
        # against the WebBrowser wrapper.
        href = next_link.get_attribute('href') if next_link is not None else None
        if not href:
            # The data of this article is already written; '' tells scan()
            # that there is nowhere to go next.
            return ''
        return href.split('/')[-1]

    def _download_rtf(self) -> bool:
        '''Click the "Rich Text Format" link of the current page; always returns True.'''
        self.browser.by_xpath('//a[@title="Rich Text Format"]').click()
        return True

    def _extract_article_info(self, article_id: str, index: int, images: int, links: int, literature: int, soch: int):
        '''Read the article attributes from the page and write the summary row.

        Args:
            article_id: id of the article being processed.
            index: running number of the article within the scan.
            images: number of images found on the page.
            links: number of links found in the article.
            literature: value for the 'Litr' column.
            soch: value for the 'Soch' column.
        '''
        text_of = lambda xpath: self.browser.by_xpath(xpath).text
        black_word = text_of('//h1[@name="title"]')
        sub_word = text_of('//h2[@name="subtitle"]')
        section_name = text_of('//span[@name="section"]')  # site section
        markers = text_of('//span[@name="markers"]')  # word list
        # There may be several author elements or none; only the first is kept.
        authors = self.browser.by_xpath_m('//p[@class="Автор type-"]')
        author = authors[0].text if authors else ''
        char_num = text_of('//span[@class="art_num_chars"]')  # number of characters
        # Status text ("published" etc.) without its label.
        status = text_of('//span[@class="status-text"]').replace('Статус: ', '')
        version = text_of('//td[@id="art-cat"]')  # article version ("initial" etc.)
        year = text_of('//span[@class="year"]')
        self.writer_out.writerow([
            index, article_id, black_word, sub_word, section_name, year, markers, author,
            char_num, status, version, literature, soch, images, links
        ])

    def _extract_images(self, article_id: str) -> int:
        '''Write one row per ``<img>`` on the page (its ``src``); return their count.'''
        images = self.browser.by_xpath_m('//img')
        self.writer_img.writerows(
            [article_id, img.get_attribute('src')] for img in images
        )
        return len(images)

    def _extract_links(self, article_id: str) -> int:
        '''Write one row per ``processed-link`` anchor; return their count.'''
        links = self.browser.by_xpath_m('//a[@class="processed-link"]')
        self.writer_ref.writerows(
            [
                article_id,
                link.get_attribute('data-art'),
                link.get_attribute('data-word'),
                link.get_attribute('href'),
            ]
            for link in links
        )
        return len(links)

    def _extract_biblio(self, article_id: str) -> tuple:
        '''Split the bibliography blocks into items and write them out.

        Each block is classified by its heading ('Соч.:' -> 'Soch',
        'Лит.:' -> 'Litr', 'Общие труды.' -> 'gnrl', anything else -> 'inoe'),
        the heading label is removed and the rest is split on '; '.  Soft
        hyphens (``\\xad``) are removed from the text first.

        Returns:
            ``(litr, soch)`` - the semicolon count plus one of the last
            'Лит.:' and of the last 'Соч.:' block (0 when there is none).
        '''
        # NOTE(review): the original author's note also mentions replacing a
        # special space character with a plain space; that is not implemented.
        biblios = self.browser.by_xpath_m('//div[@class="puretext type-biblio"]')
        litr = 0
        soch = 0
        for bib_item in biblios:
            biblio = bib_item.text.replace('\xad', '')
            bib_type, label = 'inoe', None
            for heading, window, strip_label, kind in self._BIBLIO_SECTIONS:
                if biblio.find(heading, 0, window) != -1:
                    bib_type, label = kind, strip_label
                    break
            # NOTE(review): the totals count every ';' while the rows below
            # are split on '; ', so the two can disagree; kept as is because
            # the CSV consumers may rely on the current numbers.
            if bib_type == 'Soch':
                soch = biblio.count(';') + 1
            elif bib_type == 'Litr':
                litr = biblio.count(';') + 1
            body = biblio if label is None else biblio.replace(label, '')
            for number, item in enumerate(body.split('; '), start=1):
                self.writer_bib.writerow([article_id, number, bib_type, item])
        return litr, soch