BRE/webapi/portal/electron_loading.py

172 lines
7.8 KiB
Python
Raw Normal View History

2024-06-07 19:50:21 +03:00
'''
Automated downloading from the site by means of Selenium,
using the Chrome browser (a driver is required).
'''
import csv
import os

from selenium.webdriver.common.by import By

from .selenium_wrapper import WebBrowser
# Page opened by GBDownloader.login() to submit the credentials.
_LOGIN_PAGE = "https://bigenc.ru/user/login"
# Prefix of an article page; the article id is appended after a '/'.
_ARTICLE_PAGE = "https://bigenc.ru/user/content/articles"
class GBDownloader:
    '''Walk through article pages and dump their data into four CSV files.

    The files are created in ``save_path`` and named ``!out_bd_<suffix>.csv``
    (one row per article), ``!ref_bd_<suffix>.csv`` (processed links),
    ``!img_bd_<suffix>.csv`` (image sources) and ``!bib_bd_<suffix>.csv``
    (bibliography entries).

    The instance owns the four file handles; call :meth:`close` (or use the
    instance as a context manager) to release them deterministically.
    '''

    # Bibliography kinds recognised by _extract_biblio, checked in this order:
    # (marker, end of the search window, text removed from the entry, CSV tag).
    # The marker must be found inside biblio[0:window].  The window for
    # 'Общие труды.' is one character wider than the marker, so a marker that
    # starts at index 1 also matches; this mirrors the original behaviour.
    _BIBLIO_KINDS = (
        ('Соч.:', 5, 'Соч.: ', 'Soch'),
        ('Лит.:', 5, 'Лит.: ', 'Litr'),
        ('Общие труды.', 13, 'Общие труды. ', 'gnrl'),
    )

    def __init__(self, browser: 'WebBrowser', save_path: str, append: bool, suffix_seed: int):
        '''Open the four output files and, unless appending, write headers.

        Args:
            browser: project browser wrapper; this class uses its ``driver``,
                ``wait_presence``, ``by_xpath`` and ``by_xpath_m`` members.
            save_path: directory that receives the CSV files.
            append: open the files in ``'a'`` mode and skip the header rows
                when true; otherwise truncate them and write the headers.
            suffix_seed: number used to build the file-name suffix.

        Raises:
            OSError: if any output file cannot be opened; files opened before
                the failure are closed first.
        '''
        mode = 'a' if append else 'w'
        # Left-pad to three characters.  Only the last three characters are
        # kept, so a seed above 999 is truncated (1234 -> '234').
        suffix = ('00' + str(suffix_seed))[-3:]
        self.browser = browser
        self.save_path = save_path
        self._files = []
        try:
            self._articles = self._open_csv('out', suffix, mode)
            self._references = self._open_csv('ref', suffix, mode)
            self._images = self._open_csv('img', suffix, mode)
            self._bibliography = self._open_csv('bib', suffix, mode)
        except OSError:
            # Do not leak the handles that were opened before the failure.
            self.close()
            raise
        self.writer_out = csv.writer(self._articles)
        self.writer_ref = csv.writer(self._references)
        self.writer_img = csv.writer(self._images)
        self.writer_bib = csv.writer(self._bibliography)
        if not append:
            self._create_headers()

    def __del__(self):
        # Best-effort cleanup; close() tolerates a half-built instance.
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        '''Close every output file opened so far; safe to call repeatedly.'''
        for handle in getattr(self, '_files', ()):
            handle.close()

    def _open_csv(self, kind: str, suffix: str, mode: str):
        '''Open ``!<kind>_bd_<suffix>.csv`` inside ``save_path`` and track it.'''
        # os.path.join keeps the Windows result of the former hard-coded '\\'
        # separator and also produces a real path on other platforms.
        path = os.path.join(self.save_path, '!{}_bd_{}.csv'.format(kind, suffix))
        # NOTE(review): no encoding is passed, so open() uses the platform
        # default; left unchanged so that appended files stay consistent with
        # files written by earlier runs.
        handle = open(path, mode, newline='')
        self._files.append(handle)
        return handle

    def login(self, login: str, password: str) -> bool:
        '''Login to electron version of BRE.

        Opens the login page, fills the ``__login`` and ``__password`` fields
        and clicks the first ``button`` element.

        Returns:
            The result of waiting for ``'userMenu'`` after submitting, or
            ``False`` when ``'page-login-form'`` did not appear.
        '''
        self.browser.driver.get(_LOGIN_PAGE)
        if not self.browser.wait_presence('page-login-form'):
            return False
        driver = self.browser.driver
        driver.find_element(By.NAME, '__login').send_keys(login)
        driver.find_element(By.NAME, '__password').send_keys(password)
        driver.find_element(By.TAG_NAME, 'button').click()
        return self.browser.wait_presence('userMenu')

    def scan(self, start: str, stop: str, max_items: int) -> str:
        '''Process up to ``max_items`` articles, following the "next" links.

        Args:
            start: id of the first article to process.
            stop: id at which scanning stops; the article with this id is
                not processed.
            max_items: upper bound on the number of processed articles.

        Returns:
            The id that would be processed next: ``''`` when no next article
            was found, ``stop`` when the stop id was reached, or ``start``
            when ``max_items`` is not positive.
        '''
        next_id = start
        for i in range(max_items):
            print('{}-{}'.format(i, next_id))
            next_id = self._process_article(next_id, i)
            # Flush after every article so an interrupted run keeps its data.
            self._flush()
            if next_id == '':
                print("Stopped with no next")
                break
            if next_id == stop:
                print('Stopped at {} and worked {} articles'.format(stop, i + 1))
                break
        return next_id

    def _flush(self):
        '''Flush all four output files.'''
        for handle in (self._articles, self._references, self._images, self._bibliography):
            handle.flush()

    def _create_headers(self):
        '''Write the header row of each CSV file.'''
        self.writer_out.writerow([
            'Indx', 'ArticleId', 'BlackWord', 'subWord', 'Rubrika', 'Year', 'Slovnik', 'Author(s)',
            'KolZnak', 'Status', 'Version', 'Litr', 'Soch', 'KolImg', 'KolCnt'
        ])
        # Article word id + id of the referenced word's article + referenced word + link
        self.writer_ref.writerow(['WordId', 'RefWordId', 'RefWord', 'RefURL'])
        # Word id + illustration link id
        self.writer_img.writerow(['WordId', 'ImgId'])
        # Word id + bibliography entry id + bibliography type + bibliography text
        self.writer_bib.writerow(['WordId', 'BibId', 'BibType', 'BibText'])

    def _process_article(self, article_id: str, index: int) -> str:
        '''Open one article page, record its data and return the next id.

        Returns:
            The last path segment of the "next" link's ``href``, or ``''``
            when the page did not load (``'userMenu'`` never appeared) or the
            link carries no usable ``href``.
        '''
        self.browser.driver.get('{}/{}'.format(_ARTICLE_PAGE, article_id))
        if not self.browser.wait_presence('userMenu'):
            return ''
        self._download_rtf()
        image_count = self._extract_images(article_id)
        links_count = self._extract_links(article_id)
        literature, sochinenie = self._extract_biblio(article_id)
        self._extract_article_info(article_id, index, image_count, links_count, literature, sochinenie)
        next_link = self.browser.by_xpath('//a[.="следующая >>"]')
        # get_attribute() returns None for a missing attribute; report that as
        # "no next article" instead of failing on None.split().
        # NOTE(review): whether by_xpath can return None is not visible here;
        # the guard is harmless if it raises instead.
        href = next_link.get_attribute('href') if next_link is not None else None
        if not href:
            return ''
        return href.split('/')[-1]

    def _download_rtf(self):
        '''Click the link titled "Rich Text Format"; always returns True.'''
        # Presumably this makes the browser download the article as RTF.
        self.browser.by_xpath('//a[@title="Rich Text Format"]').click()
        return True

    def _extract_article_info(self, article_id: str, index: int, images: int, links: int, literature: int, soch: int):
        '''Read the article's header fields and write one row to the articles CSV.

        ``images``, ``links``, ``literature`` and ``soch`` are counters computed
        by the other ``_extract_*`` methods and are written through unchanged.
        '''
        by_xpath = self.browser.by_xpath
        black_word = by_xpath('//h1[@name="title"]').text
        sub_word = by_xpath('//h2[@name="subtitle"]').text
        section_name = by_xpath('//span[@name="section"]').text  # site section (rubric)
        markers = by_xpath('//span[@name="markers"]').text  # glossary (word list)
        # Author: the page may carry two such elements or none; the first wins.
        authors = self.browser.by_xpath_m('//p[@class="Автор type-"]')
        author = authors[0].text if authors else ''
        char_num = by_xpath('//span[@class="art_num_chars"]').text  # character count
        # Status ("published" etc.), without its label.
        status = by_xpath('//span[@class="status-text"]').text.replace('Статус: ', '')
        version = by_xpath('//td[@id="art-cat"]').text  # article version ("Original" etc.)
        year = by_xpath('//span[@class="year"]').text
        self.writer_out.writerow([
            index, article_id, black_word, sub_word, section_name, year, markers, author,
            char_num, status, version, literature, soch, images, links
        ])

    def _extract_images(self, article_id: str) -> int:
        '''Write the ``src`` of every ``<img>`` on the page; return their count.'''
        images = self.browser.by_xpath_m('//img')
        self.writer_img.writerows(
            [article_id, img.get_attribute('src')] for img in images
        )
        return len(images)

    def _extract_links(self, article_id: str) -> int:
        '''Write every ``processed-link`` anchor to the references CSV; return their count.'''
        links = self.browser.by_xpath_m('//a[@class="processed-link"]')
        for link in links:
            self.writer_ref.writerow([
                article_id,
                link.get_attribute('data-art'),
                link.get_attribute('data-word'),
                link.get_attribute('href'),
            ])
        return len(links)

    @classmethod
    def _classify_biblio(cls, biblio: str):
        '''Return ``(tag, text)`` for one bibliography block.

        ``text`` is ``biblio`` with every occurrence of the kind's prefix text
        removed; an unrecognised block is tagged ``'inoe'`` and left as is.
        '''
        for marker, window, prefix, tag in cls._BIBLIO_KINDS:
            if biblio.find(marker, 0, window) != -1:
                return tag, biblio.replace(prefix, '')
        return 'inoe', biblio

    def _extract_biblio(self, article_id: str):
        '''Write the page's bibliography entries; return ``(litr, soch)``.

        Each ``puretext type-biblio`` block is classified by its leading
        marker, split on ``'; '`` and written as one row per entry, numbered
        from 1 within the block.

        Returns:
            ``litr`` and ``soch``: the number of ``';'`` characters plus one in
            the last "Лит.:" / "Соч.:" block seen, or 0 when there is none.
        '''
        # Original author's note: bibliography is handled per text chunk;
        # soft hyphens (\xad) are removed.  The note also mentions replacing
        # a special space with a regular one (not implemented) and that the
        # 'type-' substring possibly stands for "Лит.:".
        biblios = self.browser.by_xpath_m('//div[@class="puretext type-biblio"]')
        litr = 0
        soch = 0
        for bib_item in biblios:
            biblio = bib_item.text.replace('\xad', '')
            tag, text = self._classify_biblio(biblio)
            # The counters are taken from the raw block (';' occurrences), so
            # they may differ from the number of rows written below.
            if tag == 'Soch':
                soch = biblio.count(';') + 1
            elif tag == 'Litr':
                litr = biblio.count(';') + 1
            for number, item in enumerate(text.split('; '), start=1):
                self.writer_bib.writerow([article_id, number, tag, item])
        return litr, soch