172 lines
7.8 KiB
Python
172 lines
7.8 KiB
Python
![]() |
'''
Automatic download from the site by means of selenium
using the Chrome browser (a driver is required)
'''
|
|||
|
import csv
import os

from selenium.webdriver.common.by import By

from .selenium_wrapper import WebBrowser
|
|||
|
|
|||
|
# Login form URL; GBDownloader.login() opens this page before filling in the credentials.
_LOGIN_PAGE = 'https://bigenc.ru/user/login'
|
|||
|
# Base URL of article pages; GBDownloader._process_article() appends '/<article_id>' to it.
_ARTICLE_PAGE = 'https://bigenc.ru/user/content/articles'
|
|||
|
|
|||
|
|
|||
|
class GBDownloader:
    '''Walk BRE articles in a browser session and record them into four CSV files.

    The files are created inside ``save_path`` and are named
    ``!out_bd_NNN.csv`` (one summary row per article), ``!ref_bd_NNN.csv``
    (processed links), ``!img_bd_NNN.csv`` (image sources) and
    ``!bib_bd_NNN.csv`` (bibliography items), where ``NNN`` is ``suffix_seed``
    zero-padded to at least three digits.

    The instance may be used as a context manager; ``close()`` releases the
    files explicitly and is safe to call more than once.
    '''

    # Attribute names of the open CSV file handles, in creation order.
    _FILE_ATTRS = ('_articles', '_references', '_images', '_bibliography')

    # (marker text, width of the leading window searched for it, BibType tag).
    # The window for 'Общие труды.' is one character wider than the marker,
    # exactly as in the original find(..., 0, 13) call.
    _BIB_SECTIONS = (
        ('Соч.:', 5, 'Soch'),
        ('Лит.:', 5, 'Litr'),
        ('Общие труды.', 13, 'gnrl'),
    )

    def __init__(self, browser: 'WebBrowser', save_path: str, append: bool, suffix_seed: int):
        '''Open the four CSV files and, unless appending, write their headers.

        Args:
            browser: wrapper that exposes ``driver``, ``wait_presence``,
                ``by_xpath`` and ``by_xpath_m``.
            save_path: directory that receives the CSV files.
            append: ``True`` to append to existing files (no header is
                written), ``False`` to overwrite them and write headers.
            suffix_seed: number used as the file-name suffix.

        Raises:
            OSError: when one of the files cannot be opened; files that were
                already opened are closed before the error propagates.
        '''
        mode = 'a' if append else 'w'
        # Zero-pad without truncating, so seeds >= 1000 keep distinct names.
        suffix = str(suffix_seed).zfill(3)
        self.browser = browser
        self.save_path = save_path

        try:
            self._articles = self._open_csv('out', suffix, mode)
            self._references = self._open_csv('ref', suffix, mode)
            self._images = self._open_csv('img', suffix, mode)
            self._bibliography = self._open_csv('bib', suffix, mode)

            self.writer_out = csv.writer(self._articles)
            self.writer_ref = csv.writer(self._references)
            self.writer_img = csv.writer(self._images)
            self.writer_bib = csv.writer(self._bibliography)
            if not append:
                self._create_headers()
        except Exception:
            # Do not leak the handles that were opened before the failure.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __del__(self):
        # Last-resort cleanup; close() tolerates a partially built instance.
        self.close()

    def close(self):
        '''Close every CSV file that was opened; repeated calls are no-ops.'''
        for attr in self._FILE_ATTRS:
            handle = getattr(self, attr, None)
            if handle is not None and not handle.closed:
                handle.close()

    def _open_csv(self, kind: str, suffix: str, mode: str):
        '''Open ``!<kind>_bd_<suffix>.csv`` inside ``save_path`` for the csv module.'''
        file_name = '!{}_bd_{}.csv'.format(kind, suffix)
        # os.path.join keeps the path valid on every platform, not only Windows.
        return open(os.path.join(self.save_path, file_name), mode, newline='')

    def login(self, login: str, password: str) -> bool:
        '''Login to electron version of BRE.

        Returns ``False`` when the login form does not appear; otherwise
        submits the credentials and returns the result of waiting for the
        ``userMenu`` element.
        '''
        self.browser.driver.get(_LOGIN_PAGE)
        if not self.browser.wait_presence('page-login-form'):
            return False

        driver = self.browser.driver
        driver.find_element(By.NAME, '__login').send_keys(login)
        driver.find_element(By.NAME, '__password').send_keys(password)
        driver.find_element(By.TAG_NAME, 'button').click()
        return self.browser.wait_presence('userMenu')

    def scan(self, start: str, stop: str, max_items: int) -> str:
        '''Process up to ``max_items`` articles, following the "next" links.

        Scanning starts at article ``start`` and ends early when there is no
        next article or when the next article id equals ``stop`` (that
        article itself is not processed).

        Returns:
            The id that would be processed next: ``''`` when the chain ended,
            ``stop`` when the stop id was reached, or ``start`` when
            ``max_items`` is not positive.
        '''
        next_id = start
        for i in range(max_items):
            print('{}-{}'.format(i, next_id))
            next_id = self._process_article(next_id, i)
            # Flush after every article so a crash loses at most one article.
            self._flush()
            if next_id == '':
                print("Stopped with no next")
                break
            if next_id == stop:
                print("Stopped at " + stop + " and worked " + str(i + 1) + " articles")
                break
        return next_id

    def _flush(self):
        '''Flush all four CSV files to disk.'''
        for attr in self._FILE_ATTRS:
            getattr(self, attr).flush()

    def _create_headers(self):
        '''Write the header row of each CSV file.'''
        self.writer_out.writerow([
            'Indx', 'ArticleId', 'BlackWord', 'subWord', 'Rubrika', 'Year', 'Slovnik', 'Author(s)',
            'KolZnak', 'Status', 'Version', 'Litr', 'Soch', 'KolImg', 'KolCnt'
        ])
        # article word id + id of the referenced word's article + the other word + link
        self.writer_ref.writerow(['WordId', 'RefWordId', 'RefWord', 'RefURL'])

        # word id + id of the illustration link
        self.writer_img.writerow(['WordId', 'ImgId'])

        # word id + bibliography entry id + bibliography type + bibliography text
        self.writer_bib.writerow(['WordId', 'BibId', 'BibType', 'BibText'])

    def _process_article(self, article_id: str, index: int) -> str:
        '''Open one article page, record its data and return the next article id.

        Returns ``''`` when the page does not show the ``userMenu`` element or
        when no usable "next" link is found.
        '''
        self.browser.driver.get('{}/{}'.format(_ARTICLE_PAGE, article_id))
        if not self.browser.wait_presence('userMenu'):
            return ''

        self._download_rtf()

        image_count = self._extract_images(article_id)
        links_count = self._extract_links(article_id)
        literature, sochinenie = self._extract_biblio(article_id)
        self._extract_article_info(article_id, index, image_count, links_count, literature, sochinenie)

        next_link = self.browser.by_xpath('//a[.="следующая >>"]')
        # NOTE(review): guards the case where by_xpath yields None or the link
        # has no href; if by_xpath raises instead, that still propagates.
        href = next_link.get_attribute('href') if next_link is not None else None
        if not href:
            return ''
        return href.split('/')[-1]

    def _download_rtf(self) -> bool:
        '''Click the "Rich Text Format" link of the current page; always returns True.'''
        self.browser.by_xpath('//a[@title="Rich Text Format"]').click()
        return True

    def _extract_article_info(self, article_id: str, index: int, images: int, links: int, literature: int, soch: int):
        '''Read the article attributes from the page and write the summary row.'''
        by_xpath = self.browser.by_xpath
        black_word = by_xpath('//h1[@name="title"]').text
        sub_word = by_xpath('//h2[@name="subtitle"]').text
        section_name = by_xpath('//span[@name="section"]').text  # site section
        markers = by_xpath('//span[@name="markers"]').text  # glossary
        # The author element may be missing or occur twice; only the first is used.
        authors = self.browser.by_xpath_m('//p[@class="Автор type-"]')
        author = authors[0].text if authors else ''
        char_num = by_xpath('//span[@class="art_num_chars"]').text  # number of characters
        # Status text, with the 'Статус: ' label removed.
        status = by_xpath('//span[@class="status-text"]').text.replace('Статус: ', '')
        version = by_xpath('//td[@id="art-cat"]').text  # article version
        year = by_xpath('//span[@class="year"]').text
        self.writer_out.writerow([
            index, article_id, black_word, sub_word, section_name, year, markers, author,
            char_num, status, version, literature, soch, images, links
        ])

    def _extract_images(self, article_id: str) -> int:
        '''Write the ``src`` of every ``<img>`` on the page; return how many were found.'''
        images = self.browser.by_xpath_m('//img')
        for img in images:
            self.writer_img.writerow([article_id, img.get_attribute('src')])
        return len(images)

    def _extract_links(self, article_id: str) -> int:
        '''Write one row per ``processed-link`` anchor; return how many were found.'''
        links = self.browser.by_xpath_m('//a[@class="processed-link"]')
        for link in links:
            self.writer_ref.writerow([
                article_id,
                link.get_attribute('data-art'),
                link.get_attribute('data-word'),
                link.get_attribute('href'),
            ])
        return len(links)

    def _extract_biblio(self, article_id: str):
        '''Split every bibliography block into items and write them out.

        Soft hyphens are removed from the block text.  A block whose text
        begins with one of the markers in ``_BIB_SECTIONS`` gets that
        section's tag and has the marker (plus the following space) removed;
        any other block is tagged ``'inoe'``.  Items are separated by ``'; '``
        and numbered from 1 within each block.

        Returns:
            ``(litr, soch)``: semicolon count plus one for the last 'Лит.:'
            block and for the last 'Соч.:' block (0 when absent).
        '''
        biblios = self.browser.by_xpath_m('//div[@class="puretext type-biblio"]')
        counts = {'Litr': 0, 'Soch': 0}
        for bib_item in biblios:
            biblio = bib_item.text.replace('\xad', '')
            tag = 'inoe'
            for marker, window, section_tag in self._BIB_SECTIONS:
                if marker in biblio[:window]:
                    tag = section_tag
                    if tag in counts:
                        # NOTE(review): counts every ';' while rows are split on
                        # '; ', and a later block overwrites an earlier count.
                        # Kept as is to leave the emitted data unchanged.
                        counts[tag] = biblio.count(';') + 1
                    biblio = biblio.replace(marker + ' ', '')
                    break
            for number, item in enumerate(biblio.split('; '), start=1):
                self.writer_bib.writerow([article_id, number, tag, item])
        return counts['Litr'], counts['Soch']
|