// Playwright script: runs a search, scrapes the paginated result list, and writes it to a CSV file.
import fs from 'fs/promises';
import path from 'path';

import { Page } from '@playwright/test';
import Papa from 'papaparse';

import { test } from './setup';
import { executeSearch } from './search';

/** Search input: passed to `executeSearch` as `textPrompt`. */
const IN_TEXT_PROMPT = 'Представительство Республики Коми в Северо-Западном регионе Российской Федерации';
/** Search input: passed to `executeSearch` as `titlePrompt` (empty string here). */
const IN_TITLE_PROMPT = '';

/** Output file naming: base name of the CSV file. */
const OUT_NAME = 'output';
/** Output file naming: suffix appended to the base name. */
const OUT_SUFFIX = '_22';
/** Relative path of the CSV file written by the scrape test. */
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;

/** One row of the scraped result list, as exported to CSV. */
interface DocumentInfo {
  /** Trimmed text of the list entry's `.name p` element. */
  name: string;
  /** Document ID extracted from `href`, or `''` when none was found. */
  docID: string;
  /** Document URL built from the extracted ID, or `''` when none was found. */
  url: string;
  /** The entry's link: site origin followed by the anchor's `href` attribute. */
  href: string;
}

/**
 * Runs the configured search, collects every result page via `readData`,
 * and writes the rows to `OUT_FILENAME` as CSV.
 *
 * ===== RUN Auth FIRST before running this! =====
 */
test('scrape documents list', async ({ page }) => {
  // await page.goto("https://demo.garant.ru/");
  await page.goto('/');
  await executeSearch(page, {
    textPrompt: IN_TEXT_PROMPT,
    titlePrompt: IN_TITLE_PROMPT,
    onlyActive: true
  });

  const documents = await readData(page);
  if (documents.length === 0) {
    console.log('No data found');
    return;
  }

  // writeFile does not create missing parent directories, so make sure the
  // output folder exists before writing.
  await fs.mkdir(path.dirname(OUT_FILENAME), { recursive: true });
  await fs.writeFile(OUT_FILENAME, Papa.unparse(documents), 'utf-8');
  // Report the path that was actually written.
  console.log(`✅ Saved to ${OUT_FILENAME}`);
});

/**
 * Collects all entries of the paginated result list (`ul.list > li`).
 *
 * For each page it reads every entry's link and title, then clicks the
 * "Следующая" (next) control until that control is missing, has no `class`
 * attribute, or its class contains `disabled`.
 *
 * @param page Playwright page that is already showing search results.
 * @returns One `DocumentInfo` per entry that has both a link and a title.
 * @throws Playwright timeout errors if the list never appears, or if the first
 *   entry's link does not change within 5 s after clicking "next".
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const siteOrigin = 'https://internet.garant.ru';
  const pageChangeTimeoutMs = 5000;
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    // This callback runs in the browser, so it cannot see Node-side values.
    // It returns the raw attribute values; the origin is prepended afterwards.
    const rows = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          return {
            // textContent already contains the text of nested elements such
            // as <em>, so no unwrapping of inline markup is required.
            name: anchor?.querySelector('.name p')?.textContent?.trim() ?? '',
            href: anchor?.getAttribute('href') ?? ''
          };
        })
        // Filter on the raw href: once the origin is prepended the string is
        // never empty, so entries without a link would slip through.
        .filter(row => row.href !== '' && row.name !== '')
    );

    for (const row of rows) {
      const href = siteOrigin + row.href;
      // Key order (name, href, docID, url) determines the CSV column order.
      documents.push({
        name: row.name,
        href,
        docID: extractDocumentID(href) ?? '',
        url: convertHrefToUrl(href) ?? ''
      });
    }

    // getByText returns a Locator synchronously; nothing to await here.
    const nextButton = page.getByText('Следующая');
    // Without this check getAttribute would wait for a control that never
    // appears and throw, discarding the rows collected so far.
    if ((await nextButton.count()) === 0) {
      break;
    }
    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so we can detect when the next page has rendered.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();

    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first !== null && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: pageChangeTimeoutMs }
    );
  }

  return documents;
}

// ========== INTERNALS ===========

/**
 * Extracts the numeric document ID from a link containing `/#/document/<digits>`.
 *
 * The ID must be followed by `/`, `?`, or the end of the string, so both deep
 * links (`/#/document/123/paragraph/1`) and bare document URLs
 * (`/#/document/123`) are recognised.
 *
 * @param href Link text to search.
 * @returns The ID as a string of digits, or `null` when no ID is present.
 */
function extractDocumentID(href: string): string | null {
  const match = href.match(/\/#\/document\/(\d+)(?:[\/?]|$)/);
  return match?.[1] ?? null;
}

/**
 * Builds the direct document URL for a result-list link.
 *
 * @param href Link from which `extractDocumentID` tries to read a document ID.
 * @returns `https://internet.garant.ru/#/document/<id>`, or `null` when
 *   `extractDocumentID` finds no ID in `href`.
 */
function convertHrefToUrl(href: string): string | null {
  const docId = extractDocumentID(href);
  if (!docId) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${docId}`;
}