Concept-Garant/src/scrape.spec.ts

105 lines
3.1 KiB
TypeScript
Raw Normal View History

2025-05-22 19:50:19 +03:00
import { Page } from '@playwright/test';
import fs from 'fs/promises';
import path from 'path';
import Papa from 'papaparse';
import { test } from './setup';
import { executeSearch } from './search';
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/** Search input: value passed to `executeSearch` as `textPrompt`. */
const IN_TEXT_PROMPT = 'Министерство труда, занятости и социальной защиты Республики Коми';
/** Search input: value passed to `executeSearch` as `titlePrompt` (empty string here). */
const IN_TITLE_PROMPT = '';
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/** Output file naming: base name of the CSV file (without suffix and extension). */
const OUT_NAME = 'output';
/** Suffix appended to the base name — presumably used to tell separate runs apart; confirm. */
const OUT_SUFFIX = '_1';
/** Path of the CSV file the scrape test writes: `output/<OUT_NAME><OUT_SUFFIX>.csv`. */
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/** One scraped search-result entry; an array of these is serialized to CSV. */
interface DocumentInfo {
/** Trimmed text of the result's `.name p` element with its `<em>` children removed. */
name: string;
/** Numeric document ID parsed from `href`, or `''` when the href does not contain one. */
docID: string;
/** Absolute `https://internet.garant.ru/#/document/<docID>` link, or `''` when no ID was found. */
url: string;
/** Raw `href` attribute of the result's anchor element. */
href: string;
}
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/**
 * Runs a search with the configured prompts, scrapes every page of the results
 * list and writes the collected documents to `OUT_FILENAME` as CSV.
 *
 * ===== RUN Auth FIRST before running this! =====
 */
test('scrape documents list', async ({ page }) => {
  // await page.goto("https://demo.garant.ru/");
  // Relative navigation: presumably resolved against the configured baseURL — confirm in the Playwright config.
  await page.goto('/');
  await executeSearch(page, {
    textPrompt: IN_TEXT_PROMPT,
    titlePrompt: IN_TITLE_PROMPT,
    onlyActive: true
  });

  const documents = await readData(page);
  if (documents.length === 0) {
    console.log('No data found');
    return;
  }

  // writeFile does not create missing parent directories, so make sure the
  // target folder exists first (a no-op when it is already there).
  await fs.mkdir(path.dirname(OUT_FILENAME), { recursive: true });

  const csv = Papa.unparse(documents);
  await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
  // Report the path that was actually written rather than a hard-coded name.
  console.log(`✅ Saved to ${OUT_FILENAME}`);
});
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/**
 * Collects the documents from the search results list, following pagination
 * until the "next page" control is disabled.
 *
 * On every page the function reads each `ul.list > li` item, takes the `href`
 * of its first anchor and the text of `.name p` (with `<em>` children removed),
 * drops items that lack either value, and derives `docID` and `url` from the href.
 *
 * @param page Playwright page that is expected to show (or be about to show) the results list.
 * @returns All documents gathered from the visited pages, in the order they were read.
 * @throws Propagates Playwright timeout errors, e.g. when `ul.list` never appears or
 *         the list does not change within the timeout after clicking "next".
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  // Maximum time to wait for the list content to change after clicking "next".
  const pageChangeTimeoutMs = 5000;
  const documents: DocumentInfo[] = [];

  // getByText is synchronous and returns a lazy Locator that is re-resolved on
  // every use, so it needs no `await` and can be created once outside the loop.
  // 'Следующая' is the Russian label for "Next".
  const nextButton = page.getByText('Следующая');

  while (true) {
    await page.waitForSelector('ul.list');

    // This callback is evaluated inside the browser page, so it must stay
    // self-contained and cannot reference anything from this module.
    const newDocuments = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';

          // Work on a clone so removing <em> elements does not alter the live page.
          const nameEl = anchor?.querySelector('.name p');
          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
          if (cleanText) {
            cleanText.querySelectorAll('em').forEach(em => em.remove());
          }

          const name = cleanText?.textContent?.trim() ?? '';
          return { name, href };
        })
        .filter(item => !!item.href && !!item.name)
    );

    documents.push(
      ...newDocuments.map(item => ({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      }))
    );

    // Stop when the control has no class attribute or is marked as disabled.
    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so we can detect when the next page has rendered.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();

    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: pageChangeTimeoutMs }
    );
  }

  return documents;
}
// ========== INTERNALS ===========
/**
 * Extracts the numeric document ID from a link.
 *
 * Recognizes `/#/document/<digits>` followed either by `/` (deep links such as
 * `/#/document/123/paragraph/1`) or by the end of the string (bare document
 * links such as the ones `convertHrefToUrl` produces).
 *
 * @param href Relative or absolute link to inspect.
 * @returns The ID as a string of digits, or `null` when the link contains no document ID.
 */
function extractDocumentID(href: string): string | null {
  const match = /\/#\/document\/(\d+)(?:\/|$)/.exec(href);
  return match?.[1] ?? null;
}
/**
 * Builds the absolute document link for the document referenced by `href`.
 *
 * @param href Link from which the document ID is extracted.
 * @returns `https://internet.garant.ru/#/document/<id>`, or `null` when no ID can be extracted.
 */
function convertHrefToUrl(href: string): string | null {
  const id = extractDocumentID(href);
  if (!id) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${id}`;
}