import { Page } from '@playwright/test';
import fs from 'fs/promises';
import Papa from 'papaparse';
import { test } from './setup';
import { executeSearch } from './search';

/** Text prompt passed to `executeSearch` as `textPrompt`. */
const IN_TEXT_PROMPT = 'Министерство труда, занятости и социальной защиты Республики Коми';
/** Title prompt passed to `executeSearch` as `titlePrompt` (empty here). */
const IN_TITLE_PROMPT = '';

/** Output file naming: the CSV is written to `<OUT_DIR>/<OUT_NAME><OUT_SUFFIX>.csv`. */
const OUT_DIR = 'output';
const OUT_NAME = 'output';
const OUT_SUFFIX = '_1';
const OUT_FILENAME = `${OUT_DIR}/${OUT_NAME}${OUT_SUFFIX}.csv`;

/** One scraped search-result entry; each entry becomes one CSV row. */
interface DocumentInfo {
  /** Link text of the result with any `<em>` elements removed. */
  name: string;
  /** Numeric document ID parsed from `href`, or `''` when it cannot be parsed. */
  docID: string;
  /** Absolute document URL built from `docID`, or `''` when `docID` is unknown. */
  url: string;
  /** Raw `href` attribute of the result link. */
  href: string;
}

/** ===== RUN Auth FIRST before running this! ===== */
test('scrape documents list', async ({ page }) => {
  // Alternative entry point kept for reference:
  // await page.goto("https://demo.garant.ru/");
  // Relative URL — presumably resolved against the Playwright `baseURL` setting; confirm in config.
  await page.goto('/');

  await executeSearch(page, {
    textPrompt: IN_TEXT_PROMPT,
    titlePrompt: IN_TITLE_PROMPT,
    onlyActive: true
  });

  const documents = await readData(page);
  if (documents.length === 0) {
    console.log('No data found');
    return;
  }

  const csv = Papa.unparse(documents);
  // Create the output directory first so writeFile does not fail with ENOENT on a fresh checkout.
  await fs.mkdir(OUT_DIR, { recursive: true });
  await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
  console.log(`✅ Saved to ${OUT_FILENAME}`);
});

/**
 * Collects every entry of the paginated `ul.list` result list.
 *
 * For each page it reads all `ul.list > li` items, keeps those that have both
 * a link `href` and a non-empty name, and then clicks the "Следующая" control
 * until that control has no `class` attribute or its class contains `disabled`.
 *
 * @param page Playwright page that is already showing search results.
 * @returns All collected documents, in page order.
 * @throws When the list, the next-page control, or the page change does not
 *   show up within the respective Playwright timeouts.
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    // This callback runs in the browser context, so it must not reference
    // anything from the enclosing Node scope.
    const newDocuments = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';
          const nameEl = anchor?.querySelector('.name p');
          // Work on a detached clone so stripping <em> elements does not alter the live page.
          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
          if (cleanText) {
            cleanText.querySelectorAll('em').forEach(em => em.remove());
          }
          const name = cleanText?.textContent?.trim() ?? '';
          return { name, href };
        })
        .filter(item => !!item.href && !!item.name)
    );

    documents.push(
      ...newDocuments.map(item => ({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      }))
    );

    // getByText returns a Locator synchronously; only actions on it are awaited.
    const nextButton = page.getByText('Следующая');
    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link of the current page so the page change can be
    // detected by that link's href changing after the click.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();
    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: 5000 }
    );
  }

  return documents;
}

// ========== INTERNALS ===========

/**
 * Extracts the numeric document ID from an href containing `/#/document/<digits>/`.
 *
 * @param href Link target to parse.
 * @returns The digit string, or `null` when the href does not match.
 */
function extractDocumentID(href: string): string | null {
  const match = href.match(/\/#\/document\/(\d+)\//);
  return match?.[1] ?? null;
}

/**
 * Builds the absolute `internet.garant.ru` document URL for an href.
 *
 * @param href Link target containing a document ID.
 * @returns The absolute URL, or `null` when no document ID can be extracted.
 */
function convertHrefToUrl(href: string): string | null {
  const docId = extractDocumentID(href);
  return docId ? `https://internet.garant.ru/#/document/${docId}` : null;
}