Concept-Garant/src/scrape.spec.ts

114 lines
3.4 KiB
TypeScript
Raw Normal View History

2025-05-22 19:50:19 +03:00
import { Page } from '@playwright/test';
import fs from 'fs/promises';
import Papa from 'papaparse';
import { test } from './setup';
import { executeSearch } from './search';
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/** Search prompt matched against the document text. */
const IN_TEXT_PROMPT = 'Представительство Республики Коми в Северо-Западном регионе Российской Федерации';
/** Search prompt matched against the document title (empty string = not set). */
const IN_TITLE_PROMPT = '';

/** Base name of the generated CSV file. */
const OUT_NAME = 'output';
/** Suffix appended to the base name; used to tell separate runs apart. */
const OUT_SUFFIX = '_22';
/** Path of the CSV file, assembled from the base name and suffix above. */
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/** One row of the scraped result list, as written to the CSV output. */
interface DocumentInfo {
  /** Trimmed text of the entry's `.name p` element. */
  name: string;
  /** Numeric document id parsed out of `href`; empty string when none was found. */
  docID: string;
  /** Short document URL built from `docID`; empty string when there is no id. */
  url: string;
  /** Absolute link of the entry (site origin + the anchor's `href` attribute). */
  href: string;
}
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/**
 * Runs a search with the configured prompts, scrapes every page of the result
 * list and saves the collected entries as CSV to `OUT_FILENAME`.
 *
 * ===== RUN Auth FIRST before running this! =====
 */
test('scrape documents list', async ({ page }) => {
  // await page.goto("https://demo.garant.ru/");
  // NOTE(review): '/' is a relative URL — presumably resolved against a
  // baseURL from the Playwright config; confirm.
  await page.goto('/');
  await executeSearch(page, {
    textPrompt: IN_TEXT_PROMPT,
    titlePrompt: IN_TITLE_PROMPT,
    onlyActive: true
  });

  const documents = await readData(page);
  if (documents.length === 0) {
    console.log('No data found');
    return;
  }

  const csv = Papa.unparse(documents);
  await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
  // Report the path that was actually written instead of a hard-coded name.
  console.log(`✅ Saved to ${OUT_FILENAME}`);
});
2025-05-22 13:02:10 +03:00
2025-05-22 19:50:19 +03:00
/**
 * Collects every entry of the search result list, clicking through the
 * pagination control until it is reported as disabled.
 *
 * @param page Playwright page that is expected to show a `ul.list` result list.
 * @returns All scraped entries in page order, each enriched with `docID` and `url`.
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    // The callback is executed inside the browser page, so it must be
    // self-contained and cannot reference module-level values.
    const newDocuments = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';
          // textContent already includes the text of nested <em> elements,
          // so no clone/unwrap step is needed to get the plain name.
          const name = anchor?.querySelector('.name p')?.textContent?.trim() ?? '';
          return { name, href };
        })
        // Filter on the raw attribute value: once the origin is prepended the
        // href is never empty, which would let link-less entries through.
        .filter(item => item.href !== '' && item.name !== '')
        .map(item => ({ name: item.name, href: 'https://internet.garant.ru' + item.href }))
    );

    documents.push(
      ...newDocuments.map(item => ({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      }))
    );

    // getByText returns a Locator synchronously; there is nothing to await.
    // NOTE(review): if the "next page" control is absent from the page,
    // getAttribute waits for it and rejects on timeout — confirm the control
    // is always rendered, even for single-page results.
    const nextButton = page.getByText('Следующая');
    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so we can detect when the next page has rendered.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();

    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first !== null && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: 5000 }
    );
  }

  return documents;
}
// ========== INTERNALS ===========
/**
 * Extracts the numeric document id from a link containing `/#/document/<id>`.
 *
 * The id may be followed by a further path segment (`/#/document/123/...`) or
 * end the string (`/#/document/123`), so the short URLs produced by
 * `convertHrefToUrl` can be parsed as well.
 *
 * @param href Relative or absolute document link.
 * @returns The id as a string of digits, or `null` when the link has no id.
 */
function extractDocumentID(href: string): string | null {
  const match = /\/#\/document\/(\d+)(?:\/|$)/.exec(href);
  return match?.[1] ?? null;
}
/**
 * Builds the short document URL (`https://internet.garant.ru/#/document/<id>`)
 * from any link that carries a document id.
 *
 * @param href Relative or absolute document link.
 * @returns The short URL, or `null` when no document id can be extracted.
 */
function convertHrefToUrl(href: string): string | null {
  const id = extractDocumentID(href);
  if (!id) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${id}`;
}