F: Implement basic search

This commit is contained in:
Ivan 2025-05-22 19:50:19 +03:00
parent 9b1c8a79e8
commit 721358aee8
11 changed files with 310 additions and 55 deletions

8
.gitignore vendored
View File

@ -1,3 +1,8 @@
# output
output/
auth.json
.user-data-dir
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -18,6 +23,7 @@ coverage.xml
.pytest_cache/
cover/
.mypy_cache/
test-results/
# React
.DS_*
@ -32,4 +38,4 @@ bower_components
# Environments
/GitExtensions.settings
/playwright-report
/playwright-report

6
.prettierignore Normal file
View File

@ -0,0 +1,6 @@
build/
node_modules/
package-lock.json
yarn.lock
package.json
coverage

13
.prettierrc.json Normal file
View File

@ -0,0 +1,13 @@
{
"semi": true,
"useTabs": false,
"printWidth": 120,
"tabWidth": 2,
"trailingComma": "none",
"arrowParens": "avoid",
"singleQuote": true,
"jsxSingleQuote": true,
"quoteProps": "consistent",
"bracketSameLine": false,
"bracketSpacing": true
}

10
.vscode/launch.json vendored
View File

@ -4,6 +4,16 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Setup Auth
"name": "Auth",
"type": "node",
"request": "launch",
"cwd": "${workspaceFolder}",
"runtimeExecutable": "npx",
"runtimeArgs": ["ts-node", "src/save-auth.ts"],
"console": "integratedTerminal"
},
{
// Run all
"name": "Run all",

View File

@ -1,3 +1,4 @@
{
"cSpell.words": ["papaparse", "unparse"]
"cSpell.words": ["papaparse", "unparse"],
"isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
}

View File

@ -1,3 +0,0 @@
{
"heading": "Example Domain"
}

View File

@ -1,18 +1,19 @@
import { defineConfig, devices } from "@playwright/test";
import { defineConfig, devices } from '@playwright/test';
export default defineConfig({
testDir: "src",
testDir: 'src',
retries: 0,
reporter: "list",
reporter: 'list',
timeout: 30 * 1000,
fullyParallel: true,
projects: [
{
name: "Desktop Chrome",
use: { ...devices["Desktop Chrome"] },
},
name: 'Desktop Chrome',
use: { ...devices['Desktop Chrome'] }
}
],
use: {
baseURL: "https://internet.garant.ru/",
trace: "on-first-retry",
},
baseURL: 'https://internet.garant.ru/',
storageState: 'auth.json'
}
});

36
src/save-auth.ts Normal file
View File

@ -0,0 +1,36 @@
import { chromium } from '@playwright/test';
import fs from 'fs/promises';
import { USER_DATA_DIR } from './setup';
/**
 * Account credentials for the interactive login below.
 *
 * They are read from the environment (GARANT_LOGIN / GARANT_PASSWORD) so that
 * no secret has to live in the repository.
 */
const IN_LOGIN = process.env.GARANT_LOGIN ?? '';
const IN_PASSWORD = process.env.GARANT_PASSWORD ?? '';

/**
 * Logs in through a visible browser window and stores the resulting
 * storage state in `auth.json`.
 */
(async () => {
  if (!IN_LOGIN || !IN_PASSWORD) {
    throw new Error('Set the GARANT_LOGIN and GARANT_PASSWORD environment variables before running save-auth');
  }

  // Launch persistent context with visible browser
  const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
    headless: false
  });

  try {
    const page = await context.newPage();
    await page.goto('https://account.garant.ru/login');

    // The form is filled in two steps: login first, then password,
    // each confirmed with the same "Войти" button.
    await page.getByRole('textbox').fill(IN_LOGIN);
    await page.getByRole('button', { name: 'Войти' }).click();
    await page.getByRole('textbox').fill(IN_PASSWORD);
    await page.getByRole('button', { name: 'Войти' }).click();

    // Wait until this button is rendered before reading the storage state.
    await page
      .getByRole('button', {
        name: 'Выйти из текущего сеанса и сменить пользователя'
      })
      .waitFor();

    // Save storage state to file
    await fs.writeFile('auth.json', JSON.stringify(await context.storageState()));
    console.log('✅ Saved login state to auth.json');
  } finally {
    // Always release the browser, otherwise the process keeps running.
    await context.close();
  }
})().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

View File

@ -1,48 +1,104 @@
import { test, expect } from "@playwright/test";
import fs from "fs/promises";
import Papa from "papaparse";
import { Page } from '@playwright/test';
import fs from 'fs/promises';
import Papa from 'papaparse';
import { test } from './setup';
import { executeSearch } from './search';
const OUTPUT_FILE = "output.csv";
/**
 * Search phrase passed to `executeSearch` as `textPrompt`.
 * The literal is Russian for "Ministry of Labour, Employment and Social
 * Protection of the Komi Republic".
 */
const IN_TEXT_PROMPT = 'Министерство труда, занятости и социальной защиты Республики Коми';
/** Search phrase passed to `executeSearch` as `titlePrompt`; currently empty. */
const IN_TITLE_PROMPT = '';
test("scrape data from example site", async ({ page }) => {
await page.goto("https://example.com");
/**
 * Output file naming.
 * The CSV path is assembled as `output/<OUT_NAME><OUT_SUFFIX>.csv`.
 */
const OUT_NAME = 'output';
const OUT_SUFFIX = '_1';
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
const heading = await page.locator("h1").textContent();
console.log({ heading });
/** One row of the scraped result list, as written to the CSV output. */
interface DocumentInfo {
// Trimmed text of the item's `.name p` element with `<em>` elements removed.
name: string;
// Numeric id parsed from `href`; empty string when the href has no id.
docID: string;
// Absolute document URL rebuilt from the id; empty string when the href has no id.
url: string;
// Raw `href` attribute of the list item's anchor.
href: string;
}
expect(heading).toBeTruthy(); // optional check
const data = await page.evaluate(() => {
const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
return { heading };
/** ===== RUN Auth FIRST before running this! ===== */
test('scrape documents list', async ({ page }) => {
// await page.goto("https://demo.garant.ru/");
await page.goto('/');
await executeSearch(page, {
textPrompt: IN_TEXT_PROMPT,
titlePrompt: IN_TITLE_PROMPT,
onlyActive: true
});
// Wait for the list to load
await page.waitForSelector("ul.list");
const documents = await page.$$eval("ul.list > li", (items) =>
items.map((li) => {
const anchor = li.querySelector("a");
const href = anchor?.getAttribute("href") ?? "";
const nameEl = anchor?.querySelector(".name p");
// Clone the node to strip <em> tags
const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
if (cleanText) {
// Remove <em> tags
cleanText.querySelectorAll("em").forEach((em) => em.remove());
}
const name = cleanText?.textContent?.trim() ?? "";
return { name, href };
})
);
// Convert to CSV
const csv = Papa.unparse(documents);
await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
console.log("✅ Saved to documents.csv");
const documents = await readData(page);
if (documents.length === 0) {
console.log('No data found');
return;
} else {
const csv = Papa.unparse(documents);
await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
console.log('✅ Saved to documents.csv');
}
});
/**
 * Collects all documents from the search result list, following the
 * "Следующая" (next page) control until it is missing or disabled.
 *
 * @param page Page that already shows the search results.
 * @returns Every list item that has both a name and an href, enriched with
 *   the parsed document id and the rebuilt document URL.
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    const newDocuments = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';
          const nameEl = anchor?.querySelector('.name p');
          // Work on a clone so the <em> elements can be dropped without touching the page.
          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
          if (cleanText) {
            cleanText.querySelectorAll('em').forEach(em => em.remove());
          }
          const name = cleanText?.textContent?.trim() ?? '';
          return { name: name, href: href };
        })
        .filter(item => !!item.href && !!item.name)
    );

    documents.push(
      ...newDocuments.map(item => ({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      }))
    );

    // getByText returns a Locator synchronously; there is nothing to await.
    const nextButton = page.getByText('Следующая');

    // Reading an attribute of a locator that matches nothing waits until the
    // timeout and then throws, so a page without the control ends the loop here.
    if ((await nextButton.count()) === 0) {
      break;
    }

    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so the page switch can be detected after the click.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();
    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: 5000 }
    );
  }

  return documents;
}
// ========== INTERNALS ===========
/**
 * Extracts the numeric document id from a hash-style document link.
 *
 * The id is the run of digits after `/#/document/`; it must be followed by
 * either `/` or the end of the string, so both `/#/document/123/...` and
 * `/#/document/123` are accepted.
 *
 * @param href Link or URL to inspect.
 * @returns The id as a string, or `null` when the href holds no document id.
 */
function extractDocumentID(href: string): string | null {
  const match = href.match(/\/#\/document\/(\d+)(?:\/|$)/);
  return match?.[1] ?? null;
}
/**
 * Builds the absolute document URL for a result-list link.
 *
 * @param href Link taken from the result list.
 * @returns `https://internet.garant.ru/#/document/<id>`, or `null` when no
 *   document id can be extracted from the href.
 */
function convertHrefToUrl(href: string): string | null {
  const id = extractDocumentID(href);
  if (!id) {
    return null;
  }
  return 'https://internet.garant.ru/#/document/' + id;
}

100
src/search.ts Normal file
View File

@ -0,0 +1,100 @@
import { Locator, Page } from 'playwright';
// Labels of the table cells that executeSearch ticks in the
// "Орган / источник" dialog. The strings are matched exactly against the UI,
// so they must stay in Russian.
// In English: federal ministries and agencies; Government of Russia and the
// USSR; President of Russia and the USSR; judicial authorities; legislative
// authorities.
const GLOBAL_FILTERS = [
'Федеральные министерства и ведомства',
'Правительство России и СССР',
'Президент России и СССР',
'Органы судебной власти РФ и СССР',
'Органы законодательной власти России и СССР'
] as const;
// Label of the cell that executeSearch clicks when the first LOCAL_FILTERS
// entry is not visible ("Authorities of the Komi Republic").
const LOCAL_TITLE = 'Органы власти Республики Коми';
// Labels of the Komi Republic cells that executeSearch ticks. The first entry
// doubles as the visibility probe that decides whether LOCAL_TITLE is clicked.
const LOCAL_FILTERS = [
'Правительство Республики Коми',
'Государственный Совет Республики Коми',
'Глава Республики Коми',
'Президиум Верховного Совета Республики Коми',
'Президиум Верховного Совета Коми ССР',
'Верховный Совет Коми АССР',
'Верховный Совет Коми ССР',
'Верховный Совет Республики Коми'
] as const;
/** Options accepted by `executeSearch`; every field is optional. */
export interface SearchOptions {
// Typed into the textarea next to the "Слова в тексте" label; skipped when empty or missing.
textPrompt?: string;
// Typed into the textarea next to the "Слова в названии" label; skipped when empty or missing.
titlePrompt?: string;
// When true, the "Действующие" entry of the "Статус" filter is ticked.
onlyActive?: boolean;
}
/**
 * Drives the advanced-search form: opens it, clears previous input when a
 * clear link is present, fills the requested prompts, optionally ticks the
 * "Действующие" status, ticks the configured authority filters and submits.
 *
 * @param page Page on which the advanced-search button is available.
 * @param options Prompts and flags that control which fields are filled.
 */
export async function executeSearch(page: Page, options: SearchOptions) {
  // Visibility probe that treats any failure as "not visible".
  const isShown = async (target: Locator): Promise<boolean> => {
    try {
      return await target.isVisible();
    } catch {
      return false;
    }
  };

  // Types `text` into the textarea found two levels above the matched label.
  const fillPrompt = async (labelSelector: string, text: string): Promise<void> => {
    const container = page.locator(labelSelector).locator('..').locator('..');
    await container.locator('textarea').fill(text);
  };

  // Clicks the cell's image at `imgIndex` unless the cell is already checked.
  const tickIfUnchecked = async (cell: Locator, imgIndex: number): Promise<void> => {
    if (await isItemChecked(cell)) {
      return;
    }
    await cell.locator('img').nth(imgIndex).click();
  };

  // Confirms the currently open selection dialog.
  const confirmSelection = async (): Promise<void> => {
    await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
  };

  await page.getByRole('button', { name: 'Расширенный поиск' }).click();

  const clearLink = page.getByRole('link', { name: 'Очистить' });
  const clearLinkCount = await clearLink.count();
  if (clearLinkCount > 0) {
    await clearLink.first().click();
    await page.waitForTimeout(1000);
  }

  if (options.textPrompt) {
    await fillPrompt('label:has-text("Слова в тексте")', options.textPrompt);
  }

  if (options.titlePrompt) {
    await fillPrompt('label:has-text("Слова в названии")', options.titlePrompt);
  }

  if (options.onlyActive) {
    const statusEntry = page.getByText('Статус', { exact: true });
    if (!(await isShown(statusEntry))) {
      await page.getByText('Дополнительные реквизиты документа').click();
    }
    await statusEntry.click();

    const activeCell = page.getByRole('cell', { name: 'Действующие', exact: true });
    await tickIfUnchecked(activeCell, 1);
    await confirmSelection();
  }

  await page.getByText('Орган / источник').click();

  const firstLocalCell = page.getByRole('cell', { name: LOCAL_FILTERS[0], exact: true });
  if (!(await isShown(firstLocalCell))) {
    await page.getByRole('cell', { name: LOCAL_TITLE, exact: true }).click();
  }

  // Local entries use the image at index 2, global entries the one at index 1.
  for (const label of LOCAL_FILTERS) {
    await tickIfUnchecked(page.getByRole('cell', { name: label, exact: true }).first(), 2);
  }

  for (const label of GLOBAL_FILTERS) {
    await tickIfUnchecked(page.getByRole('cell', { name: label, exact: true }).first(), 1);
  }

  await confirmSelection();
  await page.getByRole('button', { name: 'Найти (Enter)' }).click();
}
// ========== INTERNALS ===========
/**
 * Reports whether the element's `class` attribute contains `checked`.
 *
 * @param item Locator of the element to inspect.
 * @returns `false` when the attribute is missing or empty.
 */
async function isItemChecked(item: Locator) {
  const classes = await item.getAttribute('class');
  if (!classes) {
    return false;
  }
  return classes.includes('checked');
}

29
src/setup.ts Normal file
View File

@ -0,0 +1,29 @@
// src/setup.ts: Playwright test fixtures backed by a persistent user-data directory
import { test as base, BrowserContext } from "@playwright/test";
import { chromium } from "playwright";
import path from "path";
// Absolute path of `.user-data-dir`, located one level above this module's
// directory; it is handed to `chromium.launchPersistentContext` as the browser profile.
export const USER_DATA_DIR = path.resolve(__dirname, "../.user-data-dir");
/**
 * Playwright `test` object whose `context` fixture is a headed persistent
 * Chromium context stored in USER_DATA_DIR, and whose `page` fixture reuses
 * the context's first page when one exists.
 */
export const test = base.extend<{
  context: BrowserContext;
  page: Awaited<ReturnType<BrowserContext["newPage"]>>;
}>({
  context: async ({}, use) => {
    const launchArgs = [
      "--disable-session-crashed-bubble", // Don't show restore bubble
      "--disable-restore-session-state", // Don't try to restore session
      "--disable-background-networking", // Optional: cleaner launch
      "--disable-default-apps", // Optional: skip default Chrome apps
    ];
    const persistentContext = await chromium.launchPersistentContext(USER_DATA_DIR, {
      headless: false,
      args: launchArgs,
    });

    await use(persistentContext);

    // Teardown: runs once the test has finished with the context.
    await persistentContext.close();
  },

  page: async ({ context }, use) => {
    // Reuse the first already-open page if there is one instead of opening a second.
    const [existingPage] = context.pages();
    await use(existingPage ?? (await context.newPage()));
  },
});