Compare commits

..

2 Commits

Author SHA1 Message Date
Ivan
4927837ac6 F: Improve features 2025-05-23 20:37:41 +03:00
Ivan
721358aee8 F: Implement basic search 2025-05-22 19:50:19 +03:00
11 changed files with 319 additions and 54 deletions

6
.gitignore vendored
View File

@ -1,3 +1,8 @@
# output
output/
auth.json
.user-data-dir
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -18,6 +23,7 @@ coverage.xml
.pytest_cache/
cover/
.mypy_cache/
test-results/
# React
.DS_*

6
.prettierignore Normal file
View File

@ -0,0 +1,6 @@
build/
node_modules/
package-lock.json
yarn.lock
package.json
coverage

13
.prettierrc.json Normal file
View File

@ -0,0 +1,13 @@
{
"semi": true,
"useTabs": false,
"printWidth": 120,
"tabWidth": 2,
"trailingComma": "none",
"arrowParens": "avoid",
"singleQuote": true,
"jsxSingleQuote": true,
"quoteProps": "consistent",
"bracketSameLine": false,
"bracketSpacing": true
}

10
.vscode/launch.json vendored
View File

@ -4,6 +4,16 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Setup Auth
"name": "Auth",
"type": "node",
"request": "launch",
"cwd": "${workspaceFolder}",
"runtimeExecutable": "npx",
"runtimeArgs": ["ts-node", "src/save-auth.ts"],
"console": "integratedTerminal"
},
{
// Run all
"name": "Run all",

View File

@ -1,3 +1,4 @@
{
"cSpell.words": ["papaparse", "unparse"]
"cSpell.words": ["papaparse", "unparse"],
"isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
}

View File

@ -1,3 +0,0 @@
{
"heading": "Example Domain"
}

View File

@ -1,18 +1,19 @@
import { defineConfig, devices } from "@playwright/test";
import { defineConfig, devices } from '@playwright/test';
export default defineConfig({
testDir: "src",
testDir: 'src',
retries: 0,
reporter: "list",
reporter: 'list',
timeout: 30 * 1000,
fullyParallel: true,
projects: [
{
name: "Desktop Chrome",
use: { ...devices["Desktop Chrome"] },
},
name: 'Desktop Chrome',
use: { ...devices['Desktop Chrome'] }
}
],
use: {
baseURL: "https://internet.garant.ru/",
trace: "on-first-retry",
},
baseURL: 'https://internet.garant.ru/',
storageState: 'auth.json'
}
});

38
src/save-auth.ts Normal file
View File

@ -0,0 +1,38 @@
import { chromium } from '@playwright/test';
import fs from 'fs/promises';
import { USER_DATA_DIR } from './setup';
/**
 * Credentials for https://account.garant.ru/login.
 *
 * They are read from the environment (GARANT_LOGIN / GARANT_PASSWORD) so that
 * no secret is ever committed to the repository.
 */
const IN_LOGIN = process.env.GARANT_LOGIN ?? '';
const IN_PASSWORD = process.env.GARANT_PASSWORD ?? '';

/**
 * Logs in through a visible persistent browser context and stores the
 * resulting storage state in `auth.json`.
 */
(async () => {
  // Fail fast with a clear message instead of submitting an empty login form.
  if (!IN_LOGIN || !IN_PASSWORD) {
    throw new Error('GARANT_LOGIN and GARANT_PASSWORD environment variables must be set');
  }

  // Launch persistent context with visible browser
  const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
    headless: false
  });

  try {
    // Reuse the tab the persistent context opens by default, if there is one.
    const page = context.pages()[0] ?? (await context.newPage());
    await page.goto('https://account.garant.ru/login');

    // The login form is two-step: the same textbox role is used first for the
    // login and then for the password.
    await page.getByRole('textbox').fill(IN_LOGIN);
    await page.getByRole('button', { name: 'Войти' }).click();
    await page.getByRole('textbox').fill(IN_PASSWORD);
    await page.getByRole('button', { name: 'Войти' }).click();

    // Wait for the "log out / switch user" button before saving the state.
    await page
      .getByRole('button', {
        name: 'Выйти из текущего сеанса и сменить пользователя'
      })
      .waitFor();

    await fs.writeFile('auth.json', JSON.stringify(await page.context().storageState()));
    console.log('✅ Saved login state to auth.json');
  } finally {
    await context.close();
  }
})().catch(error => {
  // Report the failure explicitly and signal it through the exit code.
  console.error(error);
  process.exitCode = 1;
});

View File

@ -1,48 +1,113 @@
import { test, expect } from "@playwright/test";
import fs from "fs/promises";
import Papa from "papaparse";
import { Page } from '@playwright/test';
import fs from 'fs/promises';
import Papa from 'papaparse';
import { test } from './setup';
import { executeSearch } from './search';
const OUTPUT_FILE = "output.csv";
/** Input text prompt */
const IN_TEXT_PROMPT = 'Представительство Республики Коми в Северо-Западном регионе Российской Федерации';
const IN_TITLE_PROMPT = '';
test("scrape data from example site", async ({ page }) => {
await page.goto("https://example.com");
/** Output file naming */
const OUT_NAME = 'output';
const OUT_SUFFIX = '_22';
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
const heading = await page.locator("h1").textContent();
console.log({ heading });
interface DocumentInfo {
name: string;
docID: string;
url: string;
href: string;
}
expect(heading).toBeTruthy(); // optional check
const data = await page.evaluate(() => {
const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
return { heading };
/** ===== RUN Auth FIRST before running this! ===== */
test('scrape documents list', async ({ page }) => {
// await page.goto("https://demo.garant.ru/");
await page.goto('/');
await executeSearch(page, {
textPrompt: IN_TEXT_PROMPT,
titlePrompt: IN_TITLE_PROMPT,
onlyActive: true
});
// Wait for the list to load
await page.waitForSelector("ul.list");
const documents = await readData(page);
const documents = await page.$$eval("ul.list > li", (items) =>
items.map((li) => {
const anchor = li.querySelector("a");
const href = anchor?.getAttribute("href") ?? "";
if (documents.length === 0) {
console.log('No data found');
return;
} else {
const csv = Papa.unparse(documents);
await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
console.log('✅ Saved to documents.csv');
}
});
const nameEl = anchor?.querySelector(".name p");
// Clone the node to strip <em> tags
async function readData(page: Page): Promise<DocumentInfo[]> {
const documents: DocumentInfo[] = [];
while (true) {
await page.waitForSelector('ul.list');
const newDocuments = await page.$$eval('ul.list > li', items =>
items
.map(li => {
const anchor = li.querySelector('a');
const href = anchor?.getAttribute('href') ?? '';
const nameEl = anchor?.querySelector('.name p');
const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
if (cleanText) {
// Remove <em> tags
cleanText.querySelectorAll("em").forEach((em) => em.remove());
cleanText.querySelectorAll('em').forEach(em => {
const parent = em.parentNode;
if (parent) {
while (em.firstChild) {
parent.insertBefore(em.firstChild, em);
}
parent.removeChild(em);
}
});
}
const name = cleanText?.textContent?.trim() ?? "";
return { name, href };
const name = cleanText?.textContent?.trim() ?? '';
return { name: name, href: 'https://internet.garant.ru' + href };
})
.filter(item => !!item.href && !!item.name)
);
documents.push(
...newDocuments.map(item => ({
...item,
docID: extractDocumentID(item.href) ?? '',
url: convertHrefToUrl(item.href) ?? ''
}))
);
const nextButton = await page.getByText('Следующая');
const classAttr = await nextButton.getAttribute('class');
if (!classAttr || classAttr.includes('disabled')) {
break;
}
// Convert to CSV
const csv = Papa.unparse(documents);
await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
await nextButton.click();
console.log("✅ Saved to documents.csv");
});
await page.waitForFunction(
prevHref => {
const first = document.querySelector('ul.list > li a');
return first && first.getAttribute('href') !== prevHref;
},
firstHref,
{ timeout: 5000 }
);
}
return documents;
}
// ========== INTERNALS ===========
/**
 * Extracts the numeric document ID from a link containing `/#/document/<id>`.
 *
 * The ID may be followed by `/`, by `?`, or by the end of the string, so the
 * canonical URLs built by `convertHrefToUrl` (which have no trailing slash)
 * are parsed as well.
 *
 * @param href - Link or URL to inspect.
 * @returns The ID as a string of digits, or `null` when `href` holds no document ID.
 */
function extractDocumentID(href: string): string | null {
  // The lookahead keeps the "digits only" rule: `/#/document/123abc/` is still rejected.
  const match = /\/#\/document\/(\d+)(?=[/?]|$)/.exec(href);
  return match?.[1] ?? null;
}
/**
 * Builds the canonical document URL for a link.
 *
 * @param href - Link to convert.
 * @returns `https://internet.garant.ru/#/document/<id>`, or `null` when no
 *   document ID can be extracted from `href`.
 */
function convertHrefToUrl(href: string): string | null {
  const id = extractDocumentID(href);
  if (!id) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${id}`;
}

99
src/search.ts Normal file
View File

@ -0,0 +1,99 @@
import { Locator, Page } from 'playwright';
/**
 * Names of the table cells that `executeSearch` ticks (when not already marked
 * as checked) after opening the 'Орган / источник' list.
 */
const GLOBAL_FILTERS = [
'Федеральные министерства и ведомства',
'Правительство России и СССР',
'Президент России и СССР',
'Органы законодательной власти России и СССР'
] as const;
/**
 * Name of the cell that `executeSearch` clicks when the first entry of
 * `LOCAL_FILTERS` is not visible (presumably the collapsed parent group — confirm).
 */
const LOCAL_TITLE = 'Органы власти Республики Коми';
/**
 * Names of the cells that `executeSearch` ticks in the same list as `GLOBAL_FILTERS`.
 * The first entry is also used to detect whether these cells are currently visible.
 */
const LOCAL_FILTERS = [
'Правительство Республики Коми',
'Государственный Совет Республики Коми',
'Глава Республики Коми',
'Президиум Верховного Совета Республики Коми',
'Президиум Верховного Совета Коми ССР',
'Верховный Совет Коми АССР',
'Верховный Совет Коми ССР',
'Верховный Совет Республики Коми'
] as const;
/** Options accepted by `executeSearch`. Every field is optional. */
export interface SearchOptions {
/** Text typed into the textarea next to the 'Слова в тексте' label; skipped when empty or missing. */
textPrompt?: string;
/** Text typed into the textarea next to the 'Слова в названии' label; skipped when empty or missing. */
titlePrompt?: string;
/** When truthy, the 'Действующие' entry of the 'Статус' list is ticked before searching. */
onlyActive?: boolean;
}
/**
 * Fills in the advanced search form on the current page and submits it.
 *
 * Steps, in order: open the advanced search, clear a previous query if a clear
 * link exists, fill the optional text/title prompts, optionally tick the
 * 'Действующие' status, tick every entry of `LOCAL_FILTERS` and `GLOBAL_FILTERS`
 * in the 'Орган / источник' list, then click the search button.
 *
 * @param page - Playwright page on which the 'Расширенный поиск' button is available.
 * @param options - Prompts and flags controlling which form fields are filled.
 * @returns A promise that resolves once the search button has been clicked;
 *   it does not wait for results to load.
 */
export async function executeSearch(page: Page, options: SearchOptions) {
await page.getByRole('button', { name: 'Расширенный поиск' }).click();
// Reset a previously entered query, if the form offers a clear link.
const clearButton = page.getByRole('link', { name: 'Очистить' });
if ((await clearButton.count()) > 0) {
await clearButton.first().click();
// Fixed 1 s pause — presumably gives the form time to reset; TODO confirm whether a condition-based wait is possible.
await page.waitForTimeout(1000);
}
// The textarea is looked up two levels above its label.
if (!!options.textPrompt) {
await page
.locator('label:has-text("Слова в тексте")')
.locator('..')
.locator('..')
.locator('textarea')
.fill(options.textPrompt);
}
if (!!options.titlePrompt) {
await page
.locator('label:has-text("Слова в названии")')
.locator('..')
.locator('..')
.locator('textarea')
.fill(options.titlePrompt);
}
if (options.onlyActive) {
const status = page.getByText('Статус', { exact: true });
// A failed visibility check is treated the same as "not visible".
const isStatusVisible = await status.isVisible().catch(() => false);
if (!isStatusVisible) {
// Presumably expands the section that contains the status field — confirm.
await page.getByText('Дополнительные реквизиты документа').click();
}
await status.click();
const includeActive = page.getByRole('cell', { name: 'Действующие', exact: true });
// Only click when the cell's class does not already contain 'checked'.
if (!(await isItemChecked(includeActive))) {
await includeActive.locator('img').nth(1).click();
}
await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
}
await page.getByText('Орган / источник').click();
// Visibility of the first local filter decides whether the LOCAL_TITLE cell must be clicked first.
const localItem = page.getByRole('cell', { name: LOCAL_FILTERS[0], exact: true });
const isLocalExpanded = await localItem.isVisible().catch(() => false);
if (!isLocalExpanded) {
await page.getByRole('cell', { name: LOCAL_TITLE, exact: true }).click();
}
// NOTE(review): local entries click the third <img> (nth(2)) while global entries click the second (nth(1));
// presumably the nested rows carry one extra icon — confirm against the page markup.
for (const filter of LOCAL_FILTERS) {
const listItem = page.getByRole('cell', { name: filter, exact: true }).first();
if (!(await isItemChecked(listItem))) {
await listItem.locator('img').nth(2).click();
}
}
for (const filter of GLOBAL_FILTERS) {
const listItem = page.getByRole('cell', { name: filter, exact: true }).first();
if (!(await isItemChecked(listItem))) {
await listItem.locator('img').nth(1).click();
}
}
// Confirm the selection, then submit the search.
await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
await page.getByRole('button', { name: 'Найти (Enter)' }).click();
}
// ========== INTERNALS ===========
/**
 * Reports whether the element's `class` attribute contains the substring `checked`.
 *
 * @param item - Locator of the element to inspect.
 * @returns `false` when the attribute is missing or empty, otherwise the substring test result.
 */
async function isItemChecked(item: Locator) {
  const className = await item.getAttribute('class');
  if (!className) {
    return false;
  }
  return className.includes('checked');
}

29
src/setup.ts Normal file
View File

@ -0,0 +1,29 @@
// src/setup.ts — Playwright fixtures that run tests in a persistent browser profile
import { test as base, BrowserContext } from "@playwright/test";
import { chromium } from "playwright";
import path from "path";
/**
 * Absolute path of the `.user-data-dir` directory located one level above this
 * file's directory. It is passed to `chromium.launchPersistentContext` as the
 * browser profile directory.
 */
export const USER_DATA_DIR = path.resolve(__dirname, "../.user-data-dir");
/**
 * Playwright `test` object with `context` and `page` fixtures overridden.
 *
 * - `context` launches a headed persistent Chromium context on `USER_DATA_DIR`
 *   and closes it after the test has used it.
 * - `page` hands out the context's first existing page, or opens a new one
 *   when the context has none.
 */
export const test = base.extend<{
  context: BrowserContext;
  page: Awaited<ReturnType<BrowserContext["newPage"]>>;
}>({
  context: async ({}, use) => {
    const launchArgs = [
      "--disable-session-crashed-bubble", // Don't show restore bubble
      "--disable-restore-session-state", // Don't try to restore session
      "--disable-background-networking", // Optional: cleaner launch
      "--disable-default-apps", // Optional: skip default Chrome apps
    ];
    const persistentContext = await chromium.launchPersistentContext(USER_DATA_DIR, {
      headless: false,
      args: launchArgs,
    });
    await use(persistentContext);
    await persistentContext.close();
  },
  page: async ({ context }, use) => {
    // Prefer the tab that already exists in the persistent context.
    const [existingPage] = context.pages();
    await use(existingPage ?? (await context.newPage()));
  },
});