Compare commits

..

No commits in common. "4927837ac661ab5eeaed353861a236d8a6a7e4ae" and "9b1c8a79e85e45426e58ced07e147a66d29b1fb1" have entirely different histories.

11 changed files with 54 additions and 319 deletions

8
.gitignore vendored
View File

@ -1,8 +1,3 @@
# output
output/
auth.json
.user-data-dir
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
@ -23,7 +18,6 @@ coverage.xml
.pytest_cache/ .pytest_cache/
cover/ cover/
.mypy_cache/ .mypy_cache/
test-results/
# React # React
.DS_* .DS_*
@ -38,4 +32,4 @@ bower_components
# Environments # Environments
/GitExtensions.settings /GitExtensions.settings
/playwright-report /playwright-report

View File

@ -1,6 +0,0 @@
build/
node_modules/
package-lock.json
yarn.lock
package.json
coverage

View File

@ -1,13 +0,0 @@
{
"semi": true,
"useTabs": false,
"printWidth": 120,
"tabWidth": 2,
"trailingComma": "none",
"arrowParens": "avoid",
"singleQuote": true,
"jsxSingleQuote": true,
"quoteProps": "consistent",
"bracketSameLine": false,
"bracketSpacing": true
}

10
.vscode/launch.json vendored
View File

@ -4,16 +4,6 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0", "version": "0.2.0",
"configurations": [ "configurations": [
{
// Setup Auth
"name": "Auth",
"type": "node",
"request": "launch",
"cwd": "${workspaceFolder}",
"runtimeExecutable": "npx",
"runtimeArgs": ["ts-node", "src/save-auth.ts"],
"console": "integratedTerminal"
},
{ {
// Run all // Run all
"name": "Run all", "name": "Run all",

View File

@ -1,4 +1,3 @@
{ {
"cSpell.words": ["papaparse", "unparse"], "cSpell.words": ["papaparse", "unparse"]
"isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
} }

3
output.json Normal file
View File

@ -0,0 +1,3 @@
{
"heading": "Example Domain"
}

View File

@ -1,19 +1,18 @@
import { defineConfig, devices } from '@playwright/test'; import { defineConfig, devices } from "@playwright/test";
export default defineConfig({ export default defineConfig({
testDir: 'src', testDir: "src",
retries: 0, retries: 0,
reporter: 'list', reporter: "list",
timeout: 30 * 1000,
fullyParallel: true, fullyParallel: true,
projects: [ projects: [
{ {
name: 'Desktop Chrome', name: "Desktop Chrome",
use: { ...devices['Desktop Chrome'] } use: { ...devices["Desktop Chrome"] },
} },
], ],
use: { use: {
baseURL: 'https://internet.garant.ru/', baseURL: "https://internet.garant.ru/",
storageState: 'auth.json' trace: "on-first-retry",
} },
}); });

View File

@ -1,38 +0,0 @@
import { chromium } from '@playwright/test';
import fs from 'fs/promises';
import { USER_DATA_DIR } from './setup';
/**
 * Account credentials, taken from the environment so that no secret lives in
 * the repository. Any credentials that were ever committed in this file
 * should be treated as leaked and rotated.
 */
const IN_LOGIN = process.env.GARANT_LOGIN;
const IN_PASSWORD = process.env.GARANT_PASSWORD;

/**
 * Logs in at account.garant.ru in a visible, persistent browser context and
 * stores the resulting storage state in auth.json.
 */
(async () => {
  // Fail fast, before a browser is launched, when the credentials are missing.
  if (!IN_LOGIN || !IN_PASSWORD) {
    throw new Error('Set the GARANT_LOGIN and GARANT_PASSWORD environment variables before running the auth script');
  }

  // Launch persistent context with visible browser
  const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
    headless: false
  });

  try {
    // Reuse the tab a persistent context may already have open.
    const page = context.pages().length > 0 ? context.pages()[0] : await context.newPage();
    await page.goto('https://account.garant.ru/login');

    // The form asks for the login first and for the password on the next step.
    await page.getByRole('textbox').fill(IN_LOGIN);
    await page.getByRole('button', { name: 'Войти' }).click();
    await page.getByRole('textbox').fill(IN_PASSWORD);
    await page.getByRole('button', { name: 'Войти' }).click();

    // Wait for the "sign out / switch user" button before saving the state.
    await page
      .getByRole('button', {
        name: 'Выйти из текущего сеанса и сменить пользователя'
      })
      .waitFor();

    await fs.writeFile('auth.json', JSON.stringify(await page.context().storageState()));
    console.log('✅ Saved login state to auth.json');
  } finally {
    // Always release the browser, even when the login flow throws.
    await context.close();
  }
})().catch((error: unknown) => {
  // Report the failure and signal it through the exit code.
  console.error(error);
  process.exitCode = 1;
});

View File

@ -1,113 +1,48 @@
import { Page } from '@playwright/test'; import { test, expect } from "@playwright/test";
import fs from 'fs/promises'; import fs from "fs/promises";
import Papa from 'papaparse'; import Papa from "papaparse";
import { test } from './setup';
import { executeSearch } from './search';
/** Input text prompt */ const OUTPUT_FILE = "output.csv";
const IN_TEXT_PROMPT = 'Представительство Республики Коми в Северо-Западном регионе Российской Федерации';
const IN_TITLE_PROMPT = '';
/** Output file naming */ test("scrape data from example site", async ({ page }) => {
const OUT_NAME = 'output'; await page.goto("https://example.com");
const OUT_SUFFIX = '_22';
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
interface DocumentInfo { const heading = await page.locator("h1").textContent();
name: string; console.log({ heading });
docID: string;
url: string;
href: string;
}
/** ===== RUN Auth FIRST before running this! ===== */ expect(heading).toBeTruthy(); // optional check
test('scrape documents list', async ({ page }) => {
// await page.goto("https://demo.garant.ru/"); const data = await page.evaluate(() => {
await page.goto('/'); const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
await executeSearch(page, { return { heading };
textPrompt: IN_TEXT_PROMPT,
titlePrompt: IN_TITLE_PROMPT,
onlyActive: true
}); });
const documents = await readData(page); // Wait for the list to load
await page.waitForSelector("ul.list");
if (documents.length === 0) { const documents = await page.$$eval("ul.list > li", (items) =>
console.log('No data found'); items.map((li) => {
return; const anchor = li.querySelector("a");
} else { const href = anchor?.getAttribute("href") ?? "";
const csv = Papa.unparse(documents);
await fs.writeFile(OUT_FILENAME, csv, 'utf-8'); const nameEl = anchor?.querySelector(".name p");
console.log('✅ Saved to documents.csv'); // Clone the node to strip <em> tags
} const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
if (cleanText) {
// Remove <em> tags
cleanText.querySelectorAll("em").forEach((em) => em.remove());
}
const name = cleanText?.textContent?.trim() ?? "";
return { name, href };
})
);
// Convert to CSV
const csv = Papa.unparse(documents);
await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
console.log("✅ Saved to documents.csv");
}); });
/**
 * Reads the document entries from the result list, following the
 * "Следующая" (next page) control until it is disabled.
 *
 * For every `ul.list > li` item the anchor's `href` and the text of its
 * `.name p` element are read; `<em>` wrappers inside the name are unwrapped
 * so that their text is kept. Items without a link or without a name are
 * skipped.
 *
 * @param page Page that is already showing the result list.
 * @returns One entry per listed document, in page order.
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    // NOTE: this callback runs inside the browser, so it must stay self-contained.
    const newDocuments = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';
          const nameEl = anchor?.querySelector('.name p');
          // Work on a clone so the live page is not modified.
          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
          if (cleanText) {
            // Replace every <em> by its own children, keeping the text.
            cleanText.querySelectorAll('em').forEach(em => {
              const parent = em.parentNode;
              if (parent) {
                while (em.firstChild) {
                  parent.insertBefore(em.firstChild, em);
                }
                parent.removeChild(em);
              }
            });
          }
          const name = cleanText?.textContent?.trim() ?? '';
          return { name, href };
        })
        // Filter on the raw href: once the site prefix is prepended the
        // string is never empty, so link-less items would slip through.
        .filter(item => item.href !== '' && item.name !== '')
        .map(item => ({ name: item.name, href: 'https://internet.garant.ru' + item.href }))
    );

    documents.push(
      ...newDocuments.map(item => ({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      }))
    );

    // getByText returns a locator synchronously; only the queries on it are awaited.
    const nextButton = page.getByText('Следующая');
    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so the page change can be detected after the click.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();
    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: 5000 }
    );
  }

  return documents;
}
// ========== INTERNALS ===========
/**
 * Extracts the numeric document ID from a link containing `/#/document/<id>`.
 *
 * The ID must be followed by `/` or by the end of the string, so both list
 * hrefs (`/#/document/123/...`) and canonical URLs without a trailing slash
 * (`.../#/document/123`) are recognised.
 *
 * @param href Relative or absolute document link.
 * @returns The ID as a string of digits, or `null` when the link has none.
 */
function extractDocumentID(href: string): string | null {
  const match = /\/#\/document\/(\d+)(?:\/|$)/.exec(href);
  return match?.[1] ?? null;
}
/**
 * Builds the canonical document URL for a link.
 *
 * @param href Link to take the document ID from.
 * @returns `https://internet.garant.ru/#/document/<id>`, or `null` when no ID
 *          can be extracted from `href`.
 */
function convertHrefToUrl(href: string): string | null {
  const id = extractDocumentID(href);
  if (!id) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${id}`;
}

View File

@ -1,99 +0,0 @@
import { Locator, Page } from 'playwright';
// Cell names that executeSearch makes sure are checked after opening
// the "Орган / источник" selector.
const GLOBAL_FILTERS = [
'Федеральные министерства и ведомства',
'Правительство России и СССР',
'Президент России и СССР',
'Органы законодательной власти России и СССР'
] as const;
// Cell that executeSearch clicks when the first LOCAL_FILTERS entry is not
// visible yet.
const LOCAL_TITLE = 'Органы власти Республики Коми';
// Cell names that executeSearch makes sure are checked; the first entry is
// also used to test whether these cells are currently visible.
const LOCAL_FILTERS = [
'Правительство Республики Коми',
'Государственный Совет Республики Коми',
'Глава Республики Коми',
'Президиум Верховного Совета Республики Коми',
'Президиум Верховного Совета Коми ССР',
'Верховный Совет Коми АССР',
'Верховный Совет Коми ССР',
'Верховный Совет Республики Коми'
] as const;
/** Options accepted by executeSearch; an omitted or empty field skips its step. */
export interface SearchOptions {
// Text typed into the textarea next to the "Слова в тексте" label.
textPrompt?: string;
// Text typed into the textarea next to the "Слова в названии" label.
titlePrompt?: string;
// When true, the "Действующие" entry of the "Статус" selector is checked.
onlyActive?: boolean;
}
/**
 * Fills in the advanced-search form and submits it.
 *
 * Steps, in order: open the advanced search, clear a previous query when a
 * clear link is present, fill the text/title prompts, optionally restrict
 * the status to active documents, check the configured sources and start
 * the search.
 *
 * @param page Page on which the advanced-search button is available.
 * @param options Prompts and flags that control which form parts are filled.
 */
export async function executeSearch(page: Page, options: SearchOptions) {
  // Small local helpers for the locator patterns that repeat below.
  const cellNamed = (name: string) => page.getByRole('cell', { name, exact: true });
  const confirmSelection = () => page.getByRole('button', { name: 'Выбрать', exact: true }).click();
  const fillTextareaNear = (labelSelector: string, value: string) =>
    page.locator(labelSelector).locator('..').locator('..').locator('textarea').fill(value);
  const ensureChecked = async (item: Locator, imgIndex: number) => {
    if (await isItemChecked(item)) {
      return;
    }
    await item.locator('img').nth(imgIndex).click();
  };

  await page.getByRole('button', { name: 'Расширенный поиск' }).click();

  // Reset a previously entered query, if the form offers the clear link.
  const clearLink = page.getByRole('link', { name: 'Очистить' });
  const clearLinkCount = await clearLink.count();
  if (clearLinkCount > 0) {
    await clearLink.first().click();
    await page.waitForTimeout(1000);
  }

  if (options.textPrompt) {
    await fillTextareaNear('label:has-text("Слова в тексте")', options.textPrompt);
  }

  if (options.titlePrompt) {
    await fillTextareaNear('label:has-text("Слова в названии")', options.titlePrompt);
  }

  if (options.onlyActive) {
    const statusField = page.getByText('Статус', { exact: true });
    // A failed visibility probe is treated the same as "not visible".
    const statusShown = await statusField.isVisible().catch(() => false);
    if (!statusShown) {
      await page.getByText('Дополнительные реквизиты документа').click();
    }
    await statusField.click();

    await ensureChecked(cellNamed('Действующие'), 1);
    await confirmSelection();
  }

  await page.getByText('Орган / источник').click();

  // Click the group title when its first entry is not visible yet.
  const firstLocalCell = cellNamed(LOCAL_FILTERS[0]);
  const localGroupOpen = await firstLocalCell.isVisible().catch(() => false);
  if (!localGroupOpen) {
    await cellNamed(LOCAL_TITLE).click();
  }

  // Local entries are toggled through their third image, global ones through the second.
  for (const localName of LOCAL_FILTERS) {
    await ensureChecked(cellNamed(localName).first(), 2);
  }
  for (const globalName of GLOBAL_FILTERS) {
    await ensureChecked(cellNamed(globalName).first(), 1);
  }

  await confirmSelection();
  await page.getByRole('button', { name: 'Найти (Enter)' }).click();
}
// ========== INTERNALS ===========
/**
 * Reports whether the element's `class` attribute contains "checked".
 *
 * @param item Locator of the element to inspect.
 * @returns `false` when the attribute is missing or empty.
 */
async function isItemChecked(item: Locator) {
  const classNames = await item.getAttribute('class');
  return classNames?.includes('checked') ?? false;
}

View File

@ -1,29 +0,0 @@
// tests/scraper-with-user-data.spec.ts
import { test as base, BrowserContext } from "@playwright/test";
import { chromium } from "playwright";
import path from "path";
/** Absolute path of the browser profile directory, one level above this file's directory. */
export const USER_DATA_DIR = path.resolve(__dirname, "../.user-data-dir");

/**
 * `test` with the `context` and `page` fixtures overridden so that tests run
 * in a headed, persistent Chromium context stored in USER_DATA_DIR.
 */
export const test = base.extend<{
  context: BrowserContext;
  page: Awaited<ReturnType<BrowserContext["newPage"]>>;
}>({
  // The empty destructuring pattern is deliberate: no other fixture is needed here.
  context: async ({}, use) => {
    const persistentContext = await chromium.launchPersistentContext(USER_DATA_DIR, {
      headless: false,
      args: [
        "--disable-session-crashed-bubble", // no "restore pages" bubble
        "--disable-restore-session-state", // no attempt to restore the session
        "--disable-background-networking", // optional: cleaner launch
        "--disable-default-apps", // optional: skip default Chrome apps
      ],
    });

    await use(persistentContext);
    // Teardown: runs once the fixture is no longer in use.
    await persistentContext.close();
  },

  page: async ({ context }, use) => {
    // Prefer a tab the context already has; open a new one only if there is none.
    const [existingPage] = context.pages();
    await use(existingPage ?? (await context.newPage()));
  },
});