F: Implement basic search
This commit is contained in:
parent
9b1c8a79e8
commit
721358aee8
8
.gitignore
vendored
8
.gitignore
vendored
|
@ -1,3 +1,8 @@
|
|||
# output
|
||||
output/
|
||||
auth.json
|
||||
.user-data-dir
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
@ -18,6 +23,7 @@ coverage.xml
|
|||
.pytest_cache/
|
||||
cover/
|
||||
.mypy_cache/
|
||||
test-results/
|
||||
|
||||
# React
|
||||
.DS_*
|
||||
|
@ -32,4 +38,4 @@ bower_components
|
|||
|
||||
# Environments
|
||||
/GitExtensions.settings
|
||||
/playwright-report
|
||||
/playwright-report
|
6
.prettierignore
Normal file
6
.prettierignore
Normal file
|
@ -0,0 +1,6 @@
|
|||
build/
|
||||
node_modules/
|
||||
package-lock.json
|
||||
yarn.lock
|
||||
package.json
|
||||
coverage
|
13
.prettierrc.json
Normal file
13
.prettierrc.json
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"semi": true,
|
||||
"useTabs": false,
|
||||
"printWidth": 120,
|
||||
"tabWidth": 2,
|
||||
"trailingComma": "none",
|
||||
"arrowParens": "avoid",
|
||||
"singleQuote": true,
|
||||
"jsxSingleQuote": true,
|
||||
"quoteProps": "consistent",
|
||||
"bracketSameLine": false,
|
||||
"bracketSpacing": true
|
||||
}
|
10
.vscode/launch.json
vendored
10
.vscode/launch.json
vendored
|
@ -4,6 +4,16 @@
|
|||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
// Setup Auth
|
||||
"name": "Auth",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"cwd": "${workspaceFolder}",
|
||||
"runtimeExecutable": "npx",
|
||||
"runtimeArgs": ["ts-node", "src/save-auth.ts"],
|
||||
"console": "integratedTerminal"
|
||||
},
|
||||
{
|
||||
// Run all
|
||||
"name": "Run all",
|
||||
|
|
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
|
@ -1,3 +1,4 @@
|
|||
{
|
||||
"cSpell.words": ["papaparse", "unparse"]
|
||||
"cSpell.words": ["papaparse", "unparse"],
|
||||
"isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
|
||||
}
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
{
|
||||
"heading": "Example Domain"
|
||||
}
|
|
@ -1,18 +1,19 @@
|
|||
import { defineConfig, devices } from "@playwright/test";
|
||||
import { defineConfig, devices } from '@playwright/test';
|
||||
|
||||
export default defineConfig({
|
||||
testDir: "src",
|
||||
testDir: 'src',
|
||||
retries: 0,
|
||||
reporter: "list",
|
||||
reporter: 'list',
|
||||
timeout: 30 * 1000,
|
||||
fullyParallel: true,
|
||||
projects: [
|
||||
{
|
||||
name: "Desktop Chrome",
|
||||
use: { ...devices["Desktop Chrome"] },
|
||||
},
|
||||
name: 'Desktop Chrome',
|
||||
use: { ...devices['Desktop Chrome'] }
|
||||
}
|
||||
],
|
||||
use: {
|
||||
baseURL: "https://internet.garant.ru/",
|
||||
trace: "on-first-retry",
|
||||
},
|
||||
baseURL: 'https://internet.garant.ru/',
|
||||
storageState: 'auth.json'
|
||||
}
|
||||
});
|
||||
|
|
36
src/save-auth.ts
Normal file
36
src/save-auth.ts
Normal file
|
@ -0,0 +1,36 @@
|
|||
import { chromium } from '@playwright/test';
|
||||
import fs from 'fs/promises';
|
||||
import { USER_DATA_DIR } from './setup';
|
||||
|
||||
/**
 * Account credentials used to sign in.
 *
 * They are read from the environment so that secrets are never committed to
 * the repository. Set GARANT_LOGIN and GARANT_PASSWORD before running this
 * script (for example via the `env` section of the launch configuration).
 */
const IN_LOGIN = process.env.GARANT_LOGIN ?? '';
const IN_PASSWORD = process.env.GARANT_PASSWORD ?? '';

/**
 * Signs in through the account login page in a visible, persistent Chromium
 * profile and writes the resulting storage state to `auth.json`.
 */
(async () => {
  if (!IN_LOGIN || !IN_PASSWORD) {
    throw new Error('GARANT_LOGIN and GARANT_PASSWORD environment variables must be set');
  }

  // Launch persistent context with visible browser
  const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
    headless: false
  });

  try {
    const page = await context.newPage();

    await page.goto('https://account.garant.ru/login');

    // The form is filled in two steps: login first, then password,
    // each one confirmed with the same "Войти" button.
    await page.getByRole('textbox').fill(IN_LOGIN);
    await page.getByRole('button', { name: 'Войти' }).click();
    await page.getByRole('textbox').fill(IN_PASSWORD);
    await page.getByRole('button', { name: 'Войти' }).click();

    // Wait for the "sign out / switch user" button before saving state —
    // presumably it only appears once the login has succeeded.
    await page
      .getByRole('button', {
        name: 'Выйти из текущего сеанса и сменить пользователя'
      })
      .waitFor();

    // Save storage state to file
    await fs.writeFile('auth.json', JSON.stringify(await context.storageState()));

    console.log('✅ Saved login state to auth.json');
  } finally {
    // Always release the browser, even when a step above fails.
    await context.close();
  }
})().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
|
@ -1,48 +1,104 @@
|
|||
import { test, expect } from "@playwright/test";
|
||||
import fs from "fs/promises";
|
||||
import Papa from "papaparse";
|
||||
import { Page } from '@playwright/test';
|
||||
import fs from 'fs/promises';
|
||||
import Papa from 'papaparse';
|
||||
import { test } from './setup';
|
||||
import { executeSearch } from './search';
|
||||
|
||||
const OUTPUT_FILE = "output.csv";
|
||||
/** Input text prompt */
|
||||
const IN_TEXT_PROMPT = 'Министерство труда, занятости и социальной защиты Республики Коми';
|
||||
const IN_TITLE_PROMPT = '';
|
||||
|
||||
test("scrape data from example site", async ({ page }) => {
|
||||
await page.goto("https://example.com");
|
||||
/** Output file naming */
|
||||
const OUT_NAME = 'output';
|
||||
const OUT_SUFFIX = '_1';
|
||||
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
|
||||
|
||||
const heading = await page.locator("h1").textContent();
|
||||
console.log({ heading });
|
||||
interface DocumentInfo {
|
||||
name: string;
|
||||
docID: string;
|
||||
url: string;
|
||||
href: string;
|
||||
}
|
||||
|
||||
expect(heading).toBeTruthy(); // optional check
|
||||
|
||||
const data = await page.evaluate(() => {
|
||||
const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
|
||||
return { heading };
|
||||
/** ===== RUN Auth FIRST before running this! ===== */
|
||||
test('scrape documents list', async ({ page }) => {
|
||||
// await page.goto("https://demo.garant.ru/");
|
||||
await page.goto('/');
|
||||
await executeSearch(page, {
|
||||
textPrompt: IN_TEXT_PROMPT,
|
||||
titlePrompt: IN_TITLE_PROMPT,
|
||||
onlyActive: true
|
||||
});
|
||||
|
||||
// Wait for the list to load
|
||||
await page.waitForSelector("ul.list");
|
||||
|
||||
const documents = await page.$$eval("ul.list > li", (items) =>
|
||||
items.map((li) => {
|
||||
const anchor = li.querySelector("a");
|
||||
const href = anchor?.getAttribute("href") ?? "";
|
||||
|
||||
const nameEl = anchor?.querySelector(".name p");
|
||||
// Clone the node to strip <em> tags
|
||||
const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
|
||||
|
||||
if (cleanText) {
|
||||
// Remove <em> tags
|
||||
cleanText.querySelectorAll("em").forEach((em) => em.remove());
|
||||
}
|
||||
|
||||
const name = cleanText?.textContent?.trim() ?? "";
|
||||
|
||||
return { name, href };
|
||||
})
|
||||
);
|
||||
|
||||
// Convert to CSV
|
||||
const csv = Papa.unparse(documents);
|
||||
await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
|
||||
|
||||
console.log("✅ Saved to documents.csv");
|
||||
const documents = await readData(page);
|
||||
if (documents.length === 0) {
|
||||
console.log('No data found');
|
||||
return;
|
||||
} else {
|
||||
const csv = Papa.unparse(documents);
|
||||
await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
|
||||
console.log('✅ Saved to documents.csv');
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Collects the documents listed in `ul.list` on the current results page and
 * on every following page reachable through the "Следующая" (next) control.
 *
 * Items without both a name and an href are skipped. `<em>` elements are
 * removed from a clone of the name node before its text is read.
 *
 * @param page Page that is already showing search results.
 * @returns All collected documents, in page order.
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    // This callback runs inside the browser, so it must stay self-contained.
    const pageItems = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';

          const nameEl = anchor?.querySelector('.name p');
          // Work on a clone so the live page is not modified.
          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;

          if (cleanText) {
            cleanText.querySelectorAll('em').forEach(em => em.remove());
          }

          const name = cleanText?.textContent?.trim() ?? '';
          return { name: name, href: href };
        })
        .filter(item => !!item.href && !!item.name)
    );

    for (const item of pageItems) {
      documents.push({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      });
    }

    const nextButton = page.getByText('Следующая');

    // locator.getAttribute() waits for the element to exist, so a result set
    // without any pagination control would stall until the timeout and throw.
    // Treat a missing control as "this was the last page".
    if ((await nextButton.count()) === 0) {
      break;
    }

    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so we can detect when the list has been replaced.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();

    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first !== null && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: 5000 }
    );
  }

  return documents;
}
|
||||
|
||||
// ========== INTERNALS ===========
|
||||
/**
 * Extracts the numeric document id from an href containing
 * `/#/document/<id>`.
 *
 * The id may be followed by `/`, `?`, `#` or the end of the string, so both
 * list hrefs (`/#/document/123/paragraph/1`) and the canonical URLs produced
 * by `convertHrefToUrl` (`.../#/document/123`) are recognised.
 *
 * @param href Link target to inspect.
 * @returns The id as a string of digits, or `null` when the href does not match.
 */
function extractDocumentID(href: string): string | null {
  const match = href.match(/\/#\/document\/(\d+)(?=[/?#]|$)/);
  return match?.[1] ?? null;
}

/**
 * Builds the canonical document URL on internet.garant.ru for an href.
 *
 * @param href Link target containing `/#/document/<id>`.
 * @returns The canonical URL, or `null` when no document id can be extracted.
 */
function convertHrefToUrl(href: string): string | null {
  const docId = extractDocumentID(href);
  if (docId === null) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${docId}`;
}
|
||||
|
|
100
src/search.ts
Normal file
100
src/search.ts
Normal file
|
@ -0,0 +1,100 @@
|
|||
import { Locator, Page } from 'playwright';
|
||||
|
||||
/**
 * Exact names of the federal-level rows that executeSearch ticks after
 * opening the 'Орган / источник' (authority / source) selector.
 */
const GLOBAL_FILTERS = [
|
||||
'Федеральные министерства и ведомства',
|
||||
'Правительство России и СССР',
|
||||
'Президент России и СССР',
|
||||
'Органы судебной власти РФ и СССР',
|
||||
'Органы законодательной власти России и СССР'
|
||||
] as const;
|
||||
|
||||
/**
 * Name of the row that executeSearch clicks when the first LOCAL_FILTERS
 * entry is not visible (presumably to expand the regional group).
 */
const LOCAL_TITLE = 'Органы власти Республики Коми';
|
||||
/**
 * Exact names of the Komi Republic rows that executeSearch ticks in the
 * 'Орган / источник' selector. The first entry is also used to detect
 * whether the group is already visible.
 */
const LOCAL_FILTERS = [
|
||||
'Правительство Республики Коми',
|
||||
'Государственный Совет Республики Коми',
|
||||
'Глава Республики Коми',
|
||||
'Президиум Верховного Совета Республики Коми',
|
||||
'Президиум Верховного Совета Коми ССР',
|
||||
'Верховный Совет Коми АССР',
|
||||
'Верховный Совет Коми ССР',
|
||||
'Верховный Совет Республики Коми'
|
||||
] as const;
|
||||
|
||||
/** Criteria that executeSearch enters into the advanced search form. */
export interface SearchOptions {
|
||||
/** Text for the 'Слова в тексте' (words in text) field; skipped when empty or omitted. */
textPrompt?: string;
|
||||
/** Text for the 'Слова в названии' (words in title) field; skipped when empty or omitted. */
titlePrompt?: string;
|
||||
/** When true, the 'Действующие' (in force) status is ticked in the 'Статус' selector. */
onlyActive?: boolean;
|
||||
}
|
||||
|
||||
/**
 * Opens the advanced search form, fills it according to `options`, ticks the
 * configured authority filters and submits the search.
 *
 * @param page Page showing the site's main view.
 * @param options Prompts and status restriction to apply.
 */
export async function executeSearch(page: Page, options: SearchOptions): Promise<void> {
  await page.getByRole('button', { name: 'Расширенный поиск' }).click();

  // Reset any previously entered criteria when the "clear" link is offered.
  const clearButton = page.getByRole('link', { name: 'Очистить' });
  if ((await clearButton.count()) > 0) {
    await clearButton.first().click();
    // Fixed pause after clearing — presumably lets the form re-render.
    await page.waitForTimeout(1000);
  }

  if (options.textPrompt) {
    await fillPromptField(page, 'Слова в тексте', options.textPrompt);
  }

  if (options.titlePrompt) {
    await fillPromptField(page, 'Слова в названии', options.titlePrompt);
  }

  if (options.onlyActive) {
    await restrictToActiveDocuments(page);
  }

  await selectSourceFilters(page);

  await page.getByRole('button', { name: 'Найти (Enter)' }).click();
}

/** Fills the textarea found two levels above the label with the given text. */
async function fillPromptField(page: Page, labelText: string, value: string): Promise<void> {
  await page
    .locator(`label:has-text("${labelText}")`)
    .locator('..')
    .locator('..')
    .locator('textarea')
    .fill(value);
}

/** Opens the 'Статус' selector and makes sure 'Действующие' is ticked. */
async function restrictToActiveDocuments(page: Page): Promise<void> {
  const status = page.getByText('Статус', { exact: true });
  const isStatusVisible = await status.isVisible().catch(() => false);
  if (!isStatusVisible) {
    // The status entry sits under the additional-requisites section.
    await page.getByText('Дополнительные реквизиты документа').click();
  }
  await status.click();

  const includeActive = page.getByRole('cell', { name: 'Действующие', exact: true });
  if (!(await isItemChecked(includeActive))) {
    await includeActive.locator('img').nth(1).click();
  }
  await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
}

/** Opens the 'Орган / источник' selector and ticks all local and global filters. */
async function selectSourceFilters(page: Page): Promise<void> {
  await page.getByText('Орган / источник').click();

  // Expand the regional group when its first entry is not visible yet.
  const localItem = page.getByRole('cell', { name: LOCAL_FILTERS[0], exact: true });
  const isLocalExpanded = await localItem.isVisible().catch(() => false);
  if (!isLocalExpanded) {
    await page.getByRole('cell', { name: LOCAL_TITLE, exact: true }).click();
  }

  // Local rows are toggled through their third image, global rows through
  // their second — presumably because nested rows carry an extra icon.
  await tickCells(page, LOCAL_FILTERS, 2);
  await tickCells(page, GLOBAL_FILTERS, 1);

  await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
}

/** Clicks the image at `imageIndex` in each named cell that is not checked yet. */
async function tickCells(page: Page, names: readonly string[], imageIndex: number): Promise<void> {
  for (const name of names) {
    const listItem = page.getByRole('cell', { name, exact: true }).first();
    if (!(await isItemChecked(listItem))) {
      await listItem.locator('img').nth(imageIndex).click();
    }
  }
}
|
||||
|
||||
// ========== INTERNALS ===========
|
||||
/**
 * Reports whether the element's `class` attribute contains the substring
 * 'checked'. A missing or empty attribute counts as not checked.
 */
async function isItemChecked(item: Locator) {
  const cssClasses = await item.getAttribute('class');
  if (!cssClasses) {
    return false;
  }
  return cssClasses.includes('checked');
}
|
29
src/setup.ts
Normal file
29
src/setup.ts
Normal file
|
@ -0,0 +1,29 @@
|
|||
// src/setup.ts — Playwright test fixtures backed by a persistent Chromium profile
|
||||
import { test as base, BrowserContext } from "@playwright/test";
|
||||
import { chromium } from "playwright";
|
||||
import path from "path";
|
||||
|
||||
export const USER_DATA_DIR = path.resolve(__dirname, "../.user-data-dir");
|
||||
|
||||
/**
 * Playwright `test` object whose `context` fixture is a headed persistent
 * Chromium context stored in USER_DATA_DIR, and whose `page` fixture reuses
 * the context's first page when one exists.
 */
export const test = base.extend<{
  context: BrowserContext;
  page: Awaited<ReturnType<BrowserContext["newPage"]>>;
}>({
  context: async ({}, use) => {
    const launchFlags = [
      "--disable-session-crashed-bubble", // Don't show restore bubble
      "--disable-restore-session-state", // Don't try to restore session
      "--disable-background-networking", // Optional: cleaner launch
      "--disable-default-apps", // Optional: skip default Chrome apps
    ];
    const persistentContext = await chromium.launchPersistentContext(USER_DATA_DIR, {
      headless: false,
      args: launchFlags,
    });

    await use(persistentContext);

    // Teardown: release the browser once the test is done with it.
    await persistentContext.close();
  },
  page: async ({ context }, use) => {
    // Reuse the page the context already has; open a new one only when it has none.
    const [existingPage] = context.pages();
    await use(existingPage ?? (await context.newPage()));
  },
});
|
Loading…
Reference in New Issue
Block a user