F: Implement basic search
This commit is contained in:
parent
9b1c8a79e8
commit
721358aee8
6
.gitignore
vendored
6
.gitignore
vendored
|
@ -1,3 +1,8 @@
|
||||||
|
# output
|
||||||
|
output/
|
||||||
|
auth.json
|
||||||
|
.user-data-dir
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
@ -18,6 +23,7 @@ coverage.xml
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
cover/
|
cover/
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
test-results/
|
||||||
|
|
||||||
# React
|
# React
|
||||||
.DS_*
|
.DS_*
|
||||||
|
|
6
.prettierignore
Normal file
6
.prettierignore
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
build/
|
||||||
|
node_modules/
|
||||||
|
package-lock.json
|
||||||
|
yarn.lock
|
||||||
|
package.json
|
||||||
|
coverage
|
13
.prettierrc.json
Normal file
13
.prettierrc.json
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"semi": true,
|
||||||
|
"useTabs": false,
|
||||||
|
"printWidth": 120,
|
||||||
|
"tabWidth": 2,
|
||||||
|
"trailingComma": "none",
|
||||||
|
"arrowParens": "avoid",
|
||||||
|
"singleQuote": true,
|
||||||
|
"jsxSingleQuote": true,
|
||||||
|
"quoteProps": "consistent",
|
||||||
|
"bracketSameLine": false,
|
||||||
|
"bracketSpacing": true
|
||||||
|
}
|
10
.vscode/launch.json
vendored
10
.vscode/launch.json
vendored
|
@ -4,6 +4,16 @@
|
||||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"configurations": [
|
"configurations": [
|
||||||
|
{
|
||||||
|
// Setup Auth
|
||||||
|
"name": "Auth",
|
||||||
|
"type": "node",
|
||||||
|
"request": "launch",
|
||||||
|
"cwd": "${workspaceFolder}",
|
||||||
|
"runtimeExecutable": "npx",
|
||||||
|
"runtimeArgs": ["ts-node", "src/save-auth.ts"],
|
||||||
|
"console": "integratedTerminal"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
// Run all
|
// Run all
|
||||||
"name": "Run all",
|
"name": "Run all",
|
||||||
|
|
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
|
@ -1,3 +1,4 @@
|
||||||
{
|
{
|
||||||
"cSpell.words": ["papaparse", "unparse"]
|
"cSpell.words": ["papaparse", "unparse"],
|
||||||
|
"isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
{
|
|
||||||
"heading": "Example Domain"
|
|
||||||
}
|
|
|
@ -1,18 +1,19 @@
|
||||||
import { defineConfig, devices } from "@playwright/test";
|
import { defineConfig, devices } from '@playwright/test';
|
||||||
|
|
||||||
export default defineConfig({
|
export default defineConfig({
|
||||||
testDir: "src",
|
testDir: 'src',
|
||||||
retries: 0,
|
retries: 0,
|
||||||
reporter: "list",
|
reporter: 'list',
|
||||||
|
timeout: 30 * 1000,
|
||||||
fullyParallel: true,
|
fullyParallel: true,
|
||||||
projects: [
|
projects: [
|
||||||
{
|
{
|
||||||
name: "Desktop Chrome",
|
name: 'Desktop Chrome',
|
||||||
use: { ...devices["Desktop Chrome"] },
|
use: { ...devices['Desktop Chrome'] }
|
||||||
},
|
}
|
||||||
],
|
],
|
||||||
use: {
|
use: {
|
||||||
baseURL: "https://internet.garant.ru/",
|
baseURL: 'https://internet.garant.ru/',
|
||||||
trace: "on-first-retry",
|
storageState: 'auth.json'
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
|
|
36
src/save-auth.ts
Normal file
36
src/save-auth.ts
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
import { chromium } from '@playwright/test';
|
||||||
|
import fs from 'fs/promises';
|
||||||
|
import { USER_DATA_DIR } from './setup';
|
||||||
|
|
||||||
|
// Credentials are read from the environment so that no secret lives in the repository.
// NOTE(review): the values that used to be hard-coded here remain in git history and should be rotated.
const IN_LOGIN = process.env.GARANT_LOGIN;
const IN_PASSWORD = process.env.GARANT_PASSWORD;

/**
 * Logs in to account.garant.ru in a visible browser that uses the shared
 * persistent profile, then writes the resulting storage state to auth.json.
 *
 * @param login - Account login typed into the first step of the form.
 * @param password - Account password typed into the second step of the form.
 */
async function saveAuth(login: string, password: string): Promise<void> {
  // Launch persistent context with visible browser
  const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
    headless: false
  });

  try {
    const page = await context.newPage();
    await page.goto('https://account.garant.ru/login');

    // The login and the password are submitted as two consecutive steps with the same button.
    await page.getByRole('textbox').fill(login);
    await page.getByRole('button', { name: 'Войти' }).click();
    await page.getByRole('textbox').fill(password);
    await page.getByRole('button', { name: 'Войти' }).click();

    // Wait for the logout button before the session is saved.
    await page
      .getByRole('button', {
        name: 'Выйти из текущего сеанса и сменить пользователя'
      })
      .waitFor();

    // Save storage state to file
    await fs.writeFile('auth.json', JSON.stringify(await context.storageState()));

    console.log('✅ Saved login state to auth.json');
  } finally {
    // Always release the browser; otherwise the window stays open and the process never exits.
    await context.close();
  }
}

(async () => {
  if (!IN_LOGIN || !IN_PASSWORD) {
    throw new Error('Set the GARANT_LOGIN and GARANT_PASSWORD environment variables before running save-auth.');
  }
  await saveAuth(IN_LOGIN, IN_PASSWORD);
})().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
|
@ -1,48 +1,104 @@
|
||||||
import { test, expect } from "@playwright/test";
|
import { Page } from '@playwright/test';
|
||||||
import fs from "fs/promises";
|
import fs from 'fs/promises';
|
||||||
import Papa from "papaparse";
|
import Papa from 'papaparse';
|
||||||
|
import { test } from './setup';
|
||||||
|
import { executeSearch } from './search';
|
||||||
|
|
||||||
const OUTPUT_FILE = "output.csv";
|
/** Input text prompt */
|
||||||
|
const IN_TEXT_PROMPT = 'Министерство труда, занятости и социальной защиты Республики Коми';
|
||||||
|
const IN_TITLE_PROMPT = '';
|
||||||
|
|
||||||
test("scrape data from example site", async ({ page }) => {
|
/** Output file naming */
|
||||||
await page.goto("https://example.com");
|
const OUT_NAME = 'output';
|
||||||
|
const OUT_SUFFIX = '_1';
|
||||||
|
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
|
||||||
|
|
||||||
const heading = await page.locator("h1").textContent();
|
interface DocumentInfo {
|
||||||
console.log({ heading });
|
name: string;
|
||||||
|
docID: string;
|
||||||
|
url: string;
|
||||||
|
href: string;
|
||||||
|
}
|
||||||
|
|
||||||
expect(heading).toBeTruthy(); // optional check
|
/**
 * ===== RUN Auth FIRST before running this! =====
 *
 * Opens the site root, runs the advanced search with the prompts configured
 * above and writes the collected result list to OUT_FILENAME as CSV.
 * Nothing is written when the search returns no documents.
 */
test('scrape documents list', async ({ page }) => {
  // await page.goto("https://demo.garant.ru/");
  await page.goto('/');
  await executeSearch(page, {
    textPrompt: IN_TEXT_PROMPT,
    titlePrompt: IN_TITLE_PROMPT,
    onlyActive: true
  });

  const documents = await readData(page);
  if (documents.length === 0) {
    console.log('No data found');
    return;
  }

  // The output directory is git-ignored, so it does not exist on a fresh checkout.
  // Keep this name in sync with the directory prefix of OUT_FILENAME.
  await fs.mkdir('output', { recursive: true });

  const csv = Papa.unparse(documents);
  await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
  console.log(`✅ Saved to ${OUT_FILENAME}`);
});
|
||||||
|
|
||||||
const documents = await page.$$eval("ul.list > li", (items) =>
|
/**
 * Collects the documents from every page of the search-result list.
 *
 * For each page it reads the `ul.list > li` items, keeps those that have both
 * a link href and a name, and derives `docID` and `url` from the href. It then
 * follows the "Следующая" control until that control is absent or its class
 * contains `disabled`.
 *
 * @param page - Page that is already showing search results.
 * @returns All collected documents, in list order.
 */
async function readData(page: Page): Promise<DocumentInfo[]> {
  const documents: DocumentInfo[] = [];

  while (true) {
    await page.waitForSelector('ul.list');

    const newDocuments = await page.$$eval('ul.list > li', items =>
      items
        .map(li => {
          const anchor = li.querySelector('a');
          const href = anchor?.getAttribute('href') ?? '';

          // Work on a clone so the live page is left untouched.
          const nameEl = anchor?.querySelector('.name p');
          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
          if (cleanText) {
            // NOTE(review): this drops the <em> elements together with their text, not just the tags.
            // Confirm that losing that text from the name is intended.
            cleanText.querySelectorAll('em').forEach(em => em.remove());
          }

          const name = cleanText?.textContent?.trim() ?? '';
          return { name: name, href: href };
        })
        .filter(item => !!item.href && !!item.name)
    );

    documents.push(
      ...newDocuments.map(item => ({
        ...item,
        docID: extractDocumentID(item.href) ?? '',
        url: convertHrefToUrl(item.href) ?? ''
      }))
    );

    const nextButton = page.getByText('Следующая');
    // getAttribute() waits for the element to appear, so a result set without
    // a pager must be detected explicitly instead of relying on a null result.
    if ((await nextButton.count()) === 0) {
      break;
    }
    const classAttr = await nextButton.getAttribute('class');
    if (!classAttr || classAttr.includes('disabled')) {
      break;
    }

    // Remember the first link so the page change can be detected after the click.
    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
    await nextButton.click();

    await page.waitForFunction(
      prevHref => {
        const first = document.querySelector('ul.list > li a');
        return first && first.getAttribute('href') !== prevHref;
      },
      firstHref,
      { timeout: 5000 }
    );
  }

  return documents;
}
|
||||||
|
|
||||||
|
// ========== INTERNALS ===========
|
||||||
|
/**
 * Extracts the numeric document ID from an href that contains a
 * `/#/document/<digits>` segment.
 *
 * @param href - Raw `href` attribute value of a result link.
 * @returns The ID digits, or `null` when the href has no such segment.
 */
function extractDocumentID(href: string): string | null {
  // The ID may be followed by a further path segment or may end the href.
  const match = /\/#\/document\/(\d+)(?:\/|$)/.exec(href);
  return match?.[1] ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Turns a result-link href into the absolute document URL.
 *
 * @param href - Raw `href` attribute value of a result link.
 * @returns The absolute URL, or `null` when no document ID can be extracted.
 */
function convertHrefToUrl(href: string): string | null {
  const id = extractDocumentID(href);
  if (!id) {
    return null;
  }
  return `https://internet.garant.ru/#/document/${id}`;
}
|
||||||
|
|
100
src/search.ts
Normal file
100
src/search.ts
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
import { Locator, Page } from 'playwright';
|
||||||
|
|
||||||
|
const GLOBAL_FILTERS = [
|
||||||
|
'Федеральные министерства и ведомства',
|
||||||
|
'Правительство России и СССР',
|
||||||
|
'Президент России и СССР',
|
||||||
|
'Органы судебной власти РФ и СССР',
|
||||||
|
'Органы законодательной власти России и СССР'
|
||||||
|
] as const;
|
||||||
|
|
||||||
|
const LOCAL_TITLE = 'Органы власти Республики Коми';
|
||||||
|
const LOCAL_FILTERS = [
|
||||||
|
'Правительство Республики Коми',
|
||||||
|
'Государственный Совет Республики Коми',
|
||||||
|
'Глава Республики Коми',
|
||||||
|
'Президиум Верховного Совета Республики Коми',
|
||||||
|
'Президиум Верховного Совета Коми ССР',
|
||||||
|
'Верховный Совет Коми АССР',
|
||||||
|
'Верховный Совет Коми ССР',
|
||||||
|
'Верховный Совет Республики Коми'
|
||||||
|
] as const;
|
||||||
|
|
||||||
|
export interface SearchOptions {
|
||||||
|
textPrompt?: string;
|
||||||
|
titlePrompt?: string;
|
||||||
|
onlyActive?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Opens the advanced-search form, fills it in and submits it.
 *
 * Steps: clear any previous criteria, type the optional text and title
 * prompts, optionally restrict the status to "Действующие", tick every
 * authority listed in LOCAL_FILTERS and GLOBAL_FILTERS, then run the search.
 *
 * @param page - Page showing the site with the advanced-search button available.
 * @param options - Prompts and flags; empty or missing prompts are skipped.
 */
export async function executeSearch(page: Page, options: SearchOptions): Promise<void> {
  await page.getByRole('button', { name: 'Расширенный поиск' }).click();

  const clearButton = page.getByRole('link', { name: 'Очистить' });
  if ((await clearButton.count()) > 0) {
    await clearButton.first().click();
    // NOTE(review): fixed pause after clearing the form; replace with a wait on a concrete
    // page condition once one is identified.
    await page.waitForTimeout(1000);
  }

  if (options.textPrompt) {
    await fillLabelledTextarea(page, 'Слова в тексте', options.textPrompt);
  }
  if (options.titlePrompt) {
    await fillLabelledTextarea(page, 'Слова в названии', options.titlePrompt);
  }

  if (options.onlyActive) {
    await selectOnlyActiveStatus(page);
  }

  await page.getByText('Орган / источник').click();

  // The local authorities sit under a collapsible group; expand it when its first entry is hidden.
  const localItem = page.getByRole('cell', { name: LOCAL_FILTERS[0], exact: true });
  const isLocalExpanded = await localItem.isVisible().catch(() => false);
  if (!isLocalExpanded) {
    await page.getByRole('cell', { name: LOCAL_TITLE, exact: true }).click();
  }

  // NOTE(review): the image index differs between the two lists (2 vs 1) — presumably the
  // nested rows carry one extra leading image; confirm against the page markup.
  await checkFilters(page, LOCAL_FILTERS, 2);
  await checkFilters(page, GLOBAL_FILTERS, 1);

  await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
  await page.getByRole('button', { name: 'Найти (Enter)' }).click();
}

/** Fills the textarea found under the grandparent of the label containing `label`. */
async function fillLabelledTextarea(page: Page, label: string, value: string): Promise<void> {
  await page.locator(`label:has-text("${label}")`).locator('..').locator('..').locator('textarea').fill(value);
}

/** Opens the status picker, ticks "Действующие" unless already checked, and confirms the choice. */
async function selectOnlyActiveStatus(page: Page): Promise<void> {
  const status = page.getByText('Статус', { exact: true });
  const isStatusVisible = await status.isVisible().catch(() => false);
  if (!isStatusVisible) {
    // The status field is only reachable after the additional-attributes section is opened.
    await page.getByText('Дополнительные реквизиты документа').click();
  }
  await status.click();

  const includeActive = page.getByRole('cell', { name: 'Действующие', exact: true });
  if (!(await isItemChecked(includeActive))) {
    await includeActive.locator('img').nth(1).click();
  }
  await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
}

/** Clicks the image at `imageIndex` in the first cell matching each name, skipping cells already checked. */
async function checkFilters(page: Page, names: readonly string[], imageIndex: number): Promise<void> {
  for (const name of names) {
    const listItem = page.getByRole('cell', { name, exact: true }).first();
    if (!(await isItemChecked(listItem))) {
      await listItem.locator('img').nth(imageIndex).click();
    }
  }
}
|
||||||
|
|
||||||
|
// ========== INTERNALS ===========
|
||||||
|
/**
 * Reports whether the element's `class` attribute contains the substring `checked`.
 *
 * @param item - Locator of the list cell to inspect.
 * @returns `true` when the class attribute is present and contains `checked`.
 */
async function isItemChecked(item: Locator) {
  const className = (await item.getAttribute('class')) ?? '';
  return className.includes('checked');
}
|
29
src/setup.ts
Normal file
29
src/setup.ts
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
// src/setup.ts — Playwright test fixtures that run on a persistent browser profile
|
||||||
|
import { test as base, BrowserContext } from "@playwright/test";
|
||||||
|
import { chromium } from "playwright";
|
||||||
|
import path from "path";
|
||||||
|
|
||||||
|
export const USER_DATA_DIR = path.resolve(__dirname, "../.user-data-dir");
|
||||||
|
|
||||||
|
/**
 * Playwright `test` object whose `context` fixture is a persistent Chromium
 * context stored in USER_DATA_DIR, and whose `page` fixture reuses the first
 * page of that context when one exists.
 */
export const test = base.extend<{
  context: BrowserContext;
  page: Awaited<ReturnType<BrowserContext['newPage']>>;
}>({
  // Playwright requires the first fixture argument to be an object destructuring pattern, even when empty.
  context: async ({}, use) => {
    const persistentContext = await chromium.launchPersistentContext(USER_DATA_DIR, {
      headless: false,
      args: [
        // Don't show restore bubble
        '--disable-session-crashed-bubble',
        // Don't try to restore session
        '--disable-restore-session-state',
        // Optional: cleaner launch
        '--disable-background-networking',
        // Optional: skip default Chrome apps
        '--disable-default-apps'
      ]
    });

    await use(persistentContext);
    await persistentContext.close();
  },

  page: async ({ context }, use) => {
    // Reuse the first page the context already has, if any, instead of opening a new one.
    const [firstPage] = context.pages();
    await use(firstPage ?? (await context.newPage()));
  }
});
|
Loading…
Reference in New Issue
Block a user