diff --git a/.gitignore b/.gitignore
index b08e69d..946bfb3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+# output
+output/
+auth.json
+.user-data-dir
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -18,6 +23,7 @@ coverage.xml
 .pytest_cache/
 cover/
 .mypy_cache/
+test-results/
 
 # React
 .DS_*
diff --git a/.prettierignore b/.prettierignore
new file mode 100644
index 0000000..cbe6c04
--- /dev/null
+++ b/.prettierignore
@@ -0,0 +1,6 @@
+build/
+node_modules/
+package-lock.json
+yarn.lock
+package.json
+coverage
diff --git a/.prettierrc.json b/.prettierrc.json
new file mode 100644
index 0000000..2242efb
--- /dev/null
+++ b/.prettierrc.json
@@ -0,0 +1,13 @@
+{
+  "semi": true,
+  "useTabs": false,
+  "printWidth": 120,
+  "tabWidth": 2,
+  "trailingComma": "none",
+  "arrowParens": "avoid",
+  "singleQuote": true,
+  "jsxSingleQuote": true,
+  "quoteProps": "consistent",
+  "bracketSameLine": false,
+  "bracketSpacing": true
+}
diff --git a/.vscode/launch.json b/.vscode/launch.json
index d8fbed6..cf7f8f9 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -4,6 +4,16 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            // Setup Auth
+            "name": "Auth",
+            "type": "node",
+            "request": "launch",
+            "cwd": "${workspaceFolder}",
+            "runtimeExecutable": "npx",
+            "runtimeArgs": ["ts-node", "src/save-auth.ts"],
+            "console": "integratedTerminal"
+        },
         {
             // Run all
             "name": "Run all",
diff --git a/.vscode/settings.json b/.vscode/settings.json
index ba36a27..e0f172c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,4 @@
 {
-  "cSpell.words": ["papaparse", "unparse"]
+  "cSpell.words": ["papaparse", "unparse"],
+  "isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
 }
diff --git a/output.json b/output.json
deleted file mode 100644
index cf310e8..0000000
--- a/output.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "heading": "Example Domain"
-}
\ No newline at end of file
diff --git a/playwright.config.ts b/playwright.config.ts
index 2aa89e5..18a21f0 100644
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -1,18 +1,19 @@
-import { defineConfig, devices } from "@playwright/test";
+import { defineConfig, devices } from '@playwright/test';
 
 export default defineConfig({
-  testDir: "src",
+  testDir: 'src',
   retries: 0,
-  reporter: "list",
+  reporter: 'list',
+  timeout: 30 * 1000,
   fullyParallel: true,
   projects: [
     {
-      name: "Desktop Chrome",
-      use: { ...devices["Desktop Chrome"] },
-    },
+      name: 'Desktop Chrome',
+      use: { ...devices['Desktop Chrome'] }
+    }
   ],
   use: {
-    baseURL: "https://internet.garant.ru/",
-    trace: "on-first-retry",
-  },
+    baseURL: 'https://internet.garant.ru/',
+    storageState: 'auth.json'
+  }
 });
diff --git a/src/save-auth.ts b/src/save-auth.ts
new file mode 100644
index 0000000..a7f4803
--- /dev/null
+++ b/src/save-auth.ts
@@ -0,0 +1,55 @@
+import { chromium } from '@playwright/test';
+import fs from 'fs/promises';
+import { USER_DATA_DIR } from './setup';
+
+// Credentials come from the environment so that they never get committed.
+const IN_LOGIN = process.env.GARANT_LOGIN;
+const IN_PASSWORD = process.env.GARANT_PASSWORD;
+
+/**
+ * Logs in to account.garant.ru in a visible browser that uses the persistent
+ * profile in USER_DATA_DIR, then writes the storage state to auth.json.
+ */
+async function saveAuth(login: string, password: string): Promise<void> {
+  // Launch persistent context with visible browser
+  const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
+    headless: false
+  });
+
+  try {
+    const page = await context.newPage();
+
+    await page.goto('https://account.garant.ru/login');
+
+    // Two-step form: submit the login first, then the password
+    await page.getByRole('textbox').fill(login);
+    await page.getByRole('button', { name: 'Войти' }).click();
+    await page.getByRole('textbox').fill(password);
+    await page.getByRole('button', { name: 'Войти' }).click();
+
+    // Wait for the "log out and switch user" button before saving the state
+    await page
+      .getByRole('button', {
+        name: 'Выйти из текущего сеанса и сменить пользователя'
+      })
+      .waitFor();
+
+    // Save storage state to file
+    await fs.writeFile('auth.json', JSON.stringify(await context.storageState()));
+
+    console.log('✅ Saved login state to auth.json');
+  } finally {
+    // Close the browser even when the login fails, so the script can exit
+    await context.close();
+  }
+}
+
+if (!IN_LOGIN || !IN_PASSWORD) {
+  console.error('Set GARANT_LOGIN and GARANT_PASSWORD before running save-auth');
+  process.exit(1);
+}
+
+saveAuth(IN_LOGIN, IN_PASSWORD).catch(error => {
+  console.error(error);
+  process.exitCode = 1;
+});
diff --git a/src/scrape.spec.ts b/src/scrape.spec.ts
index 9a633fd..73e6a09 100644
--- a/src/scrape.spec.ts
+++ b/src/scrape.spec.ts
@@ -1,48 +1,116 @@
-import { test, expect } from "@playwright/test";
-import fs from "fs/promises";
-import Papa from "papaparse";
+import { Page } from '@playwright/test';
+import fs from 'fs/promises';
+import Papa from 'papaparse';
+import { test } from './setup';
+import { executeSearch } from './search';
 
-const OUTPUT_FILE = "output.csv";
+/** Input text prompt */
+const IN_TEXT_PROMPT = 'Министерство труда, занятости и социальной защиты Республики Коми';
+const IN_TITLE_PROMPT = '';
 
-test("scrape data from example site", async ({ page }) => {
-  await page.goto("https://example.com");
+/** Output file naming */
+const OUT_DIR = 'output';
+const OUT_NAME = 'output';
+const OUT_SUFFIX = '_1';
+const OUT_FILENAME = `${OUT_DIR}/${OUT_NAME}${OUT_SUFFIX}.csv`;
 
-  const heading = await page.locator("h1").textContent();
-  console.log({ heading });
+/** One search result, i.e. one row of the CSV file */
+interface DocumentInfo {
+  name: string;
+  docID: string;
+  url: string;
+  href: string;
+}
 
-  expect(heading).toBeTruthy(); // optional check
-
-  const data = await page.evaluate(() => {
-    const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
-    return { heading };
+/** ===== RUN Auth FIRST before running this! ===== */
+test('scrape documents list', async ({ page }) => {
+  // await page.goto("https://demo.garant.ru/");
+  await page.goto('/');
+  await executeSearch(page, {
+    textPrompt: IN_TEXT_PROMPT,
+    titlePrompt: IN_TITLE_PROMPT,
+    onlyActive: true
   });
 
-  // Wait for the list to load
-  await page.waitForSelector("ul.list");
-
-  const documents = await page.$$eval("ul.list > li", (items) =>
-    items.map((li) => {
-      const anchor = li.querySelector("a");
-      const href = anchor?.getAttribute("href") ?? "";
-
-      const nameEl = anchor?.querySelector(".name p");
-      // Clone the node to strip tags
-      const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
-
-      if (cleanText) {
-        // Remove tags
-        cleanText.querySelectorAll("em").forEach((em) => em.remove());
-      }
-
-      const name = cleanText?.textContent?.trim() ?? "";
-
-      return { name, href };
-    })
-  );
-
-  // Convert to CSV
-  const csv = Papa.unparse(documents);
-  await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
-
-  console.log("✅ Saved to documents.csv");
+  const documents = await readData(page);
+  if (documents.length === 0) {
+    console.log('No data found');
+    return;
+  }
+
+  // output/ is git-ignored, so create it when it is missing
+  await fs.mkdir(OUT_DIR, { recursive: true });
+  await fs.writeFile(OUT_FILENAME, Papa.unparse(documents), 'utf-8');
+  console.log(`✅ Saved to ${OUT_FILENAME}`);
 });
+
+/**
+ * Reads the result list page by page until the "Следующая" (next) control is
+ * disabled or has no class attribute.
+ *
+ * @returns the list items that have both a name and a link, with `docID` and
+ *   `url` derived from the link ('' when the link holds no document id)
+ */
+async function readData(page: Page): Promise<DocumentInfo[]> {
+  const documents: DocumentInfo[] = [];
+  while (true) {
+    await page.waitForSelector('ul.list');
+    const newDocuments = await page.$$eval('ul.list > li', items =>
+      items
+        .map(li => {
+          const anchor = li.querySelector('a');
+          const href = anchor?.getAttribute('href') ?? '';
+
+          // Drop <em> elements (together with their text) from a clone of the name node
+          const nameEl = anchor?.querySelector('.name p');
+          const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
+          cleanText?.querySelectorAll('em').forEach(em => em.remove());
+
+          const name = cleanText?.textContent?.trim() ?? '';
+          return { name, href };
+        })
+        .filter(item => !!item.href && !!item.name)
+    );
+    documents.push(
+      ...newDocuments.map(item => ({
+        ...item,
+        docID: extractDocumentID(item.href) ?? '',
+        url: convertHrefToUrl(item.href) ?? ''
+      }))
+    );
+
+    // Stop when the control has no class attribute or is marked as disabled
+    const nextButton = page.getByText('Следующая');
+    const classAttr = await nextButton.getAttribute('class');
+    if (!classAttr || classAttr.includes('disabled')) {
+      break;
+    }
+
+    const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
+    await nextButton.click();
+
+    // Wait until the first link differs from the one seen before the click
+    await page.waitForFunction(
+      prevHref => {
+        const first = document.querySelector('ul.list > li a');
+        return first && first.getAttribute('href') !== prevHref;
+      },
+      firstHref,
+      { timeout: 5000 }
+    );
+  }
+
+  return documents;
+}
+
+// ========== INTERNALS ===========
+/** Returns the numeric id from a `/#/document/<id>/` link, or null when there is none */
+function extractDocumentID(href: string): string | null {
+  return href.match(/\/#\/document\/(\d+)\//)?.[1] ?? null;
+}
+
+/** Builds the document URL for a link, or null when the link holds no document id */
+function convertHrefToUrl(href: string): string | null {
+  const docId = extractDocumentID(href);
+  return docId ? `https://internet.garant.ru/#/document/${docId}` : null;
+}
diff --git a/src/search.ts b/src/search.ts
new file mode 100644
index 0000000..b126e00
--- /dev/null
+++ b/src/search.ts
@@ -0,0 +1,117 @@
+import { Locator, Page } from '@playwright/test';
+
+/** "Орган / источник" entries that are ticked without expanding any group */
+const GLOBAL_FILTERS = [
+  'Федеральные министерства и ведомства',
+  'Правительство России и СССР',
+  'Президент России и СССР',
+  'Органы судебной власти РФ и СССР',
+  'Органы законодательной власти России и СССР'
+] as const;
+
+/** Group title that is clicked when the first LOCAL_FILTERS entry is not visible */
+const LOCAL_TITLE = 'Органы власти Республики Коми';
+/** "Орган / источник" entries that are ticked after the LOCAL_TITLE check */
+const LOCAL_FILTERS = [
+  'Правительство Республики Коми',
+  'Государственный Совет Республики Коми',
+  'Глава Республики Коми',
+  'Президиум Верховного Совета Республики Коми',
+  'Президиум Верховного Совета Коми ССР',
+  'Верховный Совет Коми АССР',
+  'Верховный Совет Коми ССР',
+  'Верховный Совет Республики Коми'
+] as const;
+
+export interface SearchOptions {
+  /** Text for the "Слова в тексте" field; the field is left alone when empty */
+  textPrompt?: string;
+  /** Text for the "Слова в названии" field; the field is left alone when empty */
+  titlePrompt?: string;
+  /** When true, the "Действующие" entry of the "Статус" list is ticked */
+  onlyActive?: boolean;
+}
+
+/**
+ * Fills in the advanced search form and submits it.
+ *
+ * @param page page on which the "Расширенный поиск" button can be clicked
+ * @param options prompts and status filter to apply; the authority filters are
+ *   always taken from GLOBAL_FILTERS and LOCAL_FILTERS
+ */
+export async function executeSearch(page: Page, options: SearchOptions): Promise<void> {
+  await page.getByRole('button', { name: 'Расширенный поиск' }).click();
+
+  // Clear the form when the "Очистить" link is present
+  const clearButton = page.getByRole('link', { name: 'Очистить' });
+  if ((await clearButton.count()) > 0) {
+    await clearButton.first().click();
+    await page.waitForTimeout(1000);
+  }
+
+  if (options.textPrompt) {
+    await fillPrompt(page, 'Слова в тексте', options.textPrompt);
+  }
+  if (options.titlePrompt) {
+    await fillPrompt(page, 'Слова в названии', options.titlePrompt);
+  }
+
+  if (options.onlyActive) {
+    const status = page.getByText('Статус', { exact: true });
+    const isStatusVisible = await status.isVisible().catch(() => false);
+    if (!isStatusVisible) {
+      // "Статус" is not visible yet: click the section title first
+      await page.getByText('Дополнительные реквизиты документа').click();
+    }
+    await status.click();
+
+    const includeActive = page.getByRole('cell', { name: 'Действующие', exact: true });
+    if (!(await isItemChecked(includeActive))) {
+      await includeActive.locator('img').nth(1).click();
+    }
+    await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
+  }
+
+  await page.getByText('Орган / источник').click();
+
+  // Click the group title when its first entry is not visible yet
+  const localItem = page.getByRole('cell', { name: LOCAL_FILTERS[0], exact: true });
+  const isLocalExpanded = await localItem.isVisible().catch(() => false);
+  if (!isLocalExpanded) {
+    await page.getByRole('cell', { name: LOCAL_TITLE, exact: true }).click();
+  }
+  for (const filter of LOCAL_FILTERS) {
+    const listItem = page.getByRole('cell', { name: filter, exact: true }).first();
+    if (!(await isItemChecked(listItem))) {
+      // NOTE(review): image index 2 here vs 1 for the other lists; the reason is not visible in this file
+      await listItem.locator('img').nth(2).click();
+    }
+  }
+
+  for (const filter of GLOBAL_FILTERS) {
+    const listItem = page.getByRole('cell', { name: filter, exact: true }).first();
+    if (!(await isItemChecked(listItem))) {
+      await listItem.locator('img').nth(1).click();
+    }
+  }
+
+  await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
+  await page.getByRole('button', { name: 'Найти (Enter)' }).click();
+}
+
+// ========== INTERNALS ===========
+/** Fills `text` into the textarea found under the grandparent of the label containing `label` */
+async function fillPrompt(page: Page, label: string, text: string): Promise<void> {
+  await page
+    .locator(`label:has-text("${label}")`)
+    .locator('..')
+    .locator('..')
+    .locator('textarea')
+    .fill(text);
+}
+
+/** Tells whether the element's class attribute contains "checked" */
+async function isItemChecked(item: Locator): Promise<boolean> {
+  const classAttr = await item.getAttribute('class');
+  return !!classAttr && classAttr.includes('checked');
+}
diff --git a/src/setup.ts b/src/setup.ts
new file mode 100644
index 0000000..947533f
--- /dev/null
+++ b/src/setup.ts
@@ -0,0 +1,33 @@
+import { test as base, chromium, BrowserContext, Page } from '@playwright/test';
+import path from 'path';
+
+/** Browser profile directory used by both save-auth.ts and the tests */
+export const USER_DATA_DIR = path.resolve(__dirname, '../.user-data-dir');
+
+/**
+ * Playwright `test` whose `context` fixture is a persistent Chromium context
+ * on USER_DATA_DIR and whose `page` fixture is a tab of that context.
+ */
+export const test = base.extend<{
+  context: BrowserContext;
+  page: Page;
+}>({
+  context: async ({}, use) => {
+    const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
+      headless: false,
+      args: [
+        '--disable-session-crashed-bubble', // Don't show restore bubble
+        '--disable-restore-session-state', // Don't try to restore session
+        '--disable-background-networking', // Optional: cleaner launch
+        '--disable-default-apps' // Optional: skip default Chrome apps
+      ]
+    });
+    await use(context);
+    await context.close();
+  },
+  page: async ({ context }, use) => {
+    // Reuse an already open tab when the context has one
+    const page = context.pages()[0] ?? (await context.newPage());
+    await use(page);
+  }
+});