Compare commits
No commits in common. "4927837ac661ab5eeaed353861a236d8a6a7e4ae" and "9b1c8a79e85e45426e58ced07e147a66d29b1fb1" have entirely different histories.
4927837ac6
...
9b1c8a79e8
6
.gitignore
vendored
6
.gitignore
vendored
|
@ -1,8 +1,3 @@
|
||||||
# output
|
|
||||||
output/
|
|
||||||
auth.json
|
|
||||||
.user-data-dir
|
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
@ -23,7 +18,6 @@ coverage.xml
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
cover/
|
cover/
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
test-results/
|
|
||||||
|
|
||||||
# React
|
# React
|
||||||
.DS_*
|
.DS_*
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
build/
|
|
||||||
node_modules/
|
|
||||||
package-lock.json
|
|
||||||
yarn.lock
|
|
||||||
package.json
|
|
||||||
coverage
|
|
|
@ -1,13 +0,0 @@
|
||||||
{
|
|
||||||
"semi": true,
|
|
||||||
"useTabs": false,
|
|
||||||
"printWidth": 120,
|
|
||||||
"tabWidth": 2,
|
|
||||||
"trailingComma": "none",
|
|
||||||
"arrowParens": "avoid",
|
|
||||||
"singleQuote": true,
|
|
||||||
"jsxSingleQuote": true,
|
|
||||||
"quoteProps": "consistent",
|
|
||||||
"bracketSameLine": false,
|
|
||||||
"bracketSpacing": true
|
|
||||||
}
|
|
10
.vscode/launch.json
vendored
10
.vscode/launch.json
vendored
|
@ -4,16 +4,6 @@
|
||||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
"version": "0.2.0",
|
"version": "0.2.0",
|
||||||
"configurations": [
|
"configurations": [
|
||||||
{
|
|
||||||
// Setup Auth
|
|
||||||
"name": "Auth",
|
|
||||||
"type": "node",
|
|
||||||
"request": "launch",
|
|
||||||
"cwd": "${workspaceFolder}",
|
|
||||||
"runtimeExecutable": "npx",
|
|
||||||
"runtimeArgs": ["ts-node", "src/save-auth.ts"],
|
|
||||||
"console": "integratedTerminal"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
// Run all
|
// Run all
|
||||||
"name": "Run all",
|
"name": "Run all",
|
||||||
|
|
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
|
@ -1,4 +1,3 @@
|
||||||
{
|
{
|
||||||
"cSpell.words": ["papaparse", "unparse"],
|
"cSpell.words": ["papaparse", "unparse"]
|
||||||
"isort.args": ["--line-length", "100", "--multi-line", "3", "--project", "apps", "--project", "shared"]
|
|
||||||
}
|
}
|
||||||
|
|
3
output.json
Normal file
3
output.json
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"heading": "Example Domain"
|
||||||
|
}
|
|
@ -1,19 +1,18 @@
|
||||||
import { defineConfig, devices } from '@playwright/test';
|
import { defineConfig, devices } from "@playwright/test";
|
||||||
|
|
||||||
export default defineConfig({
|
export default defineConfig({
|
||||||
testDir: 'src',
|
testDir: "src",
|
||||||
retries: 0,
|
retries: 0,
|
||||||
reporter: 'list',
|
reporter: "list",
|
||||||
timeout: 30 * 1000,
|
|
||||||
fullyParallel: true,
|
fullyParallel: true,
|
||||||
projects: [
|
projects: [
|
||||||
{
|
{
|
||||||
name: 'Desktop Chrome',
|
name: "Desktop Chrome",
|
||||||
use: { ...devices['Desktop Chrome'] }
|
use: { ...devices["Desktop Chrome"] },
|
||||||
}
|
},
|
||||||
],
|
],
|
||||||
use: {
|
use: {
|
||||||
baseURL: 'https://internet.garant.ru/',
|
baseURL: "https://internet.garant.ru/",
|
||||||
storageState: 'auth.json'
|
trace: "on-first-retry",
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
import { chromium } from '@playwright/test';
|
|
||||||
import fs from 'fs/promises';
|
|
||||||
import { USER_DATA_DIR } from './setup';
|
|
||||||
|
|
||||||
const IN_LOGIN = 'iborisov@acconcept.ru';
|
|
||||||
const IN_PASSWORD = 'PuNS8br2';
|
|
||||||
|
|
||||||
// const IN_LOGIN = 'dm9175025694';
|
|
||||||
// const IN_PASSWORD = '52547';
|
|
||||||
|
|
||||||
(async () => {
|
|
||||||
// Launch persistent context with visible browser
|
|
||||||
const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
|
|
||||||
headless: false
|
|
||||||
});
|
|
||||||
|
|
||||||
try {
|
|
||||||
const page = context.pages().length > 0 ? context.pages()[0] : await context.newPage();
|
|
||||||
|
|
||||||
await page.goto('https://account.garant.ru/login');
|
|
||||||
|
|
||||||
await page.getByRole('textbox').fill(IN_LOGIN);
|
|
||||||
await page.getByRole('button', { name: 'Войти' }).click();
|
|
||||||
await page.getByRole('textbox').fill(IN_PASSWORD);
|
|
||||||
await page.getByRole('button', { name: 'Войти' }).click();
|
|
||||||
|
|
||||||
await page
|
|
||||||
.getByRole('button', {
|
|
||||||
name: 'Выйти из текущего сеанса и сменить пользователя'
|
|
||||||
})
|
|
||||||
.waitFor();
|
|
||||||
|
|
||||||
await fs.writeFile('auth.json', JSON.stringify(await page.context().storageState()));
|
|
||||||
console.log('✅ Saved login state to auth.json');
|
|
||||||
} finally {
|
|
||||||
await context.close();
|
|
||||||
}
|
|
||||||
})();
|
|
|
@ -1,113 +1,48 @@
|
||||||
import { Page } from '@playwright/test';
|
import { test, expect } from "@playwright/test";
|
||||||
import fs from 'fs/promises';
|
import fs from "fs/promises";
|
||||||
import Papa from 'papaparse';
|
import Papa from "papaparse";
|
||||||
import { test } from './setup';
|
|
||||||
import { executeSearch } from './search';
|
|
||||||
|
|
||||||
/** Input text prompt */
|
const OUTPUT_FILE = "output.csv";
|
||||||
const IN_TEXT_PROMPT = 'Представительство Республики Коми в Северо-Западном регионе Российской Федерации';
|
|
||||||
const IN_TITLE_PROMPT = '';
|
|
||||||
|
|
||||||
/** Output file naming */
|
test("scrape data from example site", async ({ page }) => {
|
||||||
const OUT_NAME = 'output';
|
await page.goto("https://example.com");
|
||||||
const OUT_SUFFIX = '_22';
|
|
||||||
const OUT_FILENAME = `output/${OUT_NAME}${OUT_SUFFIX}.csv`;
|
|
||||||
|
|
||||||
interface DocumentInfo {
|
const heading = await page.locator("h1").textContent();
|
||||||
name: string;
|
console.log({ heading });
|
||||||
docID: string;
|
|
||||||
url: string;
|
|
||||||
href: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** ===== RUN Auth FIRST before running this! ===== */
|
expect(heading).toBeTruthy(); // optional check
|
||||||
test('scrape documents list', async ({ page }) => {
|
|
||||||
// await page.goto("https://demo.garant.ru/");
|
const data = await page.evaluate(() => {
|
||||||
await page.goto('/');
|
const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
|
||||||
await executeSearch(page, {
|
return { heading };
|
||||||
textPrompt: IN_TEXT_PROMPT,
|
|
||||||
titlePrompt: IN_TITLE_PROMPT,
|
|
||||||
onlyActive: true
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const documents = await readData(page);
|
// Wait for the list to load
|
||||||
|
await page.waitForSelector("ul.list");
|
||||||
|
|
||||||
if (documents.length === 0) {
|
const documents = await page.$$eval("ul.list > li", (items) =>
|
||||||
console.log('No data found');
|
items.map((li) => {
|
||||||
return;
|
const anchor = li.querySelector("a");
|
||||||
} else {
|
const href = anchor?.getAttribute("href") ?? "";
|
||||||
const csv = Papa.unparse(documents);
|
|
||||||
await fs.writeFile(OUT_FILENAME, csv, 'utf-8');
|
const nameEl = anchor?.querySelector(".name p");
|
||||||
console.log('✅ Saved to documents.csv');
|
// Clone the node to strip <em> tags
|
||||||
}
|
const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
|
||||||
|
|
||||||
|
if (cleanText) {
|
||||||
|
// Remove <em> tags
|
||||||
|
cleanText.querySelectorAll("em").forEach((em) => em.remove());
|
||||||
|
}
|
||||||
|
|
||||||
|
const name = cleanText?.textContent?.trim() ?? "";
|
||||||
|
|
||||||
|
return { name, href };
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
// Convert to CSV
|
||||||
|
const csv = Papa.unparse(documents);
|
||||||
|
await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
|
||||||
|
|
||||||
|
console.log("✅ Saved to documents.csv");
|
||||||
});
|
});
|
||||||
|
|
||||||
async function readData(page: Page): Promise<DocumentInfo[]> {
|
|
||||||
const documents: DocumentInfo[] = [];
|
|
||||||
while (true) {
|
|
||||||
await page.waitForSelector('ul.list');
|
|
||||||
const newDocuments = await page.$$eval('ul.list > li', items =>
|
|
||||||
items
|
|
||||||
.map(li => {
|
|
||||||
const anchor = li.querySelector('a');
|
|
||||||
const href = anchor?.getAttribute('href') ?? '';
|
|
||||||
|
|
||||||
const nameEl = anchor?.querySelector('.name p');
|
|
||||||
const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
|
|
||||||
|
|
||||||
if (cleanText) {
|
|
||||||
cleanText.querySelectorAll('em').forEach(em => {
|
|
||||||
const parent = em.parentNode;
|
|
||||||
if (parent) {
|
|
||||||
while (em.firstChild) {
|
|
||||||
parent.insertBefore(em.firstChild, em);
|
|
||||||
}
|
|
||||||
parent.removeChild(em);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
const name = cleanText?.textContent?.trim() ?? '';
|
|
||||||
return { name: name, href: 'https://internet.garant.ru' + href };
|
|
||||||
})
|
|
||||||
.filter(item => !!item.href && !!item.name)
|
|
||||||
);
|
|
||||||
documents.push(
|
|
||||||
...newDocuments.map(item => ({
|
|
||||||
...item,
|
|
||||||
docID: extractDocumentID(item.href) ?? '',
|
|
||||||
url: convertHrefToUrl(item.href) ?? ''
|
|
||||||
}))
|
|
||||||
);
|
|
||||||
const nextButton = await page.getByText('Следующая');
|
|
||||||
const classAttr = await nextButton.getAttribute('class');
|
|
||||||
if (!classAttr || classAttr.includes('disabled')) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const firstHref = await page.$eval('ul.list > li a', a => a.getAttribute('href'));
|
|
||||||
await nextButton.click();
|
|
||||||
|
|
||||||
await page.waitForFunction(
|
|
||||||
prevHref => {
|
|
||||||
const first = document.querySelector('ul.list > li a');
|
|
||||||
return first && first.getAttribute('href') !== prevHref;
|
|
||||||
},
|
|
||||||
firstHref,
|
|
||||||
{ timeout: 5000 }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return documents;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ========== INTERNALS ===========
|
|
||||||
function extractDocumentID(href: string): string | null {
|
|
||||||
const match = href.match(/\/#\/document\/(\d+)\//);
|
|
||||||
return !match ? null : match[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
function convertHrefToUrl(href: string): string | null {
|
|
||||||
const docId = extractDocumentID(href);
|
|
||||||
return !docId ? null : `https://internet.garant.ru/#/document/${docId}`;
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,99 +0,0 @@
|
||||||
import { Locator, Page } from 'playwright';
|
|
||||||
|
|
||||||
const GLOBAL_FILTERS = [
|
|
||||||
'Федеральные министерства и ведомства',
|
|
||||||
'Правительство России и СССР',
|
|
||||||
'Президент России и СССР',
|
|
||||||
'Органы законодательной власти России и СССР'
|
|
||||||
] as const;
|
|
||||||
|
|
||||||
const LOCAL_TITLE = 'Органы власти Республики Коми';
|
|
||||||
const LOCAL_FILTERS = [
|
|
||||||
'Правительство Республики Коми',
|
|
||||||
'Государственный Совет Республики Коми',
|
|
||||||
'Глава Республики Коми',
|
|
||||||
'Президиум Верховного Совета Республики Коми',
|
|
||||||
'Президиум Верховного Совета Коми ССР',
|
|
||||||
'Верховный Совет Коми АССР',
|
|
||||||
'Верховный Совет Коми ССР',
|
|
||||||
'Верховный Совет Республики Коми'
|
|
||||||
] as const;
|
|
||||||
|
|
||||||
export interface SearchOptions {
|
|
||||||
textPrompt?: string;
|
|
||||||
titlePrompt?: string;
|
|
||||||
onlyActive?: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function executeSearch(page: Page, options: SearchOptions) {
|
|
||||||
await page.getByRole('button', { name: 'Расширенный поиск' }).click();
|
|
||||||
|
|
||||||
const clearButton = page.getByRole('link', { name: 'Очистить' });
|
|
||||||
if ((await clearButton.count()) > 0) {
|
|
||||||
await clearButton.first().click();
|
|
||||||
await page.waitForTimeout(1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!!options.textPrompt) {
|
|
||||||
await page
|
|
||||||
.locator('label:has-text("Слова в тексте")')
|
|
||||||
.locator('..')
|
|
||||||
.locator('..')
|
|
||||||
.locator('textarea')
|
|
||||||
.fill(options.textPrompt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!!options.titlePrompt) {
|
|
||||||
await page
|
|
||||||
.locator('label:has-text("Слова в названии")')
|
|
||||||
.locator('..')
|
|
||||||
.locator('..')
|
|
||||||
.locator('textarea')
|
|
||||||
.fill(options.titlePrompt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.onlyActive) {
|
|
||||||
const status = page.getByText('Статус', { exact: true });
|
|
||||||
const isStatusVisible = await status.isVisible().catch(() => false);
|
|
||||||
if (!isStatusVisible) {
|
|
||||||
await page.getByText('Дополнительные реквизиты документа').click();
|
|
||||||
}
|
|
||||||
await status.click();
|
|
||||||
|
|
||||||
const includeActive = page.getByRole('cell', { name: 'Действующие', exact: true });
|
|
||||||
if (!(await isItemChecked(includeActive))) {
|
|
||||||
await includeActive.locator('img').nth(1).click();
|
|
||||||
}
|
|
||||||
await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
|
|
||||||
}
|
|
||||||
|
|
||||||
await page.getByText('Орган / источник').click();
|
|
||||||
|
|
||||||
const localItem = page.getByRole('cell', { name: LOCAL_FILTERS[0], exact: true });
|
|
||||||
const isLocalExpanded = await localItem.isVisible().catch(() => false);
|
|
||||||
if (!isLocalExpanded) {
|
|
||||||
await page.getByRole('cell', { name: LOCAL_TITLE, exact: true }).click();
|
|
||||||
}
|
|
||||||
for (const filter of LOCAL_FILTERS) {
|
|
||||||
const listItem = page.getByRole('cell', { name: filter, exact: true }).first();
|
|
||||||
if (!(await isItemChecked(listItem))) {
|
|
||||||
await listItem.locator('img').nth(2).click();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const filter of GLOBAL_FILTERS) {
|
|
||||||
const listItem = page.getByRole('cell', { name: filter, exact: true }).first();
|
|
||||||
if (!(await isItemChecked(listItem))) {
|
|
||||||
await listItem.locator('img').nth(1).click();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
await page.getByRole('button', { name: 'Выбрать', exact: true }).click();
|
|
||||||
|
|
||||||
await page.getByRole('button', { name: 'Найти (Enter)' }).click();
|
|
||||||
}
|
|
||||||
|
|
||||||
// ========== INTERNALS ===========
|
|
||||||
async function isItemChecked(item: Locator) {
|
|
||||||
const classAttr = await item.getAttribute('class');
|
|
||||||
return !!classAttr && classAttr.includes('checked');
|
|
||||||
}
|
|
29
src/setup.ts
29
src/setup.ts
|
@ -1,29 +0,0 @@
|
||||||
// tests/scraper-with-user-data.spec.ts
|
|
||||||
import { test as base, BrowserContext } from "@playwright/test";
|
|
||||||
import { chromium } from "playwright";
|
|
||||||
import path from "path";
|
|
||||||
|
|
||||||
export const USER_DATA_DIR = path.resolve(__dirname, "../.user-data-dir");
|
|
||||||
|
|
||||||
export const test = base.extend<{
|
|
||||||
context: BrowserContext;
|
|
||||||
page: Awaited<ReturnType<BrowserContext["newPage"]>>;
|
|
||||||
}>({
|
|
||||||
context: async ({}, use) => {
|
|
||||||
const context = await chromium.launchPersistentContext(USER_DATA_DIR, {
|
|
||||||
headless: false,
|
|
||||||
args: [
|
|
||||||
"--disable-session-crashed-bubble", // Don't show restore bubble
|
|
||||||
"--disable-restore-session-state", // Don't try to restore session
|
|
||||||
"--disable-background-networking", // Optional: cleaner launch
|
|
||||||
"--disable-default-apps", // Optional: skip default Chrome apps
|
|
||||||
],
|
|
||||||
});
|
|
||||||
await use(context);
|
|
||||||
await context.close();
|
|
||||||
},
|
|
||||||
page: async ({ context }, use) => {
|
|
||||||
const page = context.pages()[0] ?? (await context.newPage());
|
|
||||||
await use(page);
|
|
||||||
},
|
|
||||||
});
|
|
Loading…
Reference in New Issue
Block a user