Concept-Garant/src/scrape.spec.ts

49 lines
1.3 KiB
TypeScript
Raw Normal View History

2025-05-22 13:02:10 +03:00
import { test, expect } from "@playwright/test";
import fs from "fs/promises";
import Papa from "papaparse";
/** Path of the CSV file that the scraped rows are written to. */
const OUTPUT_FILE = "output.csv";

/**
 * Opens the target page, checks that it has a non-empty <h1>, collects one
 * `{ name, href }` row per `ul.list > li` item and writes the rows to
 * OUTPUT_FILE as CSV.
 */
test("scrape data from example site", async ({ page }) => {
  await page.goto("https://example.com");

  // Sanity check that the page rendered a heading before scraping.
  const heading = await page.locator("h1").textContent();
  console.log({ heading });
  expect(heading).toBeTruthy(); // optional check

  // Wait for the list to load
  await page.waitForSelector("ul.list");

  const documents = await page.$$eval("ul.list > li", (items) =>
    items.map((li) => {
      const anchor = li.querySelector("a");
      const href = anchor?.getAttribute("href") ?? "";
      const nameEl = anchor?.querySelector(".name p");

      // Strip <em> elements on a clone so the element in the page is not mutated.
      const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
      cleanText?.querySelectorAll("em").forEach((em) => em.remove());

      // Items without an anchor or a name element yield empty strings.
      const name = cleanText?.textContent?.trim() ?? "";
      return { name, href };
    })
  );

  // Convert to CSV
  const csv = Papa.unparse(documents);
  await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
  // Report the path that was actually written, not a hard-coded name.
  console.log(`✅ Saved to ${OUTPUT_FILE}`);
});