// 49 lines · 1.3 KiB · TypeScript
import { test, expect } from "@playwright/test";
|
||
|
import fs from "fs/promises";
|
||
|
import Papa from "papaparse";
|
||
|
|
||
|
/** Path of the CSV file the scraped records are written to. */
const OUTPUT_FILE = "output.csv";

/**
 * Scrapes the example site and stores the result as CSV.
 *
 * Steps:
 *  1. Open the page and check that it has a non-empty <h1>.
 *  2. Wait for the `ul.list` element to appear.
 *  3. For every direct `<li>` child, collect the link target (`href`) and the
 *     display name (text of `.name p` inside the link, without <em> content).
 *  4. Serialize the `{ name, href }` records with PapaParse and write them to
 *     OUTPUT_FILE.
 */
test("scrape data from example site", async ({ page }) => {
  await page.goto("https://example.com");

  const heading = await page.locator("h1").textContent();
  console.log({ heading });

  expect(heading).toBeTruthy(); // optional check

  // Wait for the list to load. `.first()` keeps the wait non-strict, so more
  // than one matching list on the page does not raise a strictness error.
  await page.locator("ul.list").first().waitFor();

  // The callback is executed inside the page, so it must be self-contained:
  // it cannot reference anything from this module's scope.
  const documents = await page.locator("ul.list > li").evaluateAll((items) =>
    items.map((li) => {
      const anchor = li.querySelector("a");
      const href = anchor?.getAttribute("href") ?? "";

      let name = "";
      const nameEl = anchor?.querySelector(".name p");
      if (nameEl) {
        // Work on a clone so stripping the <em> elements does not modify the
        // live page.
        const clone = nameEl.cloneNode(true) as Element;
        clone.querySelectorAll("em").forEach((em) => em.remove());
        name = clone.textContent?.trim() ?? "";
      }

      return { name, href };
    })
  );

  // Convert to CSV
  const csv = Papa.unparse(documents);
  await fs.writeFile(OUTPUT_FILE, csv, "utf-8");

  // Report the path that was actually written.
  console.log(`✅ Saved to ${OUTPUT_FILE}`);
});
|