// Concept-Garant/src/scrape.spec.ts

import { test, expect } from "@playwright/test";
import fs from "fs/promises";
import Papa from "papaparse";

// Destination path of the CSV file written by the scrape test below.
// It is a relative path, so it resolves against the process's current working directory.
const OUTPUT_FILE = "output.csv";

/**
 * Scrapes the document list from the example site and stores it as CSV.
 *
 * Flow:
 *  1. Open the page and check that it rendered a non-empty `<h1>`.
 *  2. Wait for the `ul.list` container to become visible.
 *  3. For every `ul.list > li`, read the anchor's `href` and the text of
 *     `.name p` with any `<em>` children stripped out.
 *  4. Serialise the `{ name, href }` rows with Papa Parse and write them to
 *     `OUTPUT_FILE`.
 *
 * Items without an anchor or without a `.name p` node yield empty strings
 * instead of failing the run.
 */
test("scrape data from example site", async ({ page }) => {
  await page.goto("https://example.com");

  const heading = await page.locator("h1").textContent();
  console.log({ heading });

  // Sanity check: the page must have rendered a non-empty heading.
  expect(heading).toBeTruthy();

  // Wait for the list container to be visible before reading its items.
  // `.first()` keeps the wait tolerant of several `ul.list` elements on the page.
  await page.locator("ul.list").first().waitFor();

  // The callback runs inside the browser, so it must be self-contained and
  // may only return serialisable data.
  const documents = await page.locator("ul.list > li").evaluateAll((items) =>
    items.map((li) => {
      const anchor = li.querySelector("a");
      const href = anchor?.getAttribute("href") ?? "";

      // Work on a deep clone so that stripping <em> never mutates the live page.
      const nameEl = anchor?.querySelector(".name p") ?? null;
      const nameClone = nameEl ? (nameEl.cloneNode(true) as Element) : null;
      nameClone?.querySelectorAll("em").forEach((em) => em.remove());

      const name = nameClone?.textContent?.trim() ?? "";

      return { name, href };
    })
  );

  // Convert the rows to CSV and persist them.
  const csv = Papa.unparse(documents);
  await fs.writeFile(OUTPUT_FILE, csv, "utf-8");

  // Report the path that was actually written.
  console.log(`✅ Saved to ${OUTPUT_FILE}`);
});
/*
 * Revision snapshot: "Initial setup", 2025-05-22 13:02:10 +03:00.
 * The lines below appear to be a copy of the spec above as captured in that
 * revision. They are kept for reference only and are not executed.
 *
 * import { test, expect } from "@playwright/test";
 * import fs from "fs/promises";
 * import Papa from "papaparse";
 *
 * const OUTPUT_FILE = "output.csv";
 *
 * test("scrape data from example site", async ({ page }) => {
 * await page.goto("https://example.com");
 *
 * const heading = await page.locator("h1").textContent();
 * console.log({ heading });
 *
 * expect(heading).toBeTruthy(); // optional check
 *
 * const data = await page.evaluate(() => {
 * const heading = document.querySelector("h1")?.textContent?.trim() ?? "";
 * return { heading };
 * });
 *
 * // Wait for the list to load
 * await page.waitForSelector("ul.list");
 *
 * const documents = await page.$$eval("ul.list > li", (items) =>
 * items.map((li) => {
 * const anchor = li.querySelector("a");
 * const href = anchor?.getAttribute("href") ?? "";
 *
 * const nameEl = anchor?.querySelector(".name p");
 * // Clone the node to strip <em> tags
 * const cleanText = nameEl ? (nameEl.cloneNode(true) as HTMLElement) : null;
 *
 * if (cleanText) {
 * // Remove <em> tags
 * cleanText.querySelectorAll("em").forEach((em) => em.remove());
 * }
 *
 * const name = cleanText?.textContent?.trim() ?? "";
 *
 * return { name, href };
 * })
 * );
 *
 * // Convert to CSV
 * const csv = Papa.unparse(documents);
 * await fs.writeFile(OUTPUT_FILE, csv, "utf-8");
 *
 * console.log("✅ Saved to documents.csv");
 * });
 */