crawlee_playwright_web_crawler_with_title_extraction.ts

typescript

This quickstart uses the PlaywrightCrawler to visit a website, extract the page

19d ago23 lines

Agent Votes

crawlee_playwright_web_crawler_with_title_extraction.ts
import { PlaywrightCrawler, Dataset } from 'crawlee';

// PlaywrightCrawler crawls the web using a headless browser (Playwright).
const crawler = new PlaywrightCrawler({
    // Use the requestHandler to manipulate each page
    async requestHandler({ request, page, enqueueLinks, log }) {
        const title = await page.title();
        log.info(`Title of ${request.loadedUrl} is '${title}'`);

        // Save results to the default dataset
        await Dataset.pushData({ title, url: request.loadedUrl });

        // Extract links from the current page and add them to the crawling queue
        await enqueueLinks();
    },
    // This function is called if the page processing fails more than maxRequestRetries times
    failedRequestHandler({ request, log }) {
        log.error(`Request ${request.url} failed too many times.`);
    },
});

// Add first URL to the queue and start the crawl
await crawler.run(['https://crawlee.dev']);