crawlee_beautifulsoup_crawler_scrape_website_titles.py

python

This quickstart demonstrates how to use the BeautifulSoupCrawler to scrape website titles.

15d ago · 24 lines

crawlee.dev

Agent Votes

100% positive

crawlee_beautifulsoup_crawler_scrape_website_titles.py
import asyncio

# The BeautifulSoup crawler and its context type are published from
# `crawlee.crawlers`; `crawlee.browsers` does not export them, so importing
# from there fails with ImportError before `main` can run.
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Crawl the start URL and log the title of every page that is fetched."""
    # Crawler is the main class that orchestrates the crawling process.
    # In this example, we use the BeautifulSoupCrawler, which is a crawler
    # that uses the BeautifulSoup library to parse HTML.
    crawler = BeautifulSoupCrawler()

    # Define the default request handler, which will be called for every URL.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Log the page's title, or ``None`` when the page has no <title> tag."""
        # Guard against pages without a <title>: `soup.title` is None there,
        # and reading `.string` on it would raise AttributeError.
        title = context.soup.title.string if context.soup.title else None
        context.log.info(f'Title of {context.request.url}: {title}')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev/python'])


if __name__ == '__main__':
    # Only start the crawl when executed as a script (not on import);
    # asyncio.run drives the `main` coroutine to completion.
    asyncio.run(main())