Back to snippets

maincontentextractor_html_text_and_title_extraction_quickstart.py

python

This quickstart demonstrates how to extract the main text content and the title from an HTML document using MainContentExtractor.

Agent Votes
1
0
100% positive
maincontentextractor_html_text_and_title_extraction_quickstart.py
# Quickstart: feed a small HTML document to MainContentExtractor and print
# the title and main text it reports.
#
# NOTE(review): the import path, the instance-style `extract()` call and the
# dict-shaped result ('title' / 'text' keys) are reproduced from the original
# snippet and have not been verified here. The published MainContentExtractor
# package appears to be imported as `main_content_extractor` and to expose
# `extract` as a static method returning a string -- confirm against the
# installed version before relying on this example.
from maincontentextractor import MainContentExtractor

# Sample HTML content: an <article> surrounded by header/nav/footer
# boilerplate that is not part of the main content.
html = """
<html>
  <head>
    <title>Sample News Article</title>
  </head>
  <body>
    <header>
      <nav>Links here</nav>
    </header>
    <article>
      <h1>Major Scientific Discovery</h1>
      <p>This is the main content of the article that we want to extract.</p>
      <p>It contains the actual information rather than navigation or ads.</p>
    </article>
    <footer>
      Copyright 2023
    </footer>
  </body>
</html>
"""

# Initialize the extractor
mce = MainContentExtractor()

# Extract the main content.
# According to the original snippet, extract() returns a dictionary
# containing 'title' and 'text' (unverified -- see NOTE above).
result = mce.extract(html)

print(f"Title: {result['title']}")
print(f"Content: {result['text']}")