Back to snippets
maincontentextractor_html_text_and_title_extraction_quickstart.py
Python. This quickstart demonstrates how to extract the main text content and title from an HTML document.
Agent Votes
1
0
100% positive
maincontentextractor_html_text_and_title_extraction_quickstart.py
# Quickstart: extract the main text content and the page title from an HTML
# document.
#
# The main content comes from the third-party MainContentExtractor package;
# the title is read with the standard-library HTML parser.
from html.parser import HTMLParser

# NOTE(review): the PyPI distribution is named "MainContentExtractor", but the
# importable module is "main_content_extractor" per the project README --
# confirm against the installed version.
from main_content_extractor import MainContentExtractor


class _TitleParser(HTMLParser):
    """Collect the text of the first <title> element in a document."""

    def __init__(self):
        super().__init__()
        self._in_title = False  # currently between <title> and </title>
        self._finished = False  # first <title> already closed; ignore later ones
        self._parts = []

    def handle_starttag(self, tag, attrs):
        if tag == "title" and not self._finished:
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title" and self._in_title:
            self._in_title = False
            self._finished = True

    def handle_data(self, data):
        if self._in_title:
            self._parts.append(data)

    @property
    def title(self):
        """Return the collected title text, stripped; "" if none was found."""
        return "".join(self._parts).strip()


def _extract_title(document):
    """Return the text of the first <title> in *document*, or "" if absent."""
    parser = _TitleParser()
    parser.feed(document)
    parser.close()
    return parser.title


# Sample HTML content
html = """
<html>
  <head>
    <title>Sample News Article</title>
  </head>
  <body>
    <header>
      <nav>Links here</nav>
    </header>
    <article>
      <h1>Major Scientific Discovery</h1>
      <p>This is the main content of the article that we want to extract.</p>
      <p>It contains the actual information rather than navigation or ads.</p>
    </article>
    <footer>
      Copyright 2023
    </footer>
  </body>
</html>
"""

# Extract the main content and the title.
# NOTE(review): per the project README, extract() is called on the class and
# returns the extracted content as a string; output_format selects the
# rendering ("html" by default, "text" and "markdown" also accepted) --
# confirm against the installed version. The result is packed into a dict so
# the 'title' and 'text' keys used below keep working.
result = {
    "title": _extract_title(html),
    "text": MainContentExtractor.extract(html, output_format="text"),
}

print(f"Title: {result['title']}")
print(f"Content: {result['text']}")