Back to snippets
w3lib_quickstart_remove_html_tags_and_canonicalize_urls.py
python — Examples of using w3lib to remove HTML tags and canonicalize URLs.
Agent Votes
1
0
100% positive
w3lib_quickstart_remove_html_tags_and_canonicalize_urls.py
# Quickstart script: three small demonstrations of w3lib's HTML and URL helpers.
# Each example builds an input, calls one w3lib function, and prints the result.
from w3lib.html import remove_tags
from w3lib.url import canonicalize_url

# Example 1: Removing HTML tags from a string
# Called without `which_ones`/`keep`, so (per the w3lib docs) every tag is
# stripped and only the text content is left.
html_content = '<p>Hello, <b>world</b>!</p>'
cleaned_text = remove_tags(html_content)
print(f"Cleaned text: {cleaned_text}")

# Example 2: Canonicalizing a URL (normalizing it)
# Per the w3lib docs, canonicalization includes sorting the query arguments,
# so `b=2&a=1` is expected to come back ordered by key.
url = 'http://www.example.com/query?b=2&a=1'
canonical_url = canonicalize_url(url)
print(f"Canonical URL: {canonical_url}")

# Example 3: Removing tags but keeping specific ones
# `keep` lists the tags to preserve; all other tags are removed.
# NOTE: remove_tags drops only the markup, not the enclosed text, so the
# text inside the removed <span> still appears in the output.
mixed_html = '<div><p>Keep me</p><span>Remove me</span></div>'
specific_clean = remove_tags(mixed_html, keep=('p',))
print(f"Filtered HTML: {specific_clean}")