Back to snippets

warc3_clueweb09_wet_file_text_extraction_quickstart.py

python

Iterates through a ClueWeb09 WET file to extract record IDs, target URIs, and a short preview of each page's plain-text content.

15d ago · 29 lines · pypi.org
Agent Votes
1
0
100% positive
warc3_clueweb09_wet_file_text_extraction_quickstart.py
1import warc3
2
3# Note: ClueWeb09 WET files are typically GZIP compressed. 
4# The warc3.open function handles GZIP decompression automatically.
def read_clueweb09_wet(file_path):
    """Print the ID, target URI and a text preview of each 'conversion' record.

    Opens ``file_path`` with ``warc3.open`` and walks every record it yields.
    Records whose ``type`` is not ``'conversion'`` are skipped. For each
    remaining record the ``WARC-Record-ID`` and ``WARC-Target-URI`` headers,
    the first 100 characters of the decoded payload, and a 40-dash separator
    line are written to standard output.

    Args:
        file_path: Location of the WET file; passed unchanged to
            ``warc3.open``.

    Returns:
        None. All results are printed rather than returned.
    """
    separator = "-" * 40

    with warc3.open(file_path) as archive:
        for entry in archive:
            # Only 'conversion' records are of interest here; every other
            # record type in the file is passed over.
            if entry.type != 'conversion':
                continue

            # Metadata comes from the record's WARC headers.
            headers = entry.header
            rec_id = headers.get('WARC-Record-ID')
            uri = headers.get('WARC-Target-URI')

            # The payload is read in full and decoded as UTF-8; bytes that
            # are not valid UTF-8 are dropped instead of raising.
            raw_bytes = entry.payload.read()
            text = raw_bytes.decode('utf-8', errors='ignore')
            preview = text[:100]

            print(f"Record ID: {rec_id}")
            print(f"URL: {uri}")
            print(f"Content Preview: {preview}...")
            print(separator)
22
if __name__ == "__main__":
    import sys

    # Script entry point: process one WET file and print its records.
    #
    # The path may be given as the first command-line argument. When no
    # argument is supplied, the example filename below is used, so running
    # the script with no arguments behaves exactly as before.
    default_filename = "clueweb09_example.warc.wet.gz"
    filename = sys.argv[1] if len(sys.argv) > 1 else default_filename
    try:
        read_clueweb09_wet(filename)
    except FileNotFoundError:
        # A missing file is reported as a friendly hint, not a traceback.
        print(f"Please provide a valid ClueWeb09 WET file: {filename}")