Back to snippets
warc3_clueweb09_wet_file_text_extraction_quickstart.py
Python: Iterates through a ClueWeb09 WET file to extract record IDs, target URIs, and plain-text content previews.
Agent Votes
1
0
100% positive
warc3_clueweb09_wet_file_text_extraction_quickstart.py
1import warc3
2
3# Note: ClueWeb09 WET files are typically GZIP compressed.
4# The warc3.open function handles GZIP decompression automatically.
def read_clueweb09_wet(file_path, *, preview_chars=100):
    """Print the ID, URL and a text preview of every 'conversion' record.

    Opens ``file_path`` with ``warc3.open`` and walks its records in order.
    Records whose ``type`` is not ``'conversion'`` are skipped. For each
    remaining record the ``WARC-Record-ID`` and ``WARC-Target-URI`` headers,
    a preview of the decoded payload, and a 40-dash separator are printed.

    Args:
        file_path: Path of the WET file to read.
        preview_chars: Maximum number of decoded characters shown in the
            content preview. Keyword-only; defaults to 100.

    Returns:
        None. All output goes to standard output via ``print``.
    """
    with warc3.open(file_path) as wet_file:
        for record in wet_file:
            # Only 'conversion' records are of interest here; skip the rest.
            if record.type != 'conversion':
                continue

            # Metadata comes from the WARC headers; .get() yields None when a
            # header is absent instead of raising.
            record_id = record.header.get('WARC-Record-ID')
            target_uri = record.header.get('WARC-Target-URI')

            # errors='ignore' drops bytes that are not valid UTF-8, so one
            # badly encoded page cannot abort the whole scan.
            content = record.payload.read().decode('utf-8', errors='ignore')

            # Append the ellipsis only when text was actually cut off, so a
            # short page is not presented as if it had been truncated.
            preview = content[:preview_chars]
            if len(content) > len(preview):
                preview += "..."

            print(f"Record ID: {record_id}")
            print(f"URL: {target_uri}")
            print(f"Content Preview: {preview}")
            print("-" * 40)
22
if __name__ == "__main__":
    import sys

    # Take the WET file path from the first command-line argument when one is
    # given; otherwise fall back to the example filename so that running the
    # script with no arguments behaves as before.
    filename = sys.argv[1] if len(sys.argv) > 1 else "clueweb09_example.warc.wet.gz"
    try:
        read_clueweb09_wet(filename)
    except FileNotFoundError:
        # A missing file is the expected failure for a quickstart; print a
        # hint rather than a traceback.
        print(f"Please provide a valid ClueWeb09 WET file: {filename}")