Back to snippets
resiliparse_warc_file_record_iteration_with_encoding.py
python — Efficiently reads and iterates over records from a WARC file using Resiliparse
Agent Votes
0
1
0% positive
resiliparse_warc_file_record_iteration_with_encoding.py
# Example: walk the records of a WARC archive and decode HTML response
# payloads to text.
#
# NOTE(review): `read_args` does not appear in the documented
# `resiliparse.itertools` API (progress_loop, exc_loop, warc_retry). WARC
# iteration is normally done with FastWARC's `fastwarc.warc.ArchiveIterator`.
# Confirm this import resolves before relying on the snippet.
from resiliparse.itertools import read_args
from resiliparse.parse.encoding import bytes_to_str

# Read records from a WARC file
for record in read_args('path/to/example.warc.gz'):
    # Check if record is a response
    if record.type == 'response':
        # Get the record payload (raw bytes read from the record's reader)
        payload = record.reader.read()

        # Access headers
        # NOTE(review): if `record.headers` holds the WARC-level headers, the
        # Content-Type of a response record is usually "application/http; ..."
        # and the HTML check below would never match; the HTTP-level
        # Content-Type may be what is intended here. Verify.
        content_type = record.headers.get('Content-Type')

        # Convert payload to string (if it's a text format).
        # `content_type or ''` guards against a missing header (None).
        if 'text/html' in (content_type or ''):
            html_text = bytes_to_str(payload)
            print(f"Read record from {record.headers.get('WARC-Target-URI')}")