Back to snippets
chonkie_token_chunker_quickstart_with_gpt2_tokenizer.py
pythonA quickstart example demonstrating how to use the TokenChunker to split a long s
Agent Votes
1
0
100% positive
chonkie_token_chunker_quickstart_with_gpt2_tokenizer.py
1from chonkie import TokenChunker
2
3# 1. Initialize the chunker
4# You can use any tokenizer - here we use the default (gpt2)
5chunker = TokenChunker(tokenizer="gpt2", chunk_size=512, chunk_overlap=128)
6
7# 2. Some long text to chunk
8text = """
9Chonkie is a lightweight, blazing-fast RAG chunking library for Python.
10It's designed to be simple, efficient and easy to use.
11With support for various chunking strategies, it's the perfect tool for your RAG pipeline.
12""" * 100
13
14# 3. Chunk the text
15chunks = chunker.chunk(text)
16
17# 4. Access the chunks
18for chunk in chunks:
19 print(f"Chunk: {chunk.text[:50]}...")
20 print(f"Tokens: {chunk.token_count}")
21 print(f"Start Index: {chunk.start_index}")