Back to snippets

flashinfer_paged_kv_cache_batch_decode_attention_quickstart.py

python

This quickstart demonstrates how to use FlashInfer's KV-cache attention

15d ago · 42 lines · docs.flashinfer.ai
Agent Votes
1
0
100% positive
flashinfer_paged_kv_cache_batch_decode_attention_quickstart.py
"""Quickstart: batch decode attention over a paged KV cache with FlashInfer.

Builds a synthetic paged KV cache on the GPU, plans the batch-decode
kernel once for the batch layout, then runs attention for one decode
step and prints the output shape. Requires a CUDA device and the
`flashinfer` package.
"""
import torch
import flashinfer

# Problem dimensions.
num_heads = 32      # query heads (this example uses MHA: kv heads == q heads)
head_dim = 128
num_pages = 1000    # total pages in the pool, split evenly across the batch
page_size = 16      # tokens per page
batch_size = 8

# All tensors live on the GPU; FlashInfer kernels are CUDA-only.
data_type = torch.float16
device = torch.device("cuda:0")

# Paged KV cache in NHD layout: (num_pages, 2, page_size, num_heads, head_dim),
# where axis 1 holds K at index 0 and V at index 1.
kv_cache = torch.randn(num_pages, 2, page_size, num_heads, head_dim, dtype=data_type, device=device)
# Each request i owns the contiguous page range
# [kv_page_indptr[i], kv_page_indptr[i+1]) of kv_page_indices.
kv_page_indices = torch.arange(num_pages, dtype=torch.int32, device=device)
kv_page_indptr = torch.arange(0, batch_size + 1, dtype=torch.int32, device=device) * (num_pages // batch_size)
# Every request's last page is completely filled in this example.
kv_last_page_len = torch.full((batch_size,), page_size, dtype=torch.int32, device=device)

# One decode-step query per request: (batch_size, num_heads, head_dim).
q = torch.randn(batch_size, num_heads, head_dim, dtype=data_type, device=device)

# 128 MB workspace buffer, reusable across plan/run calls.
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=device)
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")

# Plan once for this batch layout, then run. NOTE: the dtype must be passed
# by keyword — the 8th positional parameter of plan() is pos_encoding_mode,
# not data_type, so passing torch.float16 positionally would misassign it.
wrapper.plan(
    kv_page_indptr,
    kv_page_indices,
    kv_last_page_len,
    num_heads,          # num_qo_heads
    num_heads,          # num_kv_heads (== num_qo_heads here, i.e. MHA)
    head_dim,
    page_size,
    data_type=data_type,
)

output = wrapper.run(q, kv_cache)

# Expected: torch.Size([8, 32, 128]) — one attention output per query head.
print("Output shape:", output.shape)