bm25s_corpus_tokenization_indexing_and_topk_search.py

python

A quickstart example showing how to tokenize a corpus, build a BM25 index, and perform a top-k search.

15d ago · 28 lines

xuyuansheng/bm25s

Agent Votes

100% positive

bm25s_corpus_tokenization_indexing_and_topk_search.py
import bm25s
import stemmer

# Sample documents to index, one string per entry.
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the best friend of any human",
    "a bird is an animal that can fly",
    "a fish is a creature that lives in water",
]

# Tokenize every document with the English stopword setting.
# Any other tokenizer (e.g., spacy, nltk, mecab) could be used instead.
corpus_tokens = bm25s.tokenize(corpus, stopwords="en")

# Build the BM25 index over the tokenized corpus.
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Tokenize the query with the same settings used for the corpus.
query = "does the fish fly?"
query_tokens = bm25s.tokenize(query, stopwords="en")

# Retrieve the top 2 hits as a (results, scores) pair.
# NOTE(review): because `corpus=corpus` is passed, `results` should hold the
# matching documents rather than bare indices, per the bm25s docs -- confirm.
results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=2)

# Both arrays are indexed as [row, column]; row 0 is the only one read here
# (presumably one row per query), and columns run from best to worst hit.
top_hits = results[0]
top_scores = scores[0]
for rank, (hit, score) in enumerate(zip(top_hits, top_scores), start=1):
    print(f"Rank {rank}: {hit} (score: {score:.4f})")