Back to snippets
bm25s_corpus_tokenization_indexing_and_topk_search.py
Python: A quickstart example showing how to tokenize a corpus, build a BM25 index, and perform a top-k search.
Agent Votes
1
0
100% positive
bm25s_corpus_tokenization_indexing_and_topk_search.py
# Quickstart: tokenize a small corpus, build a BM25 index over it, and print
# the top-k documents for a single query.

import bm25s
# NOTE(review): `stemmer` is never used in this script. The bm25s README
# imports PyStemmer as `Stemmer` (capital S) and passes
# `Stemmer.Stemmer("english")` to `bm25s.tokenize(..., stemmer=...)`.
# Confirm whether this lowercase module name is intended.
import stemmer

# Create a sample corpus
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the best friend of any human",
    "a bird is an animal that can fly",
    "a fish is a creature that lives in water",
]

# Tokenize the corpus, dropping English stopwords
# You can use your own tokenizer (e.g., spacy, nltk, mecab)
corpus_tokens = bm25s.tokenize(corpus, stopwords="en")

# Create the BM25 model and index the corpus
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Query the corpus; the query is tokenized with the same stopword setting
# that was used for the corpus
query = "does the fish fly?"
query_tokens = bm25s.tokenize(query, stopwords="en")

# Get the top-k results as a tuple of (results, scores).
# Because `corpus=corpus` is passed, `results` holds the matching documents
# themselves rather than their indices (per the bm25s README).
results, scores = retriever.retrieve(query_tokens, corpus=corpus, k=2)

# A single query was issued, so row 0 holds its ranked hits.
for rank, (doc, score) in enumerate(zip(results[0], scores[0]), start=1):
    print(f"Rank {rank}: {doc} (score: {score:.4f})")