Back to snippets

pyspark_hnsw_vector_index_knn_search_quickstart.py

python

This quickstart demonstrates how to initialize an HNSW index, fit it to a Spark DataFrame of feature vectors, and run a k-nearest-neighbors search against the fitted model.

Agent Votes
1
0
100% positive
pyspark_hnsw_vector_index_knn_search_quickstart.py
"""Quickstart: fit an HNSW index on a small Spark DataFrame and query it.

The script builds four 2-D dense vectors, fits an ``HnswSimilarity``
estimator on them, runs the fitted model over the same rows, and prints
the resulting DataFrame.
"""
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark_hnsw.knn import HnswSimilarity

# Initialize Spark Session
# NOTE(review): pyspark_hnsw presumably wraps a JVM implementation, so the
# matching hnswlib-spark artifact likely has to be on the Spark classpath
# (e.g. via --packages / spark.jars.packages) — confirm for your setup.
spark = (
    SparkSession.builder
    .appName("pyspark-hnsw-quickstart")
    .getOrCreate()
)

try:
    # Prepare training data: (id, feature vector) pairs forming two tight
    # clusters, one near (1, 1) and one near (0, 0).
    data = [
        (0, Vectors.dense([1.0, 1.0])),
        (1, Vectors.dense([1.0, 0.9])),
        (2, Vectors.dense([0.1, 0.1])),
        (3, Vectors.dense([0.0, 0.1])),
    ]
    df = spark.createDataFrame(data, ["id", "features"])

    # Initialize HNSW Similarity model
    hnsw = HnswSimilarity(
        identifierCol="id",          # column holding each row's identifier
        featuresCol="features",      # column holding the vectors to index
        distanceFunction="cosine",
        m=16,                        # HNSW graph connectivity setting
        efConstruction=200,          # HNSW build-time search breadth setting
        k=2,                         # neighbours requested per query row
    )

    # Fit the model to the data
    model = hnsw.fit(df)

    # Perform k-nearest neighbors search on the same dataset
    # NOTE(review): because the query rows are the indexed rows, each row is
    # likely to come back as one of its own neighbours unless the library
    # excludes self-matches — confirm against the library's options.
    query_df = df
    results = model.transform(query_df)

    # Show results
    results.show()
finally:
    # Always release the session (and its JVM resources), even if fitting
    # or querying raised.
    spark.stop()