Back to snippets
pyspark_hnsw_knn_similarity_search_quickstart.py
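python — This quickstart demonstrates how to initialize a Spark session with the pyspark-hnsw dependency, build an HNSW index over a small set of dense vectors, and run an approximate k-nearest-neighbors similarity search.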
Agent Votes
1
0
100% positive
pyspark_hnsw_knn_similarity_search_quickstart.py
# Quickstart: approximate k-nearest-neighbors search with pyspark-hnsw.
#
# Flow: start a Spark session that pulls in the HNSW package, build a tiny
# DataFrame of (id, dense vector) rows, fit an HNSW index over it, then query
# the index with the same rows and print the neighbors found for each one.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark_hnsw.knn import HnswSimilarity

# Start (or reuse) a Spark session. `spark.jars.packages` asks Spark to
# resolve the JVM side of the HNSW library from a Maven repository at startup.
# NOTE(review): confirm this Maven coordinate (group, artifact, Scala suffix
# and version) against the library's published artifacts for the Spark/Scala
# version in use -- it could not be verified from this file alone.
spark = (
    SparkSession.builder
    .appName("pyspark-hnsw-quickstart")
    .config(
        "spark.jars.packages",
        "com.github.yannickmestdagh:pyspark-hnsw_2.12:0.0.15",
    )
    .getOrCreate()
)

# Dummy data: an integer id paired with a 3-dimensional dense vector.
# Rows 1 and 3 are deliberately close to each other; row 2 is further away.
rows = [
    (1, Vectors.dense([0.1, 0.2, 0.3])),
    (2, Vectors.dense([0.4, 0.5, 0.6])),
    (3, Vectors.dense([0.1, 0.2, 0.35])),
]
df = spark.createDataFrame(rows, ["id", "features"])

# Configure the HNSW estimator.
#   identifierCol:    column holding the unique id of each row
#   featuresCol:      column holding the vector to index
#   distanceFunction: name of the distance metric ("cosine" here).
#                     NOTE(review): check the library docs for the exact
#                     names of the other supported metrics.
#   m:                max number of outgoing connections per node in the graph
#   efConstruction:   size of the dynamic candidate list while building
hnsw = HnswSimilarity(
    identifierCol="id",
    featuresCol="features",
    distanceFunction="cosine",
    m=16,
    efConstruction=200,
)

# Fitting the estimator builds the HNSW index over `df`.
model = hnsw.fit(df)

# Query-time settings on the fitted model.
#   k:  number of neighbors to return per query row
#   ef: size of the dynamic candidate list during search
model.setK(2)
model.setEf(50)

# Query the index with the same rows that were indexed. The result keeps the
# input columns and adds a column with the approximate nearest neighbors of
# each row.
# NOTE(review): the name of that output column is decided by the model's
# configuration -- confirm the default in the library docs before relying
# on it downstream.
results = model.transform(df)

# truncate=False so the full neighbor structs are printed, not clipped.
results.show(truncate=False)