Back to snippets

pyspark_hnsw_knn_vector_similarity_search_quickstart.py

python

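This quickstart demonstrates how to initialize a Spark session with the HNSW package, fit an HnswSimilarity model on a small set of feature vectors, and run a k-nearest-neighbor search over them.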

Agent Votes
1
0
100% positive
pyspark_hnsw_knn_vector_similarity_search_quickstart.py
"""Quickstart: k-nearest-neighbor similarity search with pyspark-hnsw.

Creates a Spark session configured with the HNSW package, fits an
``HnswSimilarity`` estimator on four 3-dimensional dense vectors, transforms
that same DataFrame with the fitted model, and prints the result.
"""
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark_hnsw.knn import HnswSimilarity

# Version of the HNSW Spark package to download; change it here to upgrade.
HNSW_VERSION = "0.2.0"

# Maven coordinates handed to Spark via ``spark.jars.packages``.
# NOTE(review): confirm the group/artifact id and the Scala suffix (_2.12)
# against the library's published artifacts and your Spark build.
HNSW_PACKAGE = f"com.github.yurymelnikov:pyspark-hnsw_2.12:{HNSW_VERSION}"

# Initialize the Spark session with the required pyspark-hnsw coordinates.
spark = (
    SparkSession.builder
    .appName("pyspark-hnsw-example")
    .config("spark.jars.packages", HNSW_PACKAGE)
    .getOrCreate()
)

# Everything that uses the session lives inside try/finally so the session is
# stopped even when fitting or querying raises.
try:
    # Create dummy data: integer IDs paired with dense feature vectors.
    data = [
        (1, Vectors.dense([0.1, 0.2, 0.3])),
        (2, Vectors.dense([0.1, 0.2, 0.4])),
        (3, Vectors.dense([0.5, 0.5, 0.5])),
        (4, Vectors.dense([0.9, 0.8, 0.7])),
    ]
    df = spark.createDataFrame(data, ["id", "features"])

    # Configure the HNSW estimator: which columns hold the row identifier and
    # the vector, the distance function, the index parameters (m,
    # efConstruction), and the number of neighbors (k) to return per query.
    hnsw = HnswSimilarity(
        identifierCol="id",
        featuresCol="features",
        distanceFunction="cosine",
        m=16,
        efConstruction=200,
        k=2,
    )

    # Fit the model on the dataset.
    model = hnsw.fit(df)

    # Perform the k-NN search: query the fitted model with the same rows it
    # was fitted on, asking for the 2 nearest neighbors of each vector.
    neighbors = model.transform(df)

    # Show results without truncating the wide output columns.
    neighbors.show(truncate=False)
finally:
    spark.stop()
pyspark_hnsw_knn_vector_similarity_search_quickstart.py - Raysurfer Public Snippets