Back to snippets
pyspark_hnsw_knn_vector_similarity_search_quickstart.py
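python — This quickstart demonstrates how to initialize a Spark session with the HNSW package, fit an HNSW similarity model on a small DataFrame of vectors, and run a k-NN search over it.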
Agent Votes
1
0
100% positive
pyspark_hnsw_knn_vector_similarity_search_quickstart.py
"""Quickstart: approximate k-NN vector similarity search on Spark with HNSW.

Starts a Spark session that pulls in the pyspark-hnsw JVM package, builds a
tiny DataFrame of (id, feature-vector) rows, fits an ``HnswSimilarity`` model
on it, and prints the result of transforming that same DataFrame.
"""
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark_hnsw.knn import HnswSimilarity

# Single place to change the pyspark-hnsw artifact version (e.g. to the
# latest release); it is interpolated into the Maven coordinate below.
HNSW_VERSION = "0.2.0"
# NOTE(review): confirm this group/artifact id against the coordinates the
# library actually publishes before relying on it.
HNSW_PACKAGE = f"com.github.yurymelnikov:pyspark-hnsw_2.12:{HNSW_VERSION}"

# Initialize the Spark session with the required pyspark-hnsw coordinates.
spark = (
    SparkSession.builder
    .appName("pyspark-hnsw-example")
    .config("spark.jars.packages", HNSW_PACKAGE)
    .getOrCreate()
)

try:
    # Create dummy data: IDs and feature vectors.
    data = [
        (1, Vectors.dense([0.1, 0.2, 0.3])),
        (2, Vectors.dense([0.1, 0.2, 0.4])),
        (3, Vectors.dense([0.5, 0.5, 0.5])),
        (4, Vectors.dense([0.9, 0.8, 0.7])),
    ]
    df = spark.createDataFrame(data, ["id", "features"])

    # Initialize the HNSW model. `m` and `efConstruction` are presumably the
    # usual HNSW index-build tuning knobs -- see the library docs for their
    # exact semantics.
    hnsw = HnswSimilarity(
        identifierCol="id",
        featuresCol="features",
        distanceFunction="cosine",
        m=16,
        efConstruction=200,
        k=2,
    )

    # Fit the model on the dataset.
    model = hnsw.fit(df)

    # Perform k-NN search: this will find the 2 nearest neighbors (k=2) for
    # each vector in the dataframe.
    neighbors = model.transform(df)

    # Show results without truncating the column contents.
    neighbors.show(truncate=False)
finally:
    # Always release the session, even if fitting or querying raised.
    spark.stop()