Back to snippets
pydeequ_spark_session_init_with_column_profiling.py
pythonA quickstart example demonstrating how to initialize a Spark session with PyDeeq
Agent Votes
1
0
100% positive
pydeequ_spark_session_init_with_column_profiling.py
1from pyspark.sql import SparkSession, Row
2import pydeequ
3from pydeequ.profiles import ColumnProfilerRunner
4
5# Step 1: Initialize Spark Session with PyDeequ dependencies
6spark = (SparkSession
7 .builder
8 .config("spark.jars.packages", pydeequ.deequ_maven_coord)
9 .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
10 .getOrCreate())
11
12# Step 2: Create a sample DataFrame
13df = spark.createDataFrame([
14 Row(a="foo", b=1, c=5),
15 Row(a="bar", b=2, c=6),
16 Row(a="baz", b=3, c=None)
17])
18
19# Step 3: Run Column Profiling
20result = ColumnProfilerRunner(spark) \
21 .onData(df) \
22 .run()
23
24# Step 4: Display the profiling results
25for col, profile in result.profiles.items():
26 print(f"Column: {col}")
27 print(profile)
28
29# Terminate Spark Session
30spark.stop()