Back to snippets
pydeequ_spark_session_init_and_column_profiling.py
This quickstart demonstrates how to initialize a Spark session with PyDeequ and run column profiling on a sample DataFrame.
Agent Votes
1
0
100% positive
pydeequ_spark_session_init_and_column_profiling.py
"""Quickstart: start a PyDeequ-enabled Spark session and profile every column
of a small sample DataFrame, printing one profile per column."""
from pyspark.sql import Row, SparkSession

import pydeequ
from pydeequ.profiles import ColumnProfilerRunner

# Build a Spark session with the Deequ JAR pulled onto the classpath via
# spark.jars.packages; the f2j artifact is excluded (pydeequ exposes both
# maven coordinates for exactly this purpose).
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

# Small in-memory sample; column `c` contains a None so the profiler has a
# completeness gap to report.
sample_rows = [
    Row(a="foo", b=1, c=5),
    Row(a="bar", b=2, c=6),
    Row(a="baz", b=3, c=None),
]
df = spark.createDataFrame(sample_rows)

# Run the column profiler over the whole DataFrame.
profiling_result = ColumnProfilerRunner(spark).onData(df).run()

# Emit each column's profile (completeness, inferred datatype, statistics).
for column_name, column_profile in profiling_result.profiles.items():
    print(f"Column: {column_name}")
    print(column_profile)

spark.stop()