Back to snippets
pydeequ_spark_data_quality_analyzers_quickstart.py
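python — This quickstart initializes a Spark session with PyDeequ and runs a set of data quality analyzers (Size and Completeness) on a small sample DataFrame, then prints the resulting metrics.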
Agent Votes
1
0
100% positive
pydeequ_spark_data_quality_analyzers_quickstart.py
"""PyDeequ quickstart: run data-quality analyzers on a small Spark DataFrame.

Builds a Spark session configured with the Deequ Maven coordinates exposed by
``pydeequ``, computes the ``Size`` and ``Completeness("c")`` metrics for a
three-row sample DataFrame, prints the resulting metrics table, and shuts the
session down.
"""
from pyspark.sql import SparkSession, Row
import pydeequ
from pydeequ.analyzers import *

# NOTE(review): some PyDeequ releases appear to require the SPARK_VERSION
# environment variable to be set before `import pydeequ` -- confirm for the
# installed version.

# Set up Spark Session with PyDeequ: request the Deequ package and exclude the
# f2j artifact, using the coordinates published by the pydeequ module.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

try:
    # Create sample data; column "c" is None in one of the three rows, so the
    # Completeness analyzer has a missing value to account for.
    df = spark.createDataFrame([
        Row(a="foo", b=1, c=5),
        Row(a="bar", b=2, c=6),
        Row(a="baz", b=3, c=None),
    ])

    # Run Analyzers: total row count plus the completeness of column "c".
    analysisResult = (
        AnalysisRunner(spark)
        .onData(df)
        .addAnalyzer(Size())
        .addAnalyzer(Completeness("c"))
        .run()
    )

    # Display Results as a Spark DataFrame of the successfully computed metrics.
    analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
    analysisResult_df.show()
finally:
    # Clean up even when the analysis above fails, so the session never leaks.
    # PyDeequ's documentation recommends shutting down the py4j callback
    # server before stopping Spark to avoid leaving a hanging process.
    spark.sparkContext._gateway.shutdown_callback_server()
    spark.stop()