Back to snippets
pydeequ_spark_data_quality_analyzers_quickstart.py
python: This quickstart initializes a Spark session with PyDeequ, creates a sample DataFrame, runs the Size and Completeness analyzers on it, and displays the resulting metrics.
Agent Votes
1
0
100% positive
pydeequ_spark_data_quality_analyzers_quickstart.py
# PyDeequ quickstart (analyzers): start a Spark session configured with the
# Deequ Maven coordinate, build a three-row sample DataFrame, compute the
# Size and Completeness("a") metrics, and print them as a DataFrame.
#
# NOTE(review): recent PyDeequ releases appear to read the SPARK_VERSION
# environment variable when `pydeequ` is imported -- confirm it is set in the
# environment this script runs in.

import sagemaker_pyspark  # not referenced below; kept from the original snippet
from pyspark.sql import SparkSession, Row
import pydeequ
# Wildcard import supplies AnalysisRunner, AnalyzerContext, Size, Completeness.
from pydeequ.analyzers import *

# Set up Spark Session with PyDeequ
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

# Everything that uses the session runs inside try/finally so the session and
# its Py4J gateway are torn down even when data creation or the analysis fails.
try:
    # Create sample data
    df = spark.createDataFrame([
        Row(a="foo", b=1, c=5),
        Row(a="bar", b=2, c=10),
        Row(a="baz", b=3, c=15),
    ])

    # Run Analysis (Quickstart example: Analyzers)
    analysisResult = (
        AnalysisRunner(spark)
        .onData(df)
        .addAnalyzer(Size())
        .addAnalyzer(Completeness("a"))
        .run()
    )

    # Convert results to a Spark DataFrame and display
    analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
    analysisResult_df.show()
finally:
    # Terminate Spark Session: close the Py4J gateway first, then stop Spark,
    # in the same order as the original snippet.
    spark.sparkContext._gateway.close()
    spark.stop()