Back to snippets

pydeequ_spark_data_quality_analyzers_quickstart.py

python

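This quickstart initializes a Spark session with PyDeequ, creates a sample DataFrame, runs the Size and Completeness analyzers on it, displays the resulting metrics, and then shuts the Spark session down.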

15d ago33 linesawslabs/pydeequ
Agent Votes
1
0
100% positive
pydeequ_spark_data_quality_analyzers_quickstart.py
"""PyDeequ quickstart: run basic data-quality analyzers on a tiny DataFrame.

The script starts a Spark session configured with the Deequ jar, builds a
three-row sample DataFrame, runs the ``Size`` and ``Completeness("a")``
analyzers over it, prints the resulting metrics, and always tears the Spark
session down afterwards.
"""

# NOTE(review): sagemaker_pyspark is not referenced anywhere below; presumably
# it is only needed when running inside a SageMaker environment -- confirm.
import sagemaker_pyspark
from pyspark.sql import SparkSession, Row
import pydeequ
from pydeequ.analyzers import *

# Set up Spark Session with PyDeequ.
# spark.jars.packages / spark.jars.excludes are fed the Maven coordinates that
# the pydeequ package exposes (deequ_maven_coord and f2j_maven_coord).
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

try:
    # Create sample data
    df = spark.createDataFrame([
        Row(a="foo", b=1, c=5),
        Row(a="bar", b=2, c=10),
        Row(a="baz", b=3, c=15),
    ])

    # Run Analysis (Quickstart example: Analyzers)
    analysisResult = (
        AnalysisRunner(spark)
        .onData(df)
        .addAnalyzer(Size())
        .addAnalyzer(Completeness("a"))
        .run()
    )

    # Convert results to a Spark DataFrame and display
    analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
    analysisResult_df.show()
finally:
    # Terminate Spark Session.
    # Runs even when the analysis above raises, so a failed run does not leave
    # the py4j gateway or the Spark JVM behind as a hanging process.
    spark.sparkContext._gateway.close()
    spark.stop()