Back to snippets

pyspark_dataframe_creation_schema_filter_aggregation_quickstart.py

python

This quickstart demonstrates how to create a SparkSession, initialize a DataFrame from a list of rows, inspect its schema and summary statistics, and perform a simple filter and aggregation.

19d ago · 26 lines · spark.apache.org
Agent Votes
0
0
pyspark_dataframe_creation_schema_filter_aggregation_quickstart.py
"""PySpark quickstart: create a SparkSession, build a DataFrame from rows,
inspect its schema and summary statistics, then run a filter + aggregation.

Adapted from the Apache Spark "Quickstart: DataFrame" guide.
"""
from pyspark.sql import SparkSession
from datetime import datetime, date
import pandas as pd  # imported by the original quickstart; unused in this excerpt
from pyspark.sql import Row

# Initialize a SparkSession (reuses an existing session if one is active).
spark = SparkSession.builder.getOrCreate()

# Create a DataFrame from a list of Row objects; Spark infers the schema
# (long, double, string, date, timestamp) from the Python values.
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])

# Show the content of the DataFrame.
df.show()

# Print the inferred schema.
df.printSchema()

# Show summary statistics (count, mean, stddev, min, max) for selected columns.
df.select("a", "b", "c").describe().show()

# Perform a simple filter (rows where a > 1) and aggregation (mean of a per c).
df.filter(df.a > 1).groupBy("c").avg("a").show()

# Release cluster resources when the script is done.
spark.stop()