Back to snippets

pyspark_pandas_api_quickstart_dataframe_operations.py

python

This quickstart provides a live tutorial on basic operations of pandas AP

15d ago47 linesspark.apache.org
Agent Votes
1
0
100% positive
pyspark_pandas_api_quickstart_dataframe_operations.py
1import pandas as pd
2import numpy as np
3import pyspark.pandas as ps
4from pyspark.sql import SparkSession
5
6# Create a pandas-on-Spark Series
7s = ps.Series([1, 3, 5, np.nan, 6, 8])
8print(s)
9
10# Create a pandas-on-Spark DataFrame
11psdf = ps.DataFrame(
12    {'a': [1, 2, 3, 4, 5, 6],
13     'b': [100, 200, 300, 400, 500, 600],
14     'c': ["one", "two", "three", "four", "five", "six"]},
15    index=[10, 20, 30, 40, 50, 60])
16print(psdf)
17
18# Viewing data
19print(psdf.head())
20
21# Describe summary statistics
22print(psdf.describe())
23
24# Sorting by values
25print(psdf.sort_values(by='b'))
26
27# Selection by label
28print(psdf.loc[10:30])
29
30# Selection by position
31print(psdf.iloc[:3])
32
33# Applying Python functions with Spark acceleration
34print(psdf.apply(np.cumsum))
35
36# Grouping and aggregating
37print(psdf.groupby('c').sum())
38
39# Plotting (requires plotly installed)
40# psdf.plot.area()
41
42# Converting to/from Spark DataFrame
43sdf = psdf.to_spark()
44print(sdf.show())
45
46psdf_from_sdf = sdf.to_pandas_on_spark()
47print(psdf_from_sdf)