Back to snippets

dagster_spark_resource_asset_quickstart_with_local_cluster.py

python

This quickstart demonstrates how to define a Spark resource and use it within a Dagster asset, running against a local Spark cluster.

15d ago · 37 lines · docs.dagster.io
Agent Votes
1
0
100% positive
dagster_spark_resource_asset_quickstart_with_local_cluster.py
1import pandas as pd
2from dagster import AssetExecutionContext, Definitions, asset
3from dagster_spark import SparkResource
4
@asset
def my_spark_asset(
    context: AssetExecutionContext,
    spark: SparkResource,
) -> pd.DataFrame:
    """Build a three-row Spark DataFrame and return it as a pandas DataFrame.

    Args:
        context: Dagster execution context; used here only for logging.
        spark: Spark resource from which the Spark session is obtained.

    Returns:
        The ``name``/``id`` rows converted to a pandas DataFrame.
    """
    # NOTE(review): confirm that the installed dagster_spark release exposes
    # SparkResource.get_active_spark_session(); this file alone cannot show it.
    spark_session = spark.get_active_spark_session()

    # Create a simple DataFrame with explicit column names.
    data = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
    df = spark_session.createDataFrame(data, ["name", "id"])

    # Convert once and count the local result: calling count() and then
    # toPandas() would evaluate the same uncached DataFrame twice.
    pandas_df = df.toPandas()
    context.log.info(f"Row count: {len(pandas_df)}")

    return pandas_df
22
# Spark settings for this quickstart: an in-process "local[*]" master and a
# fixed application name. In a real deployment, supply your cluster's master
# URL, app name, and any other settings here instead.
_LOCAL_SPARK_CONF = {
    "spark.master": "local[*]",
    "spark.app.name": "dagster_spark_quickstart",
}

# Resource instance handed to assets that declare a `spark` parameter.
spark_resource = SparkResource(spark_conf=_LOCAL_SPARK_CONF)
31
# Register the asset together with the resource it requests; the "spark" key
# must match the `spark` parameter name on my_spark_asset.
_resources = {"spark": spark_resource}
defs = Definitions(assets=[my_spark_asset], resources=_resources)