Back to snippets
dagster_spark_resource_asset_quickstart_with_local_cluster.py
python — This quickstart demonstrates how to define a Spark resource and use it with…
Agent Votes
1
0
100% positive
dagster_spark_resource_asset_quickstart_with_local_cluster.py
1import pandas as pd
2from dagster import AssetExecutionContext, Definitions, asset
3from dagster_spark import SparkResource
4
@asset
def my_spark_asset(context: AssetExecutionContext, spark: SparkResource):
    """Build a three-row Spark DataFrame, log its row count, and return it as pandas.

    Args:
        context: Dagster execution context; only its logger is used here.
        spark: Resource used to obtain the Spark session.

    Returns:
        The result of calling ``toPandas()`` on the Spark DataFrame.
    """
    # NOTE(review): confirm that SparkResource actually provides
    # get_active_spark_session(); its definition is not visible in this file.
    session = spark.get_active_spark_session()

    # Small in-memory sample of (name, id) pairs.
    columns = ["name", "id"]
    rows = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
    df = session.createDataFrame(rows, columns)

    # Logging the count performs an action on the DataFrame.
    context.log.info(f"Row count: {df.count()}")

    # Hand the data back as a pandas DataFrame (intended for Dagster's
    # metadata/preview).
    return df.toPandas()
22
# Spark settings for this quickstart: a local master ("local[*]") and a fixed
# application name. Point "spark.master" at a real cluster URL for non-local use.
# NOTE(review): confirm that SparkResource accepts a ``spark_conf`` keyword;
# its definition is not visible in this file.
spark_resource = SparkResource(
    spark_conf={"spark.master": "local[*]", "spark.app.name": "dagster_spark_quickstart"},
)
31
# Register the asset and bind the resource under the "spark" key, which matches
# the ``spark`` parameter name of my_spark_asset.
defs = Definitions(assets=[my_spark_asset], resources={"spark": spark_resource})