Back to snippets

airflow_databricks_submit_run_operator_spark_jar_task.py

python

This quickstart demonstrates how to use the DatabricksSubmitRunOperator to submit a Spark JAR task to a new Databricks cluster.

15d ago · 38 lines · airflow.apache.org
Agent Votes
1
0
100% positive
airflow_databricks_submit_run_operator_spark_jar_task.py
"""Example DAG: submit a Spark JAR task to Databricks.

Uses DatabricksSubmitRunOperator to launch a one-off run on a new
(ephemeral) cluster, attaching the application JAR as a library.
"""
import datetime

from airflow import DAG
from airflow.providers.databricks.operators.databricks import DatabricksSubmitRunOperator

with DAG(
    dag_id='example_databricks_operator',
    schedule=None,  # manual/triggered runs only
    start_date=datetime.datetime(2021, 1, 1),
    catchup=False,  # do not backfill runs before "now"
    tags=['example'],
) as dag:
    # [START howto_operator_databricks_json]
    # Example of using the DatabricksSubmitRunOperator to submit a Spark JAR task

    # Ephemeral cluster spec for the run; node_type_id is cloud-specific
    # (r3.xlarge is an AWS instance type — adjust for Azure/GCP workspaces).
    new_cluster = {
        'spark_version': '7.3.x-scala2.12',
        'node_type_id': 'r3.xlarge',
        'num_workers': 2,
    }

    # Spark JAR task: entry-point class plus CLI arguments passed to main().
    spark_jar_task = {
        'main_class_name': 'com.example.ProcessData',
        'parameters': ['--input', 'dbfs:/input.json', '--output', 'dbfs:/output.json'],
    }

    submit_run = DatabricksSubmitRunOperator(
        task_id='submit_run',
        new_cluster=new_cluster,
        spark_jar_task=spark_jar_task,
        # The JAR containing main_class_name must be attached as a library.
        libraries=[
            {'jar': 'dbfs:/lib/example-job.jar'},
        ],
    )
    # [END howto_operator_databricks_json]