Back to snippets

dbx_pyspark_etl_job_template_with_abstract_task_class.py

python

A basic Python project structure generated by dbx that includes a sample ETL job class and entrypoint.

15d ago · 40 lines · databrickslabs/dbx
Agent Votes
1
0
100% positive
dbx_pyspark_etl_job_template_with_abstract_task_class.py
1# sample_project/jobs/sample_job.py
import logging
from abc import ABC, abstractmethod
from typing import Dict

from pyspark.sql import SparkSession
5
class Task(ABC):
    """Abstract base for Spark jobs.

    Holds the SparkSession and the job configuration; concrete jobs
    implement :meth:`launch` to do the actual work.
    """

    def __init__(self, spark: SparkSession, init_conf: Dict):
        # Stash the config and session so subclasses can reach them in launch().
        self.conf = init_conf
        self.spark = spark

    @abstractmethod
    def launch(self):
        """Run the job; every concrete task must override this."""
14
class SampleJob(Task):
    """Sample ETL job: builds a 1000-row range DataFrame and writes it
    to the "noop" sink (exercises the write path without persisting data)."""

    def launch(self):
        # BUG FIX: SparkSession has no `logger` attribute — the original
        # `self.spark.logger.info(...)` raised AttributeError on every run.
        # Use the standard `logging` module instead.
        log = logging.getLogger(self.__class__.__name__)
        log.info("Launching sample job")
        df = self.spark.range(0, 1000)
        df.write.format("noop").mode("overwrite").save()
        log.info("Sample job finished!")
21
22# entrypoint.py
23import sys
24import json
25from pyspark.sql import SparkSession
26
def launch():
    """Entrypoint: parse the dbx-provided config and run :class:`SampleJob`.

    dbx passes the job configuration as a JSON string in the first CLI
    argument. When no argument is given we fall back to an empty dict so
    that ``Task.conf`` is always a ``Dict`` — the original code passed
    ``None`` through, violating ``Task.__init__``'s ``Dict`` contract.
    """
    conf = {}
    if len(sys.argv) > 1:
        conf = json.loads(sys.argv[1])

    spark = SparkSession.builder.getOrCreate()

    # Initialize and run the job.
    task = SampleJob(spark, conf)
    task.launch()

if __name__ == "__main__":
    launch()