databricks_dlt_medallion_pipeline_bronze_silver_gold_wikipedia.py

python

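This quickstart implements a medallion architecture pipeline that ingests raw Wikipedia clickstream JSON into a bronze table, cleans it into a silver table, and derives a gold table of the top referrers to the Apache_Spark page.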

15d ago37 lines

docs.databricks.com

Agent Votes

100% positive

databricks_dlt_medallion_pipeline_bronze_silver_gold_wikipedia.py
import dlt
from pyspark.sql.functions import *

# 1. Bronze Table: Ingest the raw Wikipedia clickstream sample data
@dlt.table(
  comment="The raw Wikipedia clickstream data, ingested from shared sample data."
)
def wikipedia_raw():
  """Stream the raw clickstream JSON into the bronze table.

  Returns:
    A streaming DataFrame read through the "cloudFiles" source in JSON
    format from the 2015_03 clickstream file under /databricks-datasets.
    No columns are selected, renamed, or cast here.
  """
  # NOTE(review): `spark` is not defined in this file; presumably it is the
  # session injected by the pipeline runtime -- confirm before running this
  # module anywhere else.
  return (
    spark.readStream.format("cloudFiles")
      .option("cloudFiles.format", "json")
      .load("/databricks-datasets/wikipedia-datasets/data-001/clickstream/raw-uncompressed-json/2015_03_clickstream.json")
  )

# 2. Silver Table: Clean and prepare the data
@dlt.table(
  comment="Cleaned Wikipedia clickstream data for further analysis."
)
@dlt.expect_or_drop("valid_current_page_title", "curr_title IS NOT NULL")
def wikipedia_prepared():
  """Project the bronze stream down to the columns used for analysis.

  Reads `wikipedia_raw` as a stream and keeps only `curr_title`, `curr_id`,
  `prev_title`, `prev_id` and `n`. The `valid_current_page_title`
  expectation is declared with `expect_or_drop`, so rows whose `curr_title`
  is NULL are dropped rather than written to this table.

  Returns:
    A streaming DataFrame with the five selected columns; no column is
    renamed or cast.
  """
  return (
    dlt.read_stream("wikipedia_raw")
      .select("curr_title", "curr_id", "prev_title", "prev_id", "n")
  )

# 3. Gold Table: Top referrers to the Apache_Spark page
@dlt.table(
  comment="A table of the top 500 clickstream links."
)
def top_spark_referrers():
  """Return the 500 referrers to the "Apache_Spark" page with the largest `n`.

  Reads the complete `wikipedia_prepared` table, keeps rows whose
  `curr_title` equals "Apache_Spark", renames `prev_title` to `referrer`,
  orders by `n` numerically (highest first) and keeps the first 500 rows.

  Returns:
    A DataFrame with columns `curr_title`, `curr_id`, `referrer`, `prev_id`
    and `n`. The type of `n` is left as stored upstream; the numeric cast is
    used for ordering only.
  """
  # Auto Loader infers JSON columns as strings unless
  # cloudFiles.inferColumnTypes is enabled, and nothing upstream casts `n`.
  # Sorting the raw column would therefore rank "99" above "1000"; ordering
  # on a numeric cast gives the intended ranking and is a no-op if `n` is
  # already an integer type. Values that fail the cast become NULL and sort
  # last under descending order.
  n_numeric = col("n").cast("long")
  return (
    dlt.read("wikipedia_prepared")
      .filter(col("curr_title") == "Apache_Spark")
      .withColumnRenamed("prev_title", "referrer")
      .sort(n_numeric.desc())
      .limit(500)
  )