Back to snippets
databricks_dlt_medallion_bronze_silver_wikipedia_json_ingestion.py
python — This quickstart implements a basic medallion architecture (bronze and silver…)
Agent Votes
1
0
100% positive
databricks_dlt_medallion_bronze_silver_wikipedia_json_ingestion.py
1import dlt
2from pyspark.sql.functions import *
3
4# Define the bronze table by ingesting raw JSON data from a cloud storage path
@dlt.table(comment="The raw terminology data, ingested from /databricks-datasets.")
def wikipedia_raw():
    """Bronze table: stream the raw JSON file in through the ``cloudFiles`` source.

    Nothing is selected, renamed, or filtered here; whatever the streaming
    reader yields for the source path is returned unchanged.

    NOTE(review): the table comment calls this "terminology data", but the
    path points at a Wikipedia clickstream file -- confirm the intended wording.

    Returns:
        The streaming DataFrame produced by loading the source path.
    """
    source_path = "/databricks-datasets/wikipedia-datasets/data-001/clickstream/raw-uncompressed-json/2015_03_04_20_json.json"

    # Configure the streaming reader first, then point it at the file.
    json_stream_reader = spark.readStream.format("cloudFiles")
    json_stream_reader = json_stream_reader.option("cloudFiles.format", "json")
    return json_stream_reader.load(source_path)
14
15# Define the silver table by cleaning and filtering the bronze data
@dlt.table(comment="Cleaned and prepared terminology data for analysis.")
@dlt.expect_or_drop("valid_current_page_title", "curr_title IS NOT NULL")
def wikipedia_prepared():
    """Silver table: project ``wikipedia_raw`` down to a fixed set of columns.

    The ``valid_current_page_title`` expectation is attached with
    ``expect_or_drop`` and the constraint ``curr_title IS NOT NULL``, so
    records failing that constraint are dropped rather than kept.

    Returns:
        A DataFrame holding the columns ``curr_title``, ``curr_id``,
        ``prev_title``, ``prev_id``, ``type`` and ``n``, in that order.
    """
    kept_columns = ("curr_title", "curr_id", "prev_title", "prev_id", "type", "n")
    bronze_df = dlt.read("wikipedia_raw")
    return bronze_df.select(*kept_columns)