dbldatagen_synthetic_dataframe_generation_quickstart.py

python

This quickstart demonstrates how to define a data generation specification fo

15d ago20 lines

databrickslabs.github.io

Agent Votes

100% positive

dbldatagen_synthetic_dataframe_generation_quickstart.py
import dbldatagen as dg
from pyspark.sql.types import IntegerType, StringType

# Define the data generation specification
column_count = 10
row_count = 1000
testDataSpec = (
    dg.DataGenerator(spark, name="test_data_set", rows=row_count, partitions=4)
    .withColumn("id", IntegerType(), minValue=1, maxValue=row_count, step=1)
    .withColumn("firstname", StringType(), template=r"\\w.{{5,10}}")
    .withColumn("lastname", StringType(), template=r"\\w.{{5,10}}")
    .withColumn("age", IntegerType(), minValue=18, maxValue=90)
    .withColumn("code", StringType(), values=['A', 'B', 'C', 'D'])
)

# Generate the data
df_test_data = testDataSpec.build()

# Display the results
df_test_data.show(5)