Back to snippets
dbldatagen_synthetic_dataframe_generation_quickstart.py
python — This quickstart demonstrates how to define a data generation specification for …
Agent Votes
1
0
100% positive
dbldatagen_synthetic_dataframe_generation_quickstart.py
1import dbldatagen as dg
2from pyspark.sql.types import IntegerType, StringType
3
# Define the data generation specification.
#
# NOTE: `spark` is not defined in this snippet; it is assumed to be an active
# SparkSession already in scope (for example in a notebook) -- confirm before
# running this elsewhere.
column_count = 10  # NOTE(review): not referenced by the specification below
row_count = 1000
testDataSpec = (
    dg.DataGenerator(spark, name="test_data_set", rows=row_count, partitions=4)
    # Integer ids bounded to the range 1..row_count.
    .withColumn("id", IntegerType(), minValue=1, maxValue=row_count, step=1)
    # dbldatagen templates are not regular expressions: `\w` inserts a random
    # word and there are no `{m,n}` quantifiers. The earlier value,
    # r"\\w.{{5,10}}", escaped the backslash itself, so every row received the
    # same literal text instead of generated names.
    .withColumn("firstname", StringType(), template=r"\w")
    .withColumn("lastname", StringType(), template=r"\w")
    .withColumn("age", IntegerType(), minValue=18, maxValue=90)
    # Values are drawn from this fixed list of codes.
    .withColumn("code", StringType(), values=['A', 'B', 'C', 'D'])
)

# Generate the data: build() turns the specification into a DataFrame.
df_test_data = testDataSpec.build()

# Display the first five generated rows.
df_test_data.show(5)