Skip to content

Instantly share code, notes, and snippets.

@ireneisdoomed
Last active January 18, 2023 18:08
Show Gist options
  • Save ireneisdoomed/48fdf643e31a1c6d46187a71e8d19a1d to your computer and use it in GitHub Desktop.
Save ireneisdoomed/48fdf643e31a1c6d46187a71e8d19a1d to your computer and use it in GitHub Desktop.
!pip3 install dbldatagen
import dbldatagen as dg
from pyspark.sql import SparkSession
import pyspark.sql.types as t
# Local Spark session for this example; getOrCreate() returns the already
# active session if there is one instead of building a second.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark")
    .getOrCreate()
)
# Schema of the mock eCaviar coloc dataframe.
# Each entry is (column name, Spark type, nullable); the left_* identifiers
# and coloc_n_vars are declared non-nullable, everything else may be null.
ecaviar_schema = t.StructType(
    [
        t.StructField(column, dtype, nullable)
        for column, dtype, nullable in (
            ("left_chromosome", t.StringType(), False),
            ("left_studyId", t.StringType(), False),
            ("left_leadVariantId", t.StringType(), False),
            ("left_type", t.StringType(), True),
            ("right_chromosome", t.StringType(), True),
            ("right_studyId", t.StringType(), True),
            ("right_leadVariantId", t.StringType(), True),
            ("right_type", t.StringType(), True),
            ("coloc_n_vars", t.LongType(), False),
            ("clpp", t.DoubleType(), True),
        )
    ]
)
# Generate 500 rows of mock data matching `ecaviar_schema` in one partition.
# Only the schema is supplied (no per-column generation rules), so each column
# is filled with dbldatagen's default values -- see the sample output below.
# withTempView=True also asks dbldatagen to register a temporary view for the
# result (presumably named after `name`, i.e. "ecaviar_coloc" -- confirm in
# the dbldatagen docs before querying it by name).
mock_ecaviar = (
    dg.DataGenerator(spark, name="ecaviar_coloc", rows=500, partitions=1)
    .withSchema(ecaviar_schema)
    .build(withTempView=True)
)
# Preview the first five generated rows.
mock_ecaviar.show(5)
>>>
+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
|left_chromosome|left_studyId|left_leadVariantId|left_type|right_chromosome|right_studyId|right_leadVariantId|right_type|coloc_n_vars|clpp|
+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0.0|
| 1| 1| 1| 1| 1| 1| 1| 1| 1| 1.0|
| 2| 2| 2| 2| 2| 2| 2| 2| 2| 2.0|
| 3| 3| 3| 3| 3| 3| 3| 3| 3| 3.0|
| 4| 4| 4| 4| 4| 4| 4| 4| 4| 4.0|
+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
# The table above looks like it is all integers because show() prints string
# values without quotes; the data actually conforms to the schema. Fetching a
# single Row makes this visible: the StringType columns hold str values such
# as '0', coloc_n_vars is an int and clpp is a float.
mock_ecaviar.first()
>>> Row(left_chromosome='0', left_studyId='0', left_leadVariantId='0', left_type='0', right_chromosome='0', right_studyId='0', right_leadVariantId='0', right_type='0', coloc_n_vars=0, clpp=0.0)
# dbldatagen documentation: https://databrickslabs.github.io/dbldatagen/public_docs/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment