ireneisdoomed/Mock Pyspark Data with dbldatagen.py

## Mock Pyspark Data with dbldatagen.py
!pip3 install dbldatagen

import dbldatagen as dg
from pyspark.sql import SparkSession
import pyspark.sql.types as t

# Local-mode Spark session that the data generator below runs against.
spark = SparkSession.builder.master("local[*]").appName("spark").getOrCreate()

# Generate a mock eCaviar coloc dataframe

# Schema of the mock table. The left_chromosome / left_studyId /
# left_leadVariantId columns and coloc_n_vars are declared non-nullable;
# left_type, every right_* column and clpp are nullable.
ecaviar_schema = t.StructType(
    [
        t.StructField("left_chromosome", t.StringType(), False),
        t.StructField("left_studyId", t.StringType(), False),
        t.StructField("left_leadVariantId", t.StringType(), False),
        t.StructField("left_type", t.StringType(), True),
        t.StructField("right_chromosome", t.StringType(), True),
        t.StructField("right_studyId", t.StringType(), True),
        t.StructField("right_leadVariantId", t.StringType(), True),
        t.StructField("right_type", t.StringType(), True),
        t.StructField("coloc_n_vars", t.LongType(), False),
        t.StructField("clpp", t.DoubleType(), True),
    ]
)

# Build 500 rows in a single partition from the schema alone; no per-column
# generation rules are supplied. withTempView=True additionally asks dbldatagen
# to expose the result as a temp view -- presumably named after the generator
# ("ecaviar_coloc"); confirm against the dbldatagen docs.
mock_ecaviar = (
    dg.DataGenerator(spark, name="ecaviar_coloc", rows=500, partitions=1)
    .withSchema(ecaviar_schema)
    .build(withTempView=True)
)

mock_ecaviar.show(5)
>>>
+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
|left_chromosome|left_studyId|left_leadVariantId|left_type|right_chromosome|right_studyId|right_leadVariantId|right_type|coloc_n_vars|clpp|
+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
|              0|           0|                 0|        0|               0|            0|                  0|         0|           0| 0.0|
|              1|           1|                 1|        1|               1|            1|                  1|         1|           1| 1.0|
|              2|           2|                 2|        2|               2|            2|                  2|         2|           2| 2.0|
|              3|           3|                 3|        3|               3|            3|                  3|         3|           3| 3.0|
|              4|           4|                 4|        4|               4|            4|                  4|         4|           4| 4.0|
+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+

# Data looks like it's all integers, but it actually conforms to the schema
mock_ecaviar.first()
>>> Row(left_chromosome='0', left_studyId='0', left_leadVariantId='0', left_type='0', right_chromosome='0', right_studyId='0', right_leadVariantId='0', right_type='0', coloc_n_vars=0, clpp=0.0)

# dbldatagen docs: https://databrickslabs.github.io/dbldatagen/public_docs/
	!pip3 install dbldatagen

	import dbldatagen as dg
	from pyspark.sql import SparkSession
	import pyspark.sql.types as t

	spark = SparkSession.builder.master("local[*]").appName("spark").getOrCreate()

	# Generate a mock eCaviar coloc dataframe

	ecaviar_schema = t.StructType(
	[
	t.StructField("left_chromosome", t.StringType(), False),
	t.StructField("left_studyId", t.StringType(), False),
	t.StructField("left_leadVariantId", t.StringType(), False),
	t.StructField("left_type", t.StringType(), True),
	t.StructField("right_chromosome", t.StringType(), True),
	t.StructField("right_studyId", t.StringType(), True),
	t.StructField("right_leadVariantId", t.StringType(), True),
	t.StructField("right_type", t.StringType(), True),
	t.StructField("coloc_n_vars", t.LongType(), False),
	t.StructField("clpp", t.DoubleType(), True),
	]
	)

	mock_ecaviar = dg.DataGenerator(spark, name="ecaviar_coloc", rows=500, partitions=1).withSchema(ecaviar_schema).build(withTempView=True)

	mock_ecaviar.show(5)
	>>>
	+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
	|left_chromosome|left_studyId|left_leadVariantId|left_type|right_chromosome|right_studyId|right_leadVariantId|right_type|coloc_n_vars|clpp|
	+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+
	|              0|           0|                 0|        0|               0|            0|                  0|         0|           0| 0.0|
	|              1|           1|                 1|        1|               1|            1|                  1|         1|           1| 1.0|
	|              2|           2|                 2|        2|               2|            2|                  2|         2|           2| 2.0|
	|              3|           3|                 3|        3|               3|            3|                  3|         3|           3| 3.0|
	|              4|           4|                 4|        4|               4|            4|                  4|         4|           4| 4.0|
	+---------------+------------+------------------+---------+----------------+-------------+-------------------+----------+------------+----+

	# Data looks like it's all integers, but it actually conforms to the schema
	mock_ecaviar.first()
	>>> Row(left_chromosome='0', left_studyId='0', left_leadVariantId='0', left_type='0', right_chromosome='0', right_studyId='0', right_leadVariantId='0', right_type='0', coloc_n_vars=0, clpp=0.0)

	# dbldatagen docs: https://databrickslabs.github.io/dbldatagen/public_docs/