karpanGit / pyspark, local vs global views.py
Created May 3, 2022 19:18
pyspark, local vs global views
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('learn')
    # .config('spark.sql.shuffle.partitions', 10)
    # .config('spark.default.parallelism', 10)
    # .config('spark.executor.memory', '1g')
    # .config('spark.driver.memory', '1g')
    # .config('spark.executor.instances', 1)
    # .config('spark.executor.cores', 2)
    .getOrCreate()
)
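# (not in the original gist preview) a minimal sketch of the local vs global temp view
# distinction the gist title refers to; the view names 'people' and 'people_glob' are made up
df = spark.createDataFrame([[1, 'a'], [2, 'b']], schema=['id', 'name'])
df.createOrReplaceTempView('people')             # local: visible only to this SparkSession
df.createOrReplaceGlobalTempView('people_glob')  # global: shared by all sessions of the app
spark.sql('SELECT * FROM people').show()
# global temp views live in the reserved global_temp database
spark.sql('SELECT * FROM global_temp.people_glob').show()
spark.newSession().sql('SELECT * FROM global_temp.people_glob').show()  # still visible in a new session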
karpanGit / pyspark, create struct from columns.py
Created May 1, 2022 14:20
pyspark, create struct from columns
# simple example, create struct
import pyspark.sql.functions as F
df = [[1, 'mplah', 'gogo'], [2, 'mplah2', 'gogo2'], [3, 'mplah3', 'gogo3']]
df = spark.createDataFrame(df, schema=['x', 'y', 'z'])
res = df.select(F.col('x'), F.struct(F.col('x').alias('_x'), F.col('y').alias('_y')).alias('_xy'))
res.show()
# +---+-----------+
# |  x|        _xy|
# +---+-----------+
# |  1| {1, mplah}|
# |  2|{2, mplah2}|
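# (not in the original gist preview) hedged follow-up: the struct fields can be pulled back
# out with dotted column paths; '_xy._x' and '_xy._y' refer to the struct created above
res.select(F.col('_xy._x'), F.col('_xy._y')).show()
res.select('_xy.*').show()  # expand every field of the struct into top-level columns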
karpanGit / pyspark, explode array, explode list, collect_list, collect_set.py
Created May 1, 2022 14:17
pyspark, explode array, explode list, collect_list, collect_set
# simple example: explode array
import pyspark.sql.functions as F
df = [[1, 'mplah', ['a', 'b', 'c']], [2, 'mplah2', ['a2', 'b2', 'c2']], [3, 'mplah3', ['a3', 'b3', 'c3', 'd3']]]
df = spark.createDataFrame(df, schema=['x', 'y', 'z'])
df.printSchema()
res = df.select(F.col('x'), F.explode(F.col('z')).alias('z'))
res.show(truncate=False)
res = df.select(F.col('x'), F.posexplode(F.col('z')).alias('z_id', 'z'))
res.show(truncate=False)
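# (not in the original gist preview) hedged sketch of the collect_list / collect_set part the
# gist title mentions; they aggregate values back into an array, undoing the explode above
exploded = df.select(F.col('x'), F.explode(F.col('z')).alias('z'))
res = exploded.groupBy('x').agg(
    F.collect_list('z').alias('z_list'),   # keeps duplicates, order not guaranteed
    F.collect_set('z').alias('z_set')      # drops duplicates
)
res.show(truncate=False)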
karpanGit / pyspark, from pyspark to json and back.py
Created May 1, 2022 09:38
pyspark, from pyspark to json and back
# from pyspark schema to json and the other way round
import pyspark.sql.types as T
from pprint import pprint
# -- create simple dataframe and schema
df = [[1, 'mplah', ['Panos', 'George'], {'a': 'b', 'c': 'd'}, ('mplip1', 'mplip1_')], [2, 'mplah2', ['Panos2', 'George2'], {'a2': 'b2', 'c2': 'd2'}, ('mplip2', 'mplip2_')] ]
schema = T.StructType([
    T.StructField('x1', T.LongType()),
    T.StructField('x2', T.StringType()),
    T.StructField('x3', T.ArrayType(T.StringType())),
    T.StructField('x4', T.MapType(T.StringType(), T.StringType())),
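# (not in the original gist preview) hedged sketch of the round trip the gist title describes,
# assuming the StructType above is completed: serialise the schema to JSON and rebuild it
import json
schema_as_json = schema.json()          # JSON string representation of the StructType
pprint(json.loads(schema_as_json))      # same thing as a plain dict, easier to read
rebuilt = T.StructType.fromJson(json.loads(schema_as_json))
assert rebuilt == schema                # the rebuilt schema is identical to the original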
karpanGit / pyspark, read csv by specifying date and datetime format.py
Created May 1, 2022 08:39
pyspark, read csv by specifying date and datetime format
# note the FAILFAST mode. It is much preferred to PERMISSIVE to catch errors early
# read dates and datetimes using the default ISO format
# date: yyyy-MM-dd
# datetime: yyyy-MM-ddTHH:mm:ss.SSS
import pyspark.sql.types as T
with open(r'D:/junk/tmp.csv', 'wt') as f:
    f.write('1\t2022-10-03\t2022-10-03T06:02:01.657\n')
    f.write('1\t2022-10-13\t2021-10-03T06:32:01.001')
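# (not in the original gist preview) hedged sketch of the read the comments above describe;
# the column names, tab separator and FAILFAST mode are assumptions matching the file written above
schema = T.StructType([
    T.StructField('id', T.LongType()),
    T.StructField('d', T.DateType()),
    T.StructField('ts', T.TimestampType()),
])
df = (
    spark.read
    .option('sep', '\t')
    .option('mode', 'FAILFAST')                              # fail immediately on malformed rows
    .option('dateFormat', 'yyyy-MM-dd')                      # the ISO defaults, spelled out
    .option('timestampFormat', "yyyy-MM-dd'T'HH:mm:ss.SSS")
    .csv(r'D:/junk/tmp.csv', schema=schema)
)
df.show(truncate=False)
df.printSchema()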
karpanGit / pyspark, manually create schema containing complex columns, populate dataframe and extract data.py
Created April 30, 2022 13:23
pyspark, manually create schema containing complex columns, populate dataframe and extract data
# create a simple schema and populate an example dataframe
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType
childSchema = StructType([
    StructField('child name', StringType(), nullable=False),
    StructField('child age', LongType(), nullable=False)
])
schema = StructType([
    StructField('name', StringType(), nullable=False),
    StructField('age', LongType(), nullable=False),
    StructField('children', ArrayType(childSchema, containsNull=False), nullable=False)
])
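# (not in the original gist preview) hedged sketch of the populate/extract part of the gist
# title; the example rows below are made up for illustration
import pyspark.sql.functions as F
data = [
    ('Alice', 40, [('Bob', 12), ('Carol', 9)]),
    ('Dave', 35, [('Eve', 5)]),
]
df = spark.createDataFrame(data, schema=schema)
df.printSchema()
# explode the children array into one row per child, then pull out the struct fields
# (the field names contain spaces, so getField is used instead of dotted paths)
res = df.select('name', F.explode('children').alias('child'))
res.select('name',
           F.col('child').getField('child name').alias('child_name'),
           F.col('child').getField('child age').alias('child_age')).show()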
karpanGit / pyspark, extract data from structs with scalars and structs with arrays.py
Created April 30, 2022 13:09
pyspark, extract data from structs with scalars and structs with arrays
# extract data from array of structs, nested
import json
from pyspark.sql.types import *
structureSchema = StructType([
    StructField('name', ArrayType(StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ]))),
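# (not in the original gist preview) hedged, self-contained sketch of the extraction the gist
# title refers to; the small schema and rows below are made up but mirror the 'name' field above
import pyspark.sql.functions as F
nameSchema = StructType([
    StructField('name', ArrayType(StructType([
        StructField('firstname', StringType(), True),
        StructField('lastname', StringType(), True)
    ])))
])
df = spark.createDataFrame([([('John', 'Smith'), ('Jane', 'Doe')],)], schema=nameSchema)
# a dotted path on an array of structs returns an array holding that field from every element
df.select(F.col('name.firstname')).show(truncate=False)   # [John, Jane]
# explode to get one row per struct, then pick scalar fields
df.select(F.explode('name').alias('n')).select('n.firstname', 'n.lastname').show()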
karpanGit / pyspark, create map from arrays.py
Last active April 30, 2022 12:06
pyspark, create map from arrays
# experiment with map
import pyspark.sql.functions as f
# experiment 1
df = [[1, 2], [3, 4]]
df = spark.createDataFrame(df, schema=['a', 'b'])
res = df.select(f.array([f.lit(col) for col in df.columns]).alias('names'), f.array(f.col('a'), f.col('b')).alias('values'))
res.printSchema()
res = res.select(f.map_from_arrays(f.col('names'), f.col('values')).alias('mapped'))
res.printSchema()
# |-- mapped: map (nullable = false)
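# (not in the original gist preview) hedged follow-up: values can be looked up in the map by
# key; the keys here are the literal column names 'a' and 'b' captured above
res.select(f.col('mapped').getItem('a').alias('a'),
           f.col('mapped').getItem('b').alias('b')).show()
res.select(f.map_keys('mapped'), f.map_values('mapped')).show()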
karpanGit / pyspark, read json.py
Created April 27, 2022 05:48
pyspark, read json
# read single singleline json
ex1 = '''{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }'''
with open(r'./junk/ex1.json', 'wt') as f:
    f.write(ex1)
data = spark.read.json(r'./junk/ex1.json')
data.show()
# +-----+-------+----------------+
# |    a|      b|               c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
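# (not in the original gist preview) hedged sketch of the multiline case the "singleline"
# comment above hints at; a pretty-printed JSON document needs the multiLine option
ex2 = '''{
  "a": "hello",
  "b": "hello 2",
  "c": {"x": [1, 2, 3], "y": "bye"}
}'''
with open(r'./junk/ex2.json', 'wt') as f:
    f.write(ex2)
data = spark.read.option('multiLine', True).json(r'./junk/ex2.json')
data.show()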
karpanGit / python, enforcing type checking.py
Created January 9, 2022 15:43
python, enforcing type checking
def f(a: int, b: str, c: float):
    import inspect
    args = inspect.getfullargspec(f).args
    annotations = inspect.getfullargspec(f).annotations
    # annotations = f.__annotations__
    # print(type(locals()), locals())
    for x in args:
        print(x, '->',
              'arg is', type(locals()[x]), ',',
              'annotation is', annotations[x],
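# (not in the original gist preview) hedged sketch of how the annotations gathered above can
# actually enforce the types; the decorator name 'enforce_types' is made up for illustration
import functools
import inspect

def enforce_types(func):
    spec = inspect.getfullargspec(func)
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # map positional and keyword arguments onto their parameter names
        bound = dict(zip(spec.args, args), **kwargs)
        for name, value in bound.items():
            expected = spec.annotations.get(name)
            if expected is not None and not isinstance(value, expected):
                raise TypeError(f'{name} should be {expected.__name__}, got {type(value).__name__}')
        return func(*args, **kwargs)
    return wrapper

@enforce_types
def g(a: int, b: str, c: float):
    return a, b, c

g(1, 'x', 2.0)       # fine
# g(1, 'x', 'oops')  # raises TypeError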