This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build (or reuse) the shared SparkSession used by all examples below.
# Assumes `from pyspark.sql import SparkSession` has been done at file top.
# FIX: the original snippet was truncated and never closed the parenthesis
# opened on the first line, which is a SyntaxError.
spark = (
    SparkSession.builder
    .appName('learn')
    # Tuning knobs kept for reference — uncomment to experiment:
    # .config('spark.sql.shuffle.partitions', 10)
    # .config('spark.default.parallelism', 10)
    # .config('spark.executor.memory', '1g')
    # .config('spark.driver.memory', '1g')
    # .config('spark.executor.instances', 1)
    # .config('spark.executor.cores', 2)
    .getOrCreate()
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple example: pack columns x and y into a single struct column `_xy`.
import pyspark.sql.functions as F

rows = [[1, 'mplah', 'gogo'], [2, 'mplah2', 'gogo2'], [3, 'mplah3', 'gogo3']]
df = spark.createDataFrame(rows, schema=['x', 'y', 'z'])

# F.struct builds a nested column; inner aliases name the struct's fields.
xy_struct = F.struct(F.col('x').alias('_x'), F.col('y').alias('_y')).alias('_xy')
res = df.select(F.col('x'), xy_struct)
res.show()
# | x| _xy|
# +---+-----------+
# | 1| {1, mplah}|
# | 2|{2, mplah2}|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple example: explode an array column into one row per element.
import pyspark.sql.functions as F

rows = [
    [1, 'mplah', ['a', 'b', 'c']],
    [2, 'mplah2', ['a2', 'b2', 'c2']],
    [3, 'mplah3', ['a3', 'b3', 'c3', 'd3']],
]
df = spark.createDataFrame(rows, schema=['x', 'y', 'z'])
df.printSchema()

# explode: one output row per array element.
res = df.select(F.col('x'), F.explode(F.col('z')).alias('z'))
res.show(truncate=False)

# posexplode: same, but also emits the element's position within the array.
res = df.select(F.col('x'), F.posexplode(F.col('z')).alias('z_id', 'z'))
res.show(truncate=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From a pyspark schema to json and the other way round.
# FIX: the original referenced StringType() without the `T.` prefix, which
# raises NameError under `import pyspark.sql.types as T`.
import pyspark.sql.types as T
from pprint import pprint

# -- create simple dataframe and schema
df = [
    [1, 'mplah', ['Panos', 'George'], {'a': 'b', 'c': 'd'}, ('mplip1', 'mplip1_')],
    [2, 'mplah2', ['Panos2', 'George2'], {'a2': 'b2', 'c2': 'd2'}, ('mplip2', 'mplip2_')],
]
schema = T.StructType([
    T.StructField('x1', T.LongType()),
    T.StructField('x2', T.StringType()),
    T.StructField('x3', T.ArrayType(T.StringType())),
    T.StructField('x4', T.MapType(T.StringType(), T.StringType())),
    # NOTE(review): the original snippet is truncated here. The data rows have
    # a fifth element (a 2-tuple of strings), so a struct field is assumed —
    # confirm against the full source.
    T.StructField('x5', T.StructType([
        T.StructField('y1', T.StringType()),
        T.StructField('y2', T.StringType()),
    ])),
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Write a tiny tab-separated fixture with a date and an ISO datetime column.
# Note the FAILFAST read mode (used when reading this back): it is much
# preferred to PERMISSIVE so parse errors surface early instead of nulls.
# Default ISO formats:
#   date:     yyyy-MM-dd
#   datetime: yyyy-MM-ddTHH:mm:ss.SSS
import pyspark.sql.types as T

lines = [
    '1\t2022-10-03\t2022-10-03T06:02:01.657\n',
    '1\t2022-10-13\t2021-10-03T06:32:01.001',
]
with open(r'D:/junk/tmp.csv', 'wt') as f:
    f.writelines(lines)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a simple nested schema: each person has a name, an age, and a
# non-null array of child structs.
# Assumes StructType/StructField/StringType/LongType/ArrayType are in scope
# (e.g. via `from pyspark.sql.types import *`).
child_fields = [
    StructField('child name', StringType(), nullable=False),
    StructField('child age', LongType(), nullable=False),
]
childSchema = StructType(child_fields)

person_fields = [
    StructField('name', StringType(), nullable=False),
    StructField('age', LongType(), nullable=False),
    # containsNull=False: the array itself may not hold null children.
    StructField('children', ArrayType(childSchema, containsNull=False), nullable=False),
]
schema = StructType(person_fields)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract data from an array of structs, nested.
# FIX: the original snippet was truncated mid-definition, leaving the
# StructType list unclosed (SyntaxError); closed here with the visible field.
import json
from pyspark.sql.types import *

# `name` is an array of {firstname, middlename, lastname} structs.
structureSchema = StructType([
    StructField('name', ArrayType(StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ]))),
    # NOTE(review): the original continued past this point; restore any
    # further fields from the full source.
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Experiment with map columns.
# FIX: the original used a lowercase `f.` functions alias that is never
# imported (the rest of the file imports `pyspark.sql.functions as F`);
# import and use `F` consistently.
import pyspark.sql.functions as F

# experiment 1: zip the column names with their values into a single map column.
df = [[1, 2], [3, 4]]
df = spark.createDataFrame(df, schema=['a', 'b'])

res = df.select(
    F.array([F.lit(c) for c in df.columns]).alias('names'),
    F.array(F.col('a'), F.col('b')).alias('values'),
)
res.printSchema()

res = res.select(F.map_from_arrays(F.col('names'), F.col('values')).alias('mapped'))
res.printSchema()
# |-- mapped: map (nullable = false)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read a single, single-line JSON file; the nested object `c` becomes a
# struct column.
ex1 = '''{"a": "hello", "b": "hello 2", "c":{"x":[1,2,3], "y": "bye"} }'''
json_path = r'./junk/ex1.json'

with open(json_path, 'wt') as fh:
    fh.write(ex1)

data = spark.read.json(json_path)
data.show()
# +-----+-------+----------------+
# | a| b| c|
# +-----+-------+----------------+
# |hello|hello 2|{[1, 2, 3], bye}|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def f(a: int, b: str, c: float) -> None:
    """Print each parameter's runtime type next to its declared annotation.

    Demonstrates `inspect.getfullargspec`: ``args`` is the ordered list of
    parameter names and ``annotations`` maps each name to its declared type.
    ``locals()[x]`` fetches the argument value bound to each name.

    FIX: the original snippet was truncated, leaving the print() call
    unclosed (SyntaxError); the call is completed here.
    """
    import inspect
    spec = inspect.getfullargspec(f)
    args = spec.args
    annotations = spec.annotations
    # annotations = f.__annotations__  # equivalent shortcut
    # print(type(locals()), locals())
    for x in args:
        print(x, '->',
              'arg is', type(locals()[x]), ',',
              'annotation is', annotations[x])
NewerOlder