# pandas
df[col5] = pd.to_datetime(df[col5], errors='coerce')
# pyspark
data_regex = r"\d{2,4}(\.|\-|\/|\\)+\d{2,4}(\.|\-|\/|\\)+\d{2,4}(\s)*(\d{2}\:\d{2}\:\d{2})?(\.\d{3})?|^$"
# only parse values that actually look like a date; rlike checks whether the
# string matches data_regex (regexp_replace(...).isNotNull() would be true for
# any non-null string)
df = df.withColumn(col5, F.when(F.col(col5).rlike(data_regex),
                                F.to_timestamp(F.col(col5), 'yyyy/MM/dd')).otherwise(None))
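A quick way to sanity-check the pandas version is a throwaway frame (the column name and sample values here are invented for illustration): errors='coerce' turns anything unparseable into NaT instead of raising.
import pandas as pd

demo = pd.DataFrame({'col5': ['2021/03/15', '2021/04/01', 'not a date', None]})
demo['col5'] = pd.to_datetime(demo['col5'], errors='coerce')
print(demo['col5'].tolist())  # last two entries come back as NaT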
# pandas
# pd.isna handles None and every NaN variant (NaN fails == comparisons, so list membership is unreliable)
df[col4] = df[col4].apply(lambda m: None if pd.isna(m) else int(float(m)))
# pyspark
def floatint(x):
    return int(float(x))

# give the udf an explicit return type; otherwise it would emit strings
int_udf = F.udf(lambda m: None if m is None else floatint(m), 'int')
df = df.withColumn(col4, F.when(F.col(col4).isNotNull(), int_udf(F.col(col4))).otherwise(None))
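Rough usage sketch for the pandas side (the values are made up): strings like '3.0' become proper ints, while missing values stay missing instead of blowing up in int(float(...)).
import pandas as pd

demo = pd.DataFrame({'col4': ['3.0', '7', None, float('nan')]})
demo['col4'] = demo['col4'].apply(lambda m: None if pd.isna(m) else int(float(m)))
print(demo['col4'].tolist())  # 3 and 7 as ints, missing entries stay null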
# pandas
df[col3] = df[col3].replace(regex=r"[^0-9.]", value="")  # keep only digits and dots
df[col3] = pd.to_numeric(df[col3], errors='coerce')
# pyspark
df = df.withColumn(col3, F.regexp_replace(F.col(col3), r"[^0-9.]", '').cast("double"))
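To see what the character strip buys you, here is the pandas pair run on a few invented messy strings:
import pandas as pd

demo = pd.DataFrame({'col3': ['$1,234.56', ' 42 ', 'n/a']})
demo['col3'] = demo['col3'].replace(regex=r"[^0-9.]", value="")
demo['col3'] = pd.to_numeric(demo['col3'], errors='coerce')
print(demo['col3'].tolist())  # [1234.56, 42.0, nan]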
# pandas
row_ct = df.shape[0]
num_ct = pd.to_numeric(df[col3], errors='coerce').count() # coerce makes nan, count drops nan
# another check using regex
num_regex = r"^((-)?[0-9]+)(,[0-9]+)*(\.[0-9]+)?$|(^$)"
all_are_nums = all(df[col3].fillna('').astype(str).apply(lambda x: re.match(num_regex, x)))
if (num_ct == row_ct) or all_are_nums:
    df[col3] = pd.to_numeric(df[col3], errors='coerce')
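A compact demonstration of the guard (frame contents are hypothetical): the cast only happens when every non-empty cell looks numeric, so a genuinely textual column is left alone.
import re
import pandas as pd

num_regex = r"^((-)?[0-9]+)(,[0-9]+)*(\.[0-9]+)?$|(^$)"
demo = pd.DataFrame({'col3': ['1', '2.5', '']})
ok = all(demo['col3'].fillna('').astype(str).apply(lambda x: re.match(num_regex, x)))
if ok:
    demo['col3'] = pd.to_numeric(demo['col3'], errors='coerce')
print(demo['col3'].tolist())  # [1.0, 2.5, nan]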
# pandas
df[col2] = np.where(df[col2] == 1, True, False)
# pyspark
df = df.withColumn(col2, F.when(F.col(col2) == 1, True).otherwise(False))
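For completeness, the same flag conversion on toy data (values invented); note that nulls and anything other than 1 fall through to False.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'col2': [1, 0, None, 1]})
demo['col2'] = np.where(demo['col2'] == 1, True, False)
print(demo['col2'].tolist())  # [True, False, False, True]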
yvan / pandas-pyspark-simple.py
Simple Transform
# in pandas
df[col1] = df[col1]*5
# in spark
df = df.withColumn(col1, F.col(col1)*5)
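To actually run any of the pyspark one-liners above you need a session and an F alias; a minimal, self-contained sketch (the session setup and sample data are assumptions, not part of the gist) looks like this:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("pandas-pyspark-simple").getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["col1"])
df = df.withColumn("col1", F.col("col1") * 5)
df.show()  # col1 becomes 5, 10, 15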
# enumerate-iter wraps any iterable and yields [index value] pairs,
# mirroring Python's enumerate below
(defn enumerate-iter
  [iter]
  (var i -1)
  (generate [val :in iter]
    (++ i)
    [i val]))

# buf can be any iterable, e.g. (def buf ["a" "some" "data"])
(loop [val :generate (enumerate-iter buf)]
  (print val))
iterable = ["a", "some", "data"]
for i,value in enumerate(iterable):
print("this is the index", i)
print("this is the vale", value)
section .data
hash db '1234'          ; 4-byte buffer the code below points at

section .text
global _start

_start:
    mov edx,4           ; edx: byte count for the int 0x80 write convention
    mov ecx,hash        ; ecx: pointer to the buffer
    mov ebx,1           ; ebx: file descriptor 1 (stdout)