Skip to content

Instantly share code, notes, and snippets.

View thangarajan8's full-sized avatar
💭
Learning to How to Learn

itsthanga thangarajan8

💭
Learning to How to Learn
View GitHub Profile
@thangarajan8
thangarajan8 / gist:68640b9cc0958fc925c68d9777f05f63
Created May 10, 2022 12:22
pandas to spark data frame.py
from pyspark.sql.types import *
# Auxiliary functions
# Pandas Types -> Sparks Types
def equivalent_type(f):
    """Return the Spark SQL data type that corresponds to a pandas dtype.

    Args:
        f: The pandas dtype to translate. It is compared with ``==`` against
            dtype name strings such as ``'int64'``.

    Returns:
        A ``pyspark.sql.types`` data type instance. Any dtype that is not
        listed below falls back to ``StringType()``.
    """
    # Keep the ``==`` chain (rather than a dict lookup): equality against the
    # dtype name is the only comparison the original code relies on.
    if f == 'datetime64[ns]':
        # datetime64[ns] carries a time of day; DateType would drop it.
        return TimestampType()
    elif f == 'int64':
        return LongType()
    elif f == 'int32':
        return IntegerType()
    elif f == 'float64':
        # float64 is double precision; FloatType is only 32-bit and would
        # silently lose precision.
        return DoubleType()
    elif f == 'float32':
        return FloatType()
    else:
        return StringType()
import datetime
# text = "april 23 january 11 2020"
text = "enero 01 diciembre 31 2020"
def multi_date_text(text):
    """Translate Spanish month names found in *text* into English.

    Args:
        text: A string of words separated by single spaces. Words that match
            a Spanish month name (in any letter case) are replaced by the
            lower-case English month name; all other words are kept as-is.

    NOTE(review): the visible definition ends after the translation step and
    returns nothing; the rest of the function appears to be cut off — confirm
    against the full source before relying on a return value.
    """
    month_dict = {
        "enero": "january", "febrero": "february", "marzo": "march",
        "abril": "april", "mayo": "may", "junio": "june",
        "julio": "july", "agosto": "august", "septiembre": "september",
        "octubre": "october", "noviembre": "november",
        "diciembre": "december",
    }
    # Look the word up by its lower-cased form. The membership test was
    # already case-insensitive, but the lookup used the raw word and raised
    # KeyError for capitalised input such as "Enero".
    text = [month_dict.get(word.lower(), word) for word in text.split(" ")]
import pandas as pd
import numpy as np
import json
# Java source file whose text is loaded below.
f_path = "ContactsPage.java"
# Read the entire file into ``content`` as text; the handle is closed on exit.
with open(f_path, mode='r') as f:
    content = f.read()
#def flattern_json(d):
# if len(d) == 0:
# return {}
import pandas as pd
import numpy as np
import json
# Java source file whose text is loaded below.
f_path = "HomePage.java"
# Read the entire file into ``content`` as text; the handle is closed on exit.
with open(f_path, mode='r') as f:
    content = f.read()
def flattern_json(d):
if len(d) == 0:
return {}
import json
import javalang as jl
tree = jl.parse.parse(content)
def json_ast_encoder(o):
    """Convert an object that ``json`` cannot serialise into something it can.

    The signature matches what ``json.dumps(..., default=...)`` expects.

    Args:
        o: The object the JSON encoder could not handle.

    Returns:
        * a list for a ``set`` / ``frozenset`` (sorted when the elements are
          mutually comparable, so the output is deterministic),
        * ``o.__dict__`` for any object that has one,
        * an empty string for everything else.
    """
    # Previously only *empty* sets were converted; a non-empty set has no
    # ``__dict__`` and fell through to the "" fallback, losing its contents.
    if isinstance(o, (set, frozenset)):
        try:
            return sorted(o)
        except TypeError:
            # Elements that cannot be ordered: keep them, in iteration order.
            return list(o)
    if hasattr(o, "__dict__"):
        return o.__dict__
    return ""
SELECT
Coalesce(
try(date_parse(multi_date_format, '%Y-%m-%d %H:%i:%s')),
try(date_parse(multi_date_format, '%Y/%m/%d %H:%i:%s')),
try(date_parse(multi_date_format, '%Y/%m/%d')),
try(date_parse(multi_date_format, '%d %M %Y')),
try(date_parse(multi_date_format, '%d %M %Y %H:%i:%s')),
try(date_parse(multi_date_format, '%d/%m/%Y %H:%i:%s')),
try(date_parse(multi_date_format, '%d-%m-%Y %H:%i:%s'))
) as DateConvertedToTimestamp,
SELECT *
FROM
(
SELECT '2021-01-15 13:01:01' AS multi_date_format
UNION ALL
SELECT '2021/01/15 13:01:02'
UNION ALL
SELECT '2021/01/03'
UNION ALL
SELECT '04 JAN 2021'
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
multi_date_format
07/01/2020 13:01
03/01/2020
02/01/2020 13:01
01/01/2020 13:01
05/01/2020 13:01
04-Jan-20
06/01/2020 13:01
select date_parse('2021-12-31 00:00:00','%Y-%m-%d %H:%i:%s')
import pandas as pd
import time
import numpy as np
#http://eforexcel.com/wp/wp-content/uploads/2020/09/5m-Sales-Records.zip
df = pd.read_csv("5m Sales Records.csv")
def filter1(df):
start_time = time.time()
for i in df.Country.unique():