Converts the GDELT Dataset in S3 to Parquet.
# Get the column names from the GDELT header file (Python 3: urlopen lives in urllib.request and returns bytes)
from urllib.request import urlopen
header = urlopen("http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt").read().decode("utf-8").rstrip()
columns = header.split('\t')
# Load 73,385,698 records from 2016
df1 = spark.read.option("delimiter", "\t").csv("s3://gdelt-open-data/events/2016*")
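# (Optional sanity check, not in the original gist: the header's column count
# should match the raw CSV's before the schema is applied with toDF.)
assert len(columns) == len(df1.columns), "header/CSV column count mismatch"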
# Apply the schema
df2 = df1.toDF(*columns)
# Derive Month and Day from SQLDATE (YYYYMMDD); a Year column already exists in the GDELT schema
from pyspark.sql.functions import expr
df3 = df2.withColumn("Month", expr("substring(SQLDATE, 5, 2)")).withColumn("Day", expr("substring(SQLDATE, 7, 2)"))
# Write to Parquet in S3, partitioned by Year/Month/Day
cols = ["Year", "Month", "Day"]
df3.repartition(*cols).write.mode("append").partitionBy(*cols).parquet("s3://<bucket>/gdelt/")
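# (Usage sketch, not in the original gist: reading the partitioned output back.
# Filtering on the Year/Month/Day partition columns lets Spark prune partitions
# and scan only the requested slice of the dataset.)
gdelt = spark.read.parquet("s3://<bucket>/gdelt/")
gdelt.where("Year = 2016").select("GLOBALEVENTID", "SQLDATE", "EventCode").show(5)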