Skip to content

Instantly share code, notes, and snippets.

@enkeboll
Last active February 1, 2018 14:19
Show Gist options
  • Save enkeboll/0ffe2e5c21ab032c8035dc19f9efd6b5 to your computer and use it in GitHub Desktop.
Save enkeboll/0ffe2e5c21ab032c8035dc19f9efd6b5 to your computer and use it in GitHub Desktop.
CSV to Parquet
# from http://blogs.quovantis.com/how-to-convert-csv-to-parquet-files/
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, TimestampType
if __name__ == "__main__":
    # Convert a pipe-delimited text file into a Parquet dataset with Spark.
    #
    # Steps: read the input as lines, split each line on "|", apply the
    # explicit schema below, and write the resulting DataFrame as Parquet.
    sc = SparkContext(appName="CSV2Parquet")
    try:
        sqlContext = SQLContext(sc)

        # Every column is declared as a nullable string.
        schema = StructType([
            # still an issue passing empty strings in to other types like TimestampType and IntegerType
            StructField("action_id", StringType(), True),
            StructField("created_at", StringType(), True),
            StructField("category", StringType(), True),
            StructField("user_id", StringType(), True),
            StructField("uuid", StringType(), True),
            StructField("advertising_id", StringType(), True),
            StructField("device_id", StringType(), True),
        ])

        # Each input line becomes a list of field strings, one per "|"-separated
        # value; the lists are matched positionally against the schema fields.
        rdd = sc.textFile("/Users/enkeboll/Downloads/0001_part_00.txt").map(lambda line: line.split("|"))
        df = sqlContext.createDataFrame(rdd, schema)

        # NOTE(review): with the writer's default save mode this is expected to
        # fail if the output directory already exists -- confirm that is the
        # desired behavior before re-running.
        df.write.parquet('/Users/enkeboll/Downloads/parquet')
    finally:
        # Always release the Spark context, even when the conversion fails,
        # so the application does not linger holding resources.
        sc.stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment