# enkeboll/csv_to_parquet.py

## csv_to_parquet.py
# from http://blogs.quovantis.com/how-to-convert-csv-to-parquet-files/

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, TimestampType


if __name__ == "__main__":
    # Convert a pipe-delimited text file into a Parquet dataset with Spark.
    input_path = "/Users/enkeboll/Downloads/0001_part_00.txt"
    output_path = "/Users/enkeboll/Downloads/parquet"

    # Every column is declared as a nullable string: there is still an issue
    # passing empty strings in to other types like TimestampType and
    # IntegerType, so no typed columns are used here.
    column_names = [
        "action_id",
        "created_at",
        "category",
        "user_id",
        "uuid",
        "advertising_id",
        "device_id",
    ]
    schema = StructType(
        [StructField(name, StringType(), True) for name in column_names]
    )

    sc = SparkContext(appName="CSV2Parquet")
    try:
        sqlContext = SQLContext(sc)

        # Each input line becomes a list of raw field strings, split on "|".
        rdd = sc.textFile(input_path).map(lambda line: line.split("|"))
        df = sqlContext.createDataFrame(rdd, schema)
        df.write.parquet(output_path)
    finally:
        # Always shut the context down, even when the conversion fails, so
        # the Spark resources held by this script are released.
        sc.stop()
	# from http://blogs.quovantis.com/how-to-convert-csv-to-parquet-files/
# NOTE(review): everything from here down repeats the script that starts at
# the top of this file (it looks like a paste/extraction duplicate). Both
# copies write to the same Parquet path, so one of them should probably be
# removed -- confirm which copy is the intended one.

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, TimestampType


if __name__ == "__main__":
    # Convert a pipe-delimited text file into a Parquet dataset with Spark.
    sc = SparkContext(appName="CSV2Parquet")
    try:
        sqlContext = SQLContext(sc)

        schema = StructType([
            # still an issue passing empty strings in to other types like
            # TimestampType and IntegerType, so every column is a nullable
            # string
            StructField("action_id", StringType(), True),
            StructField("created_at", StringType(), True),
            StructField("category", StringType(), True),
            StructField("user_id", StringType(), True),
            StructField("uuid", StringType(), True),
            StructField("advertising_id", StringType(), True),
            StructField("device_id", StringType(), True),
        ])

        # str.split() takes a literal separator, not a regular expression, so
        # the pipe must not be backslash-escaped: "\|" would look for the two
        # characters backslash + pipe and leave every line unsplit.
        source = sc.textFile("/Users/enkeboll/Downloads/0001_part_00.txt")
        rdd = source.map(lambda line: line.split("|"))
        df = sqlContext.createDataFrame(rdd, schema)
        df.write.parquet('/Users/enkeboll/Downloads/parquet')
    finally:
        # Always shut the context down, even when the conversion fails.
        sc.stop()