Skip to content

Instantly share code, notes, and snippets.

View cesar1091's full-sized avatar
🏠
Working

César Aarón Fernández Niño cesar1091

🏠
Working
View GitHub Profile
# -*- mode: ruby -*-
# vi: set ft=ruby :
# Vagrant configuration (config format version "2") for a VirtualBox VM built
# from the "itversity/centos7spark" box.
# NOTE(review): the `end` keywords that close the two `do` blocks below are not
# visible in this excerpt — confirm the complete Vagrantfile contains them.
Vagrant.configure("2") do|config|
# Base box the VM is created from.
config.vm.box = "itversity/centos7spark"
# Forward guest ports 8888 and 4040 to the same port numbers on the host
# (presumably a notebook server and the Spark application UI — confirm).
config.vm.network "forwarded_port", guest: 8888, host: 8888
config.vm.network "forwarded_port", guest: 4040, host: 4040
# Settings that apply only when the VM runs under the VirtualBox provider.
config.vm.provider "virtualbox" do|vb|
# Number of virtual CPUs assigned to the VM.
vb.cpus = "2"
# Copy the local /data/retail_db directory into HDFS under /public.
hdfs dfs -put /data/retail_db /public
# List the uploaded directory to confirm the copy.
hdfs dfs -ls /public/retail_db
# Read the retail_db orders CSV data from HDFS with an explicit schema and
# save it in Avro format under /user/vagrant/lab1/pregunta1/orders_avro.
# Expects an existing `spark` session; writing with format("avro") needs the
# spark-avro package to be available to that session.
from pyspark.sql.types import *

# Column names and types applied to the CSV columns, in file order.
# No "header" option is set, so every line of the files is read as data.
customSchema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("order_date", DateType(), True),
    StructField("order_customer_id", IntegerType(), True),
    StructField("order_status", StringType(), True),
])

# The explicit schema is used as-is, so the "inferSchema" option (which would
# only trigger an extra pass to guess types) is intentionally not set.
orders = spark.read.schema(customSchema).csv("/public/retail_db/orders")

orders.write.format("avro").save("/user/vagrant/lab1/pregunta1/orders_avro")
# Start the PySpark shell with the spark-avro package (Scala 2.11 build,
# version 2.4.4) added through --packages. Reading or writing with
# format("avro") needs this package, so launch the shell this way first.
pyspark --packages org.apache.spark:spark-avro_2.11:2.4.4
# Read the Avro dataset back from HDFS and inspect it: schema, row count and
# a sample of rows. Expects an existing `spark` session with Avro support.
validate = (
    spark.read
    .format("avro")
    .load("/user/vagrant/lab1/pregunta1/orders_avro")
)
validate.printSchema()
validate.count()
validate.show()
# List the files of the Avro output directory in HDFS.
hdfs dfs -ls /user/vagrant/lab1/pregunta1/orders_avro
# Read the retail_db customers CSV part file from HDFS with an explicit schema.
# Expects an existing `spark` session and the pyspark.sql.types names
# (StructType, StructField, IntegerType, StringType) to already be in scope.
# Field names are applied to the CSV columns by position, so their order must
# match the column order of the file.
customerSchema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("customer_fname", StringType(), True),
    StructField("customer_lname", StringType(), True),
    StructField("customer_email", StringType(), True),
    StructField("customer_password", StringType(), True),
    StructField("customer_street", StringType(), True),
    # In the retail_db customers layout the city column comes before the state
    # column; declaring them the other way round mislabels both columns.
    StructField("customer_city", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("customer_zipcode", StringType(), True),
])

customer = spark.read.schema(customerSchema).csv("/public/retail_db/customers/part-00000")
# List the files of the customer Parquet output directory in HDFS.
# NOTE(review): no step that writes this directory is visible in this excerpt —
# confirm the customers DataFrame is saved there before running this listing.
hdfs dfs -ls /user/vagrant/lab1/pregunta2/customer_parquet
# Load the orders Avro dataset and save its order_id / order_status columns as
# gzip-compressed Parquet. Expects an existing `spark` session with Avro support.
#
# The dataset directory is loaded rather than a single part file: part-file
# names contain a generated identifier that changes on every write, and a
# single file would miss rows if the dataset has more than one part.
orders_avro = spark.read.format("avro").load("/user/vagrant/lab1/pregunta1/orders_avro")

(
    orders_avro
    .select("order_id", "order_status")
    .write
    .option("compression", "gzip")
    .parquet("/user/vagrant/lab1/pregunta3/resultado")
)
# List the files of the gzip-compressed Parquet output directory in HDFS.
hdfs dfs -ls /user/vagrant/lab1/pregunta3/resultado