Skip to content

Instantly share code, notes, and snippets.

View cesar1091's full-sized avatar
🏠
Working

César Aarón Fernández Niño cesar1091

🏠
Working
View GitHub Profile
#-*-mode: ruby-*-
#vi: set ft=ruby:
Vagrant.configure("2") do|config|
config.vm.box = "itversity/centos7spark"
config.vm.network "forwarded_port", guest: 8888, host: 8888
config.vm.network "forwarded_port", guest: 4040, host: 4040
config.vm.provider "virtualbox" do|vb|
vb.cpus = "2"
# Stage the retail_db sample data in HDFS under /public.
# Create the destination first: if /public does not exist yet, `-put` copies
# the source directory *as* /public rather than *into* it, and the listing
# below would then fail. `-mkdir -p` is a no-op when the directory exists.
hdfs dfs -mkdir -p /public
hdfs dfs -put /data/retail_db /public
# List the uploaded directory to confirm the copy landed at /public/retail_db.
hdfs dfs -ls /public/retail_db
# Start the PySpark shell with the external spark-avro package
# (artifact spark-avro_2.11, version 2.4.4) added via --packages.
pyspark --packages org.apache.spark:spark-avro_2.11:2.4.4
# Read back the Avro data set stored under lab1/pregunta1 and inspect it.
orders_avro_path = "/user/vagrant/lab1/pregunta1/orders_avro"
avro_reader = spark.read.format("avro")
validate = avro_reader.load(orders_avro_path)
# Print the schema, count the rows, and display the first rows.
validate.printSchema()
validate.count()
validate.show()
# List the four lab1 output locations in HDFS, one `-ls` call per path.
for out_dir in pregunta1/orders_avro pregunta2/customer_parquet pregunta3/resultado pregunta4/resultado; do
  hdfs dfs -ls "/user/vagrant/lab1/${out_dir}"
done
# Load the customer Parquet data and register it as the SQL view "customer".
# NOTE(review): this path ends in ".../pregunta2/customer", while the HDFS
# listing elsewhere in these notes uses ".../pregunta2/customer_parquet" —
# confirm which directory actually holds the data.
customer = spark.read.format("parquet").load("/user/vagrant/lab1/pregunta2/customer")
customer.createOrReplaceTempView("customer")
# Select the id, a display name built from the first three characters of the
# first name plus the last name, and the street.
# Fix: this is a PySpark session (see the Python lambda below), so the
# assignment must be plain Python; the Scala-style `val result = ...` is a
# SyntaxError here.
result = spark.sql("select customer_id, concat(substring(customer_fname,1,3),' ', customer_lname) as name, customer_street from customer")
# Write each row as one tab-separated text line, compressed with BZip2.
result.rdd.map(lambda x: "\t".join(map(str,x))).saveAsTextFile("/user/vagrant/lab1/pregunta6/resultado","org.apache.hadoop.io.compress.BZip2Codec")
# Explicit schema for the products CSV file; every column is declared
# nullable (third StructField argument is True).
# NOTE(review): StructType, StructField and the *Type classes come from
# pyspark.sql.types; no import is visible in this snippet — confirm it has
# been run earlier in the session.
ProductSchema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_category_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("product_description", StringType(), True),
    StructField("product_price", FloatType(), True),
    StructField("product_image", StringType(), True),
])
# The schema is supplied explicitly, so the former "inferSchema" option was
# redundant (it has no effect when a schema is given) and has been dropped.
product = spark.read.format("csv").schema(ProductSchema).load("/public/retail_db/products/part-00000")
product.createOrReplaceTempView("product")
# Highest price per product_id.
# NOTE(review): if product_id is unique per row, every group holds a single
# product and max_price is just that product's price — confirm that grouping
# by product_category_id was not the intent.
result = spark.sql("select product_id, max(product_price) as max_price from product group by product_id")