Skip to content

Instantly share code, notes, and snippets.

View cesar1091's full-sized avatar
🏠
Working

César Aarón Fernández Niño cesar1091

🏠
Working
View GitHub Profile
# Register the customer (Parquet) and order_items (ORC) datasets as
# temporary SQL views for the queries that follow.
customer = spark.read.parquet("/user/vagrant/lab1/pregunta2/customer")
customer.createOrReplaceTempView("customer")
order_items = spark.read.orc("/user/vagrant/lab1/pregunta9/resultado")
order_items.createOrReplaceTempView("order_items")
# NOTE(review): this statement is TRUNCATED in the source — the SQL string
# opened on the first line is never closed (the rest of the WHERE clause,
# any GROUP BY, and the closing '")' are missing), so as written this is a
# Python syntax error. Recover the complete query from the original gist
# before running; do not guess at the missing predicate here.
top_customer = spark.sql("select customer_id, customer_fname, count(*) as cant,
sum(order_item_subtotal) as total
from customer a inner join orders b
on a.customer_id = b.order_customer_id inner
join order_items c
on c.order_item_order_id = b.order_id where
# Verify the ORC output written for pregunta 9.
hdfs dfs -ls /user/vagrant/lab1/pregunta9/resultado
# Explicit schema for the retail_db order_items CSV extract.
itemsSchema = StructType([
    StructField("order_item_id", IntegerType(), True),
    StructField("order_item_order_id", IntegerType(), True),
    StructField("order_item_product_id", IntegerType(), True),
    StructField("order_item_quantity", IntegerType(), True),
    StructField("order_item_subtotal", FloatType(), True),
    StructField("order_item_productprice", FloatType(), True),
])

# Read the raw CSV with the explicit schema and persist as uncompressed ORC.
# FIX: dropped the redundant option("inferSchema", "true") — Spark ignores
# inferSchema whenever an explicit schema is supplied, so the option only
# misled readers into thinking types were inferred.
order_items = spark.read.format("csv").schema(itemsSchema).load("/public/retail_db/order_items/part-00000")
order_items.write.format("orc").option("compression","uncompressed").save("/user/vagrant/lab1/pregunta9/resultado")
# Verify the Parquet output written for pregunta 8.
hdfs dfs -ls /user/vagrant/lab1/pregunta8/resultado
# Load raw orders with the externally defined customSchema (dropping the
# redundant inferSchema option, which Spark ignores when a schema is given)
# and register the data as a Hive table.
orders = spark.read.format("csv").schema(customSchema).load("/public/retail_db/orders/part-00000")
orders.write.format("hive").saveAsTable("orders")
# Count orders per calendar month and save the result as uncompressed Parquet.
# BUG FIX: the original pattern 'YYYYMM' used 'Y' (week-based year), which is
# wrong around year boundaries and rejected outright by Spark 3's datetime
# patterns; 'yyyy' is the calendar year that was intended.
result = spark.sql("select count(*) as count,date_format(order_date,'yyyyMM') as month from orders group by date_format(order_date, 'yyyyMM')")
result.write.option("compression","uncompressed").format("parquet").save("/user/vagrant/lab1/pregunta8/resultado")
# Load the products CSV with the externally defined ProductSchema and
# register it as a Hive table.
# FIX: dropped the redundant option("inferSchema","true") — Spark ignores
# inferSchema whenever an explicit schema is supplied.
product = spark.read.format("csv").schema(ProductSchema).load("/public/retail_db/products/part-00000")
product.write.format("hive").saveAsTable("product")
# Verify the BZip2-compressed text output written for pregunta 6.
hdfs dfs -ls /user/vagrant/lab1/pregunta6/resultado
# Export customer id, abbreviated full name (first 3 letters of the first
# name + last name) and street as BZip2-compressed tab-separated text.
customer = spark.read.format("parquet").load("/user/vagrant/lab1/pregunta2/customer")
customer.createOrReplaceTempView("customer")
# BUG FIX: the original line started with Scala's 'val' keyword, which is a
# syntax error in this PySpark (Python) script; the assignment is now plain
# Python.
result = spark.sql("select customer_id, concat(substring(customer_fname,1,3),' ', customer_lname) as name, customer_street from customer")
result.rdd.map(lambda x: "\t".join(map(str,x))).saveAsTextFile("/user/vagrant/lab1/pregunta6/resultado","org.apache.hadoop.io.compress.BZip2Codec")
# Explicit schema for the retail_db products CSV extract.
ProductSchema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_category_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("product_description", StringType(), True),
    StructField("product_price", FloatType(), True),
    StructField("product_image", StringType(), True),
])

# Read products with the explicit schema and expose the data to SQL.
# FIX: dropped the redundant option("inferSchema","true") — Spark ignores
# inferSchema whenever an explicit schema is supplied.
product = spark.read.format("csv").schema(ProductSchema).load("/public/retail_db/products/part-00000")
product.createOrReplaceTempView("product")
# NOTE(review): if product_id is unique, max(product_price) grouped by
# product_id is just each row's own price — confirm the intended grouping key
# (product_category_id seems more likely for a "max price" question).
result =spark.sql("select product_id, max(product_price) as max_price from product group by product_id")
# Verify the output written for pregunta 4.
hdfs dfs -ls /user/vagrant/lab1/pregunta4/resultado