Skip to content

Instantly share code, notes, and snippets.

View cesar1091's full-sized avatar
🏠
Working

César Aarón Fernández Niño cesar1091

🏠
Working
View GitHub Profile
# Register the customer (Parquet) and order_items (ORC) datasets as
# temporary SQL views for the queries that follow.
customer = spark.read.parquet("/user/vagrant/lab1/pregunta2/customer")
customer.createOrReplaceTempView("customer")
order_items = spark.read.orc("/user/vagrant/lab1/pregunta9/resultado")
order_items.createOrReplaceTempView("order_items")
# NOTE(review): this statement is TRUNCATED in the source — the SQL string
# opened on the first line is never closed (the rest of the WHERE clause,
# any GROUP BY, and the closing '")' are missing), so as written this is a
# Python syntax error. Recover the complete query from the original gist
# before running; do not guess at the missing predicate here.
top_customer = spark.sql("select customer_id, customer_fname, count(*) as cant,
sum(order_item_subtotal) as total
from customer a inner join orders b
on a.customer_id = b.order_customer_id inner
join order_items c
on c.order_item_order_id = b.order_id where
# Verify the ORC output written for pregunta 9.
hdfs dfs -ls /user/vagrant/lab1/pregunta9/resultado
# Explicit schema for the retail_db order_items CSV extract.
itemsSchema = StructType([
    StructField("order_item_id", IntegerType(), True),
    StructField("order_item_order_id", IntegerType(), True),
    StructField("order_item_product_id", IntegerType(), True),
    StructField("order_item_quantity", IntegerType(), True),
    StructField("order_item_subtotal", FloatType(), True),
    StructField("order_item_productprice", FloatType(), True),
])

# Read the raw CSV with the explicit schema and persist as uncompressed ORC.
# FIX: dropped the redundant option("inferSchema", "true") — Spark ignores
# inferSchema whenever an explicit schema is supplied, so the option only
# misled readers into thinking types were inferred.
order_items = spark.read.format("csv").schema(itemsSchema).load("/public/retail_db/order_items/part-00000")
order_items.write.format("orc").option("compression","uncompressed").save("/user/vagrant/lab1/pregunta9/resultado")
# Verify the Parquet output written for pregunta 8.
hdfs dfs -ls /user/vagrant/lab1/pregunta8/resultado
# Load raw orders with the externally defined customSchema (dropping the
# redundant inferSchema option, which Spark ignores when a schema is given)
# and register the data as a Hive table.
orders = spark.read.format("csv").schema(customSchema).load("/public/retail_db/orders/part-00000")
orders.write.format("hive").saveAsTable("orders")
# Count orders per calendar month and save the result as uncompressed Parquet.
# BUG FIX: the original pattern 'YYYYMM' used 'Y' (week-based year), which is
# wrong around year boundaries and rejected outright by Spark 3's datetime
# patterns; 'yyyy' is the calendar year that was intended.
result = spark.sql("select count(*) as count,date_format(order_date,'yyyyMM') as month from orders group by date_format(order_date, 'yyyyMM')")
result.write.option("compression","uncompressed").format("parquet").save("/user/vagrant/lab1/pregunta8/resultado")
# Load the products CSV with the externally defined ProductSchema and
# register it as a Hive table.
# FIX: dropped the redundant option("inferSchema","true") — Spark ignores
# inferSchema whenever an explicit schema is supplied.
product = spark.read.format("csv").schema(ProductSchema).load("/public/retail_db/products/part-00000")
product.write.format("hive").saveAsTable("product")
# Verify the BZip2-compressed text output written for pregunta 6.
hdfs dfs -ls /user/vagrant/lab1/pregunta6/resultado
# Export customer id, abbreviated full name (first 3 letters of the first
# name + last name) and street as BZip2-compressed tab-separated text.
customer = spark.read.format("parquet").load("/user/vagrant/lab1/pregunta2/customer")
customer.createOrReplaceTempView("customer")
# BUG FIX: the original line started with Scala's 'val' keyword, which is a
# syntax error in this PySpark (Python) script; the assignment is now plain
# Python.
result = spark.sql("select customer_id, concat(substring(customer_fname,1,3),' ', customer_lname) as name, customer_street from customer")
result.rdd.map(lambda x: "\t".join(map(str,x))).saveAsTextFile("/user/vagrant/lab1/pregunta6/resultado","org.apache.hadoop.io.compress.BZip2Codec")
# Explicit schema for the retail_db products CSV extract.
ProductSchema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_category_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("product_description", StringType(), True),
    StructField("product_price", FloatType(), True),
    StructField("product_image", StringType(), True),
])

# Read products with the explicit schema and expose the data to SQL.
# FIX: dropped the redundant option("inferSchema","true") — Spark ignores
# inferSchema whenever an explicit schema is supplied.
product = spark.read.format("csv").schema(ProductSchema).load("/public/retail_db/products/part-00000")
product.createOrReplaceTempView("product")
# NOTE(review): if product_id is unique, max(product_price) grouped by
# product_id is just each row's own price — confirm the intended grouping key
# (product_category_id seems more likely for a "max price" question).
result =spark.sql("select product_id, max(product_price) as max_price from product group by product_id")
# Verify the output written for pregunta 4.
hdfs dfs -ls /user/vagrant/lab1/pregunta4/resultado