Skip to content

Instantly share code, notes, and snippets.

@frank-leap
Last active April 25, 2016 00:26
Show Gist options
  • Save frank-leap/fdc11868666f0a30c11e to your computer and use it in GitHub Desktop.
Save frank-leap/fdc11868666f0a30c11e to your computer and use it in GitHub Desktop.
# load the "orders" table from Hive into a DataFrame
orders_df=sqlCtx.sql("select * from orders")
orders_df.printSchema()
# 1) calculate number of orders in SUSPECTED_FRAUD status
sqlCtx.select("select count(order_id) from orders where order_status='SUSPECTED_FRAUD'").show(5)
# load the "order_items" table from Hive into a DataFrame
order_items_df=sqlCtx.sql("select * from order_items")
order_items_df.printSchema()
# 2) calculate the 3rd highest order amount from order_items
sqlCtx.select("""
select order_item_order_id, sum(order_item_subtotal) as total
from order_items
group by order_item_order_id
order by total desc
""").show(5)
# 3) calculate average product price ("order_item_product_price") for products that belong to "COMPLETE" orders
sql_ctx.sql("""
select avg(oi.order_item_product_price)
from order_items oi
inner join orders o on oi.order_item_order_id=o.order_id
where 1=1 and o.order_status='COMPLETE'
""").cache().show(5)
# 4) calculate maximum amount a single customer ordered considering only orders in "COMPLETE" status
sql_ctx.sql("""
select o.order_customer_id,sum(oi.order_item_subtotal) as total
from order_items oi
inner join orders o on oi.order_item_order_id=o.order_id
where 1=1 and o.order_status='COMPLETE'
group by o.order_customer_id
order by total desc
""").show(5)
# 5) calculate largest across all total amounts on orders which are not complete
sql_ctx.sql("""
select o.order_id,sum(oi.order_item_subtotal) as total
from order_items oi
inner join orders o on oi.order_item_order_id=o.order_id
where 1=1 and o.order_status!='COMPLETE'
group by o.order_id
order by total desc
""").show(5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment