Skip to content

Instantly share code, notes, and snippets.

@tzolov
Last active December 22, 2015 05:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tzolov/6422047 to your computer and use it in GitHub Desktop.
Save tzolov/6422047 to your computer and use it in GitHub Desktop.
Testing the multi-VMs Pivotal HD cluster
# Within your vagrant project folder:
git clone https://github.com/gopivotal/pivotal-samples.git
mv pivotal-samples/sample-data ./retail_demo
rm -Rf ./pivotal-samples
# login to the PHD master (and client) node. The ssh password: gpadmin
# phd1 = 10.211.55.101
ssh gpadmin@10.211.55.101
# First download the demo data into your vagrant start directory under folder sample-data
hadoop fs -rm -r /retail_demo
hadoop fs -mkdir /retail_demo
hadoop fs -mkdir /retail_demo/categories_dim
hadoop fs -put /vagrant/sample-data/categories_dim.tsv.gz /retail_demo/categories_dim/
hadoop fs -mkdir /retail_demo/customer_addresses_dim
hadoop fs -put /vagrant/sample-data/customer_addresses_dim.tsv.gz /retail_demo/customer_addresses_dim/
hadoop fs -mkdir /retail_demo/customers_dim
hadoop fs -put /vagrant/sample-data/customers_dim.tsv.gz /retail_demo/customers_dim/
hadoop fs -mkdir /retail_demo/email_addresses_dim
hadoop fs -put /vagrant/sample-data/email_addresses_dim.tsv.gz /retail_demo/email_addresses_dim/
hadoop fs -mkdir /retail_demo/order_lineitems
hadoop fs -put /vagrant/sample-data/order_lineitems.tsv.gz /retail_demo/order_lineitems/
hadoop fs -mkdir /retail_demo/orders
hadoop fs -put /vagrant/sample-data/orders.tsv.gz /retail_demo/orders/
hadoop fs -mkdir /retail_demo/products_dim
hadoop fs -put /vagrant/sample-data/products_dim.tsv.gz /retail_demo/products_dim/
hadoop fs -mkdir /retail_demo/date_dim/
hadoop fs -put /vagrant/sample-data/date_dim.tsv.gz /retail_demo/date_dim/
hadoop fs -mkdir /retail_demo/payment_methods/
hadoop fs -put /vagrant/sample-data/payment_methods.tsv.gz /retail_demo/payment_methods/
hdfs fsck /retail_demo -files
# psql -d postgres
DROP EXTERNAL TABLE retail_demo.categories_dim_pxf;
DROP EXTERNAL TABLE retail_demo.customers_dim_pxf;
DROP EXTERNAL TABLE retail_demo.order_lineitems_pxf;
DROP EXTERNAL TABLE retail_demo.orders_pxf;
DROP EXTERNAL TABLE retail_demo.customer_addresses_dim_pxf;
DROP EXTERNAL TABLE retail_demo.date_dim_pxf;
DROP EXTERNAL TABLE retail_demo.email_addresses_dim_pxf;
DROP EXTERNAL TABLE retail_demo.payment_methods_pxf;
DROP EXTERNAL TABLE retail_demo.products_dim_pxf;
CREATE EXTERNAL TABLE retail_demo.categories_dim_pxf
(
category_id integer,
category_name character varying(400)
)
LOCATION ('pxf://phd1:50070/retail_demo/categories_dim/categories_dim.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.customers_dim_pxf
(
customer_id TEXT,
first_name TEXT,
last_name TEXT,
gender TEXT
)
LOCATION ('pxf://phd1:50070/retail_demo/customers_dim/customers_dim.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.order_lineitems_pxf
(
order_id TEXT,
order_item_id TEXT,
product_id TEXT,
product_name TEXT,
customer_id TEXT,
store_id TEXT,
item_shipment_status_code TEXT,
order_datetime TEXT,
ship_datetime TEXT,
item_return_datetime TEXT,
item_refund_datetime TEXT,
product_category_id TEXT,
product_category_name TEXT,
payment_method_code TEXT,
tax_amount TEXT,
item_quantity TEXT,
item_price TEXT,
discount_amount TEXT,
coupon_code TEXT,
coupon_amount TEXT,
ship_address_line1 TEXT,
ship_address_line2 TEXT,
ship_address_line3 TEXT,
ship_address_city TEXT,
ship_address_state TEXT,
ship_address_postal_code TEXT,
ship_address_country TEXT,
ship_phone_number TEXT,
ship_customer_name TEXT,
ship_customer_email_address TEXT,
ordering_session_id TEXT,
website_url TEXT
)
LOCATION ('pxf://phd1:50070/retail_demo/order_lineitems/order_lineitems.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.orders_pxf
(
order_id TEXT,
customer_id TEXT,
store_id TEXT,
order_datetime TEXT,
ship_completion_datetime TEXT,
return_datetime TEXT,
refund_datetime TEXT,
payment_method_code TEXT,
total_tax_amount TEXT,
total_paid_amount TEXT,
total_item_quantity TEXT,
total_discount_amount TEXT,
coupon_code TEXT,
coupon_amount TEXT,
order_canceled_flag TEXT,
has_returned_items_flag TEXT,
has_refunded_items_flag TEXT,
fraud_code TEXT,
fraud_resolution_code TEXT,
billing_address_line1 TEXT,
billing_address_line2 TEXT,
billing_address_line3 TEXT,
billing_address_city TEXT,
billing_address_state TEXT,
billing_address_postal_code TEXT,
billing_address_country TEXT,
billing_phone_number TEXT,
customer_name TEXT,
customer_email_address TEXT,
ordering_session_id TEXT,
website_url TEXT
)
LOCATION ('pxf://phd1:50070/retail_demo/orders/orders.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.customer_addresses_dim_pxf
(
customer_address_id TEXT,
customer_id TEXT,
valid_from_timestamp TEXT,
valid_to_timestamp TEXT,
house_number TEXT,
street_name TEXT,
appt_suite_no TEXT,
city TEXT,
state_code TEXT,
zip_code TEXT,
zip_plus_four TEXT,
country TEXT,
phone_number TEXT
)
LOCATION ('pxf://phd1:50070/retail_demo/customer_addresses_dim/customer_addresses_dim.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.date_dim_pxf
(
calendar_day date,
reporting_year smallint,
reporting_quarter smallint,
reporting_month smallint,
reporting_week smallint,
reporting_dow smallint
)
LOCATION ('pxf://phd1:50070/retail_demo/date_dim/date_dim.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.email_addresses_dim_pxf
(
customer_id TEXT,
email_address TEXT
)
LOCATION ('pxf://phd1:50070/retail_demo/email_addresses_dim/email_addresses_dim.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.payment_methods_pxf
(
payment_method_id smallint,
payment_method_code character varying(20)
)
LOCATION ('pxf://phd1:50070/retail_demo/payment_methods/payment_methods.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
CREATE EXTERNAL TABLE retail_demo.products_dim_pxf
(
product_id TEXT,
category_id TEXT,
price TEXT,
product_name TEXT
)
LOCATION ('pxf://phd1:50070/retail_demo/products_dim/products_dim.tsv.gz?Fragmenter=HdfsDataFragmenter&Accessor=TextFileAccessor&Resolver=TextResolver')
FORMAT 'TEXT' (DELIMITER = E'\t');
select billing_address_postal_code, sum(total_paid_amount::float8) as total, sum(total_tax_amount::float8) as tax
from retail_demo.orders_pxf
group by billing_address_postal_code
order by total desc limit 10;
orders = LOAD '/retail_demo/orders/orders.tsv.gz'
USING PigStorage('\t') AS (
order_id : long,
customer_id : int,
store_id : int,
order_datetime : chararray,
ship_completion_datetime : chararray,
return_datetime : chararray,
refund_datetime : chararray,
payment_method_code : chararray,
total_tax_amount :double,
total_paid_amount : double,
total_item_quantity : int,
total_discount_amount : int,
coupon_code : chararray,
coupon_amount : int,
order_canceled_flag : chararray,
has_returned_items_flag : chararray,
has_refunded_items_flag : chararray,
fraud_code : chararray,
fraud_resolution_code : chararray,
billing_address_line1 : chararray,
billing_address_line2 : chararray,
billing_address_line3 : chararray,
billing_address_city : chararray,
billing_address_state : chararray,
billing_address_postal_code : int,
billing_address_country : chararray,
billing_phone_number : chararray,
customer_name : chararray,
customer_email_address :chararray,
ordering_session_id : int,
website_url : chararray
);
pcode = GROUP orders BY billing_address_postal_code;
revenue_counts = FOREACH pcode GENERATE group as zip, SUM(orders.total_paid_amount) as total,SUM(orders.total_tax_amount);
order_revenue = order revenue_counts by total DESC;
firstten = limit order_revenue 10;
dump firstten;
@tzolov
Copy link
Author

tzolov commented Sep 3, 2013

If you messup your external table HDFS path you will see something like:
ERROR: remote component error (0): There is no pxf servlet listening on the host and port specified in the external table url (libchurl.c:694)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment