Skip to content

Instantly share code, notes, and snippets.

@t3rmin4t0r
Created January 28, 2019 06:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save t3rmin4t0r/93126eb9315ffbbe08bcc7b02e1159c5 to your computer and use it in GitHub Desktop.
Save t3rmin4t0r/93126eb9315ffbbe08bcc7b02e1159c5 to your computer and use it in GitHub Desktop.
TPC-DS schema parser
import sys
import re
PK_POS=re.compile(r'\(([0-9])\)')
class SourceCol(object):
def __init__(self, data):
"""
['ss_sold_date_sk', 'identifier', '', '', 'd_date_sk', 'Y']
['c_customer_id (B)', 'char(16)', 'N', 'U']
"""
self.name = data[0].replace("(B)","").strip()
self.type = data[1].strip()
self.notnull = False
self.unique = False
self.pkpos = -1
self.pk = False
self.fk = None
self.partition_key = False
if (len(data) >= 3):
self.notnull = data[2]=='N'
if (len(data) >= 4):
self.pk = data[3] == 'Y'
m = PK_POS.search(self.name)
if m:
self.pkpos = int(m.group(1))
self.name = self.name.replace("(%d)" % self.pkpos, "").strip()
self.unique = data[3] == 'U'
if len(data) >= 5:
self.fk = data[4]
if len(data) >= 6:
self.partition_key=data[5] == 'Y'
def __repr__(self):
return "Col(%s %s, not-null=%s, pk=%s, pkpos=%d, unique=%s, fk=%s, partition_key=%s)" % (self.name, self.type, self.notnull, self.pk, self.pkpos, self.unique, self.fk, self.partition_key)
CTAS_EXT="""
create external table if not exists %(name)s(
%(cols)s
)
row format delimited fields terminated by '|'
location '/tmp/tpcds-generate/%(scale)d/%(name)s';
"""
CTAS_FLAT="""
create table if not exists %(name)s stored as ORC as
select * from tpcds_text_exact_%(scale)d.%(name)s
cluster by %(pks)s;
"""
CTAS_PARTITIONED="""
create table if not exists %(name)s(
%(other_coltypes)s
)
partitioned by (%(part_col)s %(part_type)s)
stored as ORC
;
from
tpcds_text_exact_%(scale)d.%(name)s src
insert overwrite table %(name)s partition (%(part_col)s)
select
%(other_cols)s
, %(part_col)s
where %(part_col)s is not null
insert overwrite table %(name)s partition (%(part_col)s)
select
%(other_cols)s
, %(part_col)s
where %(part_col)s is null
sort by %(pks)s
;
"""
class SourceTable(object):
def __init__(self):
self.name = None
self.cols = []
def feed(self, data):
if data[1] == 'Datatype':
tbl = SourceTable()
tbl.name = data[0].lower()
return tbl
else:
self.cols.append(SourceCol(data))
self.partitioned = [x for x in self.cols if x.partition_key]
self.pks = [x for x in self.cols if x.pk]
return self
def ctas_external(self, scale=10000, exact=True):
cols = ['%s %s' % (c.name, self.xform(c.type, exact)) for c in self.cols]
return CTAS_EXT % {'name' : self.name, 'cols' : "\n, ".join(cols), 'scale' : scale}
def xform(self, t, exact):
exacts = {"identifier" : "int", "bigidentifier" : "bigint", "integer" : "int"}
if exact:
if exacts.has_key(t.lower()):
return exacts[t.lower()]
return t.lower()
def ctas(self, scale=10000, exact=True):
if (self.partitioned):
partition = self.partitioned[0]
comma = "\n, "
params = {'name' : self.name, 'scale' : scale, 'pks' : ",".join([x.name for x in self.pks])}
params['other_coltypes'] = comma.join(['%s %s' % (c.name, self.xform(c.type, exact)) for c in self.cols if not c.partition_key])
params['other_cols'] = comma.join(['%s' % (c.name) for c in self.cols if not c.partition_key])
params['part_col'] = partition.name
params['part_type'] = self.xform(partition.type, exact)
return CTAS_PARTITIONED % params
else:
return CTAS_FLAT % {'name' : self.name, 'scale' : scale, 'pks' : ",".join([x.name for x in self.pks])}
def __repr__(self):
return "Table<%s (%d cols)%s>" % (self.name, len(self.cols), (self.partitioned and " partition=%s" % (self.partitioned[0].name)) or "")
def main(args):
scale = 10000
if (args):
scale = int(args[0])
lines = [l.strip() for l in open("tpcds-schema.tsv")]
tbl = SourceTable()
tables = set([])
for l in lines:
tbl = tbl.feed(l.split("\t"))
tables.add(tbl)
print "create database if not exists tpcds_text_exact_%d;" % scale
print "use tpcds_text_exact_%d;" % scale
for tbl in sorted(tables):
print "-- %s" % tbl
print tbl.ctas_external(scale=scale)
print "create database if not exists tpcds_orc_partitioned_exact_%d;" % scale
print "use tpcds_orc_partitioned_exact_%d;" % scale
for tbl in sorted(tables):
print "-- %s" % tbl
print tbl.ctas(scale=scale)
if __name__ == "__main__":
main(sys.argv[1:])
We can make this file beautiful and searchable if this error is corrected: It looks like row 13 should actually have 1 column, instead of 2. in line 12.
store_sales Datatype NULLs Primary Key Foreign Key Partition Key
ss_sold_date_sk identifier d_date_sk Y
ss_sold_time_sk identifier t_time_sk
ss_item_sk (1) identifier N Y i_item_sk
ss_customer_sk identifier c_customer_sk
ss_cdemo_sk identifier cd_demo_sk
ss_hdemo_sk identifier hd_demo_sk
ss_addr_sk identifier ca_address_sk
ss_store_sk identifier s_store_sk
ss_promo_sk identifier p_promo_sk
ss_ticket_number (2) bigidentifier N Y
ss_quantity integer
ss_wholesale_cost decimal(7,2)
ss_list_price decimal(7,2)
ss_sales_price decimal(7,2)
ss_ext_discount_amt decimal(7,2)
ss_ext_sales_price decimal(7,2)
ss_ext_wholesale_cost decimal(7,2)
ss_ext_list_price decimal(7,2)
ss_ext_tax decimal(7,2)
ss_coupon_amt decimal(7,2)
ss_net_paid decimal(7,2)
ss_net_paid_inc_tax decimal(7,2)
ss_net_profit decimal(7,2)
store_returns Datatype NULLs Primary Key Foreign Key Partition Key
sr_returned_date_sk identifier d_date_sk Y
sr_return_time_sk identifier t_time_sk
sr_item_sk (1) identifier N Y i_item_sk,ss_item_sk
sr_customer_sk identifier c_customer_sk
sr_cdemo_sk identifier cd_demo_sk
sr_hdemo_sk identifier hd_demo_sk
sr_addr_sk identifier ca_address_sk
sr_store_sk identifier s_store_sk
sr_reason_sk identifier r_reason_sk
sr_ticket_number (2) bigidentifier N Y ss_ticket_number
sr_return_quantity integer
sr_return_amt decimal(7,2)
sr_return_tax decimal(7,2)
sr_return_amt_inc_tax decimal(7,2)
sr_fee decimal(7,2)
sr_return_ship_cost decimal(7,2)
sr_refunded_cash decimal(7,2)
sr_reversed_charge decimal(7,2)
sr_store_credit decimal(7,2)
sr_net_loss decimal(7,2)
catalog_sales Datatype NULLs Primary Key Foreign Key Partition Key
cs_sold_date_sk identifier d_date_sk Y
cs_sold_time_sk identifier t_time_sk
cs_ship_date_sk identifier d_date_sk
cs_bill_customer_sk identifier c_customer_sk
cs_bill_cdemo_sk identifier cd_demo_sk
cs_bill_hdemo_sk identifier hd_demo_sk
cs_bill_addr_sk identifier ca_address_sk
cs_ship_customer_sk identifier c_customer_sk
cs_ship_cdemo_sk identifier cd_demo_sk
cs_ship_hdemo_sk identifier hd_demo_sk
cs_ship_addr_sk identifier ca_address_sk
cs_call_center_sk identifier cc_call_center_sk
cs_catalog_page_sk identifier cp_catalog_page_sk
cs_ship_mode_sk identifier sm_ship_mode_sk
cs_warehouse_sk identifier w_warehouse_sk
cs_item_sk (1) identifier N Y i_item_sk
cs_promo_sk identifier p_promo_sk
cs_order_number (2) bigidentifier N Y
cs_quantity integer
cs_wholesale_cost decimal(7,2)
cs_list_price decimal(7,2)
cs_sales_price decimal(7,2)
cs_ext_discount_amt decimal(7,2)
cs_ext_sales_price decimal(7,2)
cs_ext_wholesale_cost decimal(7,2)
cs_ext_list_price decimal(7,2)
cs_ext_tax decimal(7,2)
cs_coupon_amt decimal(7,2)
cs_ext_ship_cost decimal(7,2)
cs_net_paid decimal(7,2)
cs_net_paid_inc_tax decimal(7,2)
cs_net_paid_inc_ship decimal(7,2)
cs_net_paid_inc_ship_tax decimal(7,2)
cs_net_profit decimal(7,2)
Catalog_returns Datatype NULLs Primary Key Foreign Key Partition Key
cr_returned_date_sk identifier d_date_sk Y
cr_returned_time_sk identifier t_time_sk
cr_item_sk (1) identifier N Y i_item_sk,cs_item_sk
cr_refunded_customer_sk identifier c_customer_sk
cr_refunded_cdemo_sk identifier cd_demo_sk
cr_refunded_hdemo_sk identifier hd_demo_sk
cr_refunded_addr_sk identifier ca_address_sk
cr_returning_customer_sk identifier c_customer_sk
cr_returning_cdemo_sk identifier cd_demo_sk
cr_returning_hdemo_sk identifier hd_demo_sk
cr_returning_addr_sk identifier ca_address_sk
cr_call_center_sk identifier cc_call_center_sk
cr_catalog_page_sk identifier cp_catalog_page_sk
cr_ship_mode_sk identifier sm_ship_mode_sk
cr_warehouse_sk identifier w_warehouse_sk
cr_reason_sk identifier r_reason_sk
cr_order_number (2) bigidentifier N Y cs_order_number
cr_return_quantity integer
cr_return_amount decimal(7,2)
cr_return_tax decimal(7,2)
cr_return_amt_inc_tax decimal(7,2)
cr_fee decimal(7,2)
cr_return_ship_cost decimal(7,2)
cr_refunded_cash decimal(7,2)
cr_reversed_charge decimal(7,2)
cr_store_credit decimal(7,2)
cr_net_loss decimal(7,2)
Web_sales Datatype NULLs Primary Key Foreign Key Partition Key
ws_sold_date_sk identifier d_date_sk Y
ws_sold_time_sk identifier t_time_sk
ws_ship_date_sk identifier d_date_sk
ws_item_sk (1) identifier N Y i_item_sk
ws_bill_customer_sk identifier c_customer_sk
ws_bill_cdemo_sk identifier cd_demo_sk
ws_bill_hdemo_sk identifier hd_demo_sk
ws_bill_addr_sk identifier ca_address_sk
ws_ship_customer_sk identifier c_customer_sk
ws_ship_cdemo_sk identifier cd_demo_sk
ws_ship_hdemo_sk identifier hd_demo_sk
ws_ship_addr_sk identifier ca_address_sk
ws_web_page_sk identifier wp_web_page_sk
ws_web_site_sk identifier web_site_sk
ws_ship_mode_sk identifier sm_ship_mode_sk
ws_warehouse_sk identifier w_warehouse_sk
ws_promo_sk identifier p_promo_sk
ws_order_number (2) bigidentifier N Y
ws_quantity integer
ws_wholesale_cost decimal(7,2)
ws_list_price decimal(7,2)
ws_sales_price decimal(7,2)
ws_ext_discount_amt decimal(7,2)
ws_ext_sales_price decimal(7,2)
ws_ext_wholesale_cost decimal(7,2)
ws_ext_list_price decimal(7,2)
ws_ext_tax decimal(7,2)
ws_coupon_amt decimal(7,2)
ws_ext_ship_cost decimal(7,2)
ws_net_paid decimal(7,2)
ws_net_paid_inc_tax decimal(7,2)
ws_net_paid_inc_ship decimal(7,2)
ws_net_paid_inc_ship_tax decimal(7,2)
ws_net_profit decimal(7,2)
Web_returns Datatype NULLs Primary Key Foreign Key Partition Key
wr_returned_date_sk identifier d_date_sk Y
wr_returned_time_sk identifier t_time_sk
wr_item_sk (2) identifier N Y i_item_sk,ws_item_sk
wr_refunded_customer_sk identifier c_customer_sk
wr_refunded_cdemo_sk identifier cd_demo_sk
wr_refunded_hdemo_sk identifier hd_demo_sk
wr_refunded_addr_sk identifier ca_address_sk
wr_returning_customer_sk identifier c_customer_sk
wr_returning_cdemo_sk identifier cd_demo_sk
wr_returning_hdemo_sk identifier hd_demo_sk
wr_returning_addr_sk identifier ca_address_sk
wr_web_page_sk identifier wp_web_page_sk
wr_reason_sk identifier r_reason_sk
wr_order_number (1) bigidentifier N Y ws_order_number
wr_return_quantity integer
wr_return_amt decimal(7,2)
wr_return_tax decimal(7,2)
wr_return_amt_inc_tax decimal(7,2)
wr_fee decimal(7,2)
wr_return_ship_cost decimal(7,2)
wr_refunded_cash decimal(7,2)
wr_reversed_charge decimal(7,2)
wr_account_credit decimal(7,2)
wr_net_loss decimal(7,2)
Inventory Datatype NULLs Primary Key Foreign Key Partition Key
inv_date_sk (1) identifier N Y d_date_sk
inv_item_sk (2) identifier N Y i_item_sk
inv_warehouse_sk (3) identifier N Y w_warehouse_sk
inv_quantity_on_hand integer
Store Datatype NULLs Primary Key Foreign Key Partition Key
s_store_sk identifier N Y
s_store_id (B) char(16) N N
s_rec_start_date date
s_rec_end_date date
s_closed_date_sk identifier d_date_sk
s_store_name varchar(50)
s_number_employees integer
s_floor_space integer
s_hours char(20)
S_manager varchar(40)
S_market_id integer
S_geography_class varchar(100)
S_market_desc varchar(100)
s_market_manager varchar(40)
s_division_id integer
s_division_name varchar(50)
s_company_id integer
s_company_name varchar(50)
s_street_number varchar(10)
s_street_name varchar(60)
s_street_type char(15)
s_suite_number char(10)
s_city varchar(60)
s_county varchar(30)
s_state char(2)
s_zip char(10)
s_country varchar(20)
s_gmt_offset decimal(5,2)
s_tax_percentage decimal(5,2)
Call_center Datatype NULLs Primary Key Foreign Key Partition Key
cc_call_center_sk identifier N Y
cc_call_center_id (B) char(16) N N
cc_rec_start_date date
cc_rec_end_date date
cc_closed_date_sk integer d_date_sk
cc_open_date_sk integer d_date_sk
cc_name varchar(50)
cc_class varchar(50)
cc_employees integer
cc_sq_ft integer
cc_hours char(20)
cc_manager varchar(40)
cc_mkt_id integer
cc_mkt_class char(50)
cc_mkt_desc varchar(100)
cc_market_manager varchar(40)
cc_division integer
cc_division_name varchar(50)
cc_company integer
cc_company_name char(50)
cc_street_number char(10)
cc_street_name varchar(60)
cc_street_type char(15)
cc_suite_number char(10)
cc_city varchar(60)
cc_county varchar(30)
cc_state char(2)
cc_zip char(10)
cc_country varchar(20)
cc_gmt_offset decimal(5,2)
cc_tax_percentage decimal(5,2)
catalog_page Datatype NULLs Primary Key Foreign Key Partition Key
cp_catalog_page_sk identifier N Y
cp_catalog_page_id (B) char(16) N N
cp_start_date_sk integer d_date_sk
cp_end_date_sk integer d_date_sk
cp_department varchar(50)
cp_catalog_number integer
cp_catalog_page_number integer
cp_description varchar(100)
cp_type varchar(100)
web_site Datatype NULLs Primary Key Foreign Key Partition Key
web_site_sk identifier N Y
web_site_id (B) char(16) N N
web_rec_start_date date
web_rec_end_date date
web_name varchar(50)
web_open_date_sk identifier d_date_sk
web_close_date_sk identifier d_date_sk
web_class varchar(50)
web_manager varchar(40)
web_mkt_id integer
web_mkt_class varchar(50)
web_mkt_desc varchar(100)
web_market_manager varchar(40)
web_company_id integer
web_company_name char(50)
web_street_number char(10)
web_street_name varchar(60)
web_street_type char(15)
web_suite_number char(10)
web_city varchar(60)
web_county varchar(30)
web_state char(2)
web_zip char(10)
web_country varchar(20)
web_gmt_offset decimal(5,2)
web_tax_percentage decimal(5,2)
web_page Datatype NULLs Primary Key Foreign Key Partition Key
wp_web_page_sk identifier N Y
wp_web_page_id (B) char(16) N N
wp_rec_start_date date
wp_rec_end_date date
wp_creation_date_sk identifier d_date_sk
wp_access_date_sk identifier d_date_sk
wp_autogen_flag char(1)
wp_customer_sk identifier c_customer_sk
wp_url varchar(100)
wp_type char(50)
wp_char_count integer
wp_link_count integer
wp_image_count integer
wp_max_ad_count integer
warehouse Datatype NULLs Primary Key Foreign Key Partition Key
w_warehouse_sk identifier N Y
w_warehouse_id (B) char(16) N U
w_warehouse_name varchar(20)
w_warehouse_sq_ft integer
w_street_number char(10)
w_street_name varchar(60)
w_street_type char(15)
w_suite_number char(10)
w_city varchar(60)
w_county varchar(30)
w_state char(2)
w_zip char(10)
w_country varchar(20)
w_gmt_offset decimal(5,2)
customer Datatype NULLs Primary Key Foreign Key Partition Key
c_customer_sk identifier N Y
c_customer_id (B) char(16) N U
c_current_cdemo_sk identifier cd_demo_sk
c_current_hdemo_sk identifier hd_demo_sk
c_current_addr_sk identifier ca_addres_sk
c_first_shipto_date_sk identifier d_date_sk
c_first_sales_date_sk identifier d_date_sk
c_salutation char(10)
c_first_name char(20)
c_last_name char(30)
c_preferred_cust_flag char(1)
c_birth_day integer
c_birth_month integer
c_birth_year integer
c_birth_country varchar(20)
c_login char(13)
c_email_address char(50)
c_last_review_date_sk identifier d_date_sk
customer_address Datatype NULLs Primary Key Foreign Key Partition Key
ca_address_sk identifier N Y
ca_address_id (B) char(16) N U
ca_street_number char(10)
ca_street_name varchar(60)
ca_street_type char(15)
ca_suite_number char(10)
ca_city varchar(60)
ca_county varchar(30)
ca_state char(2)
ca_zip char(10)
ca_country varchar(20)
ca_gmt_offset decimal(5,2)
ca_location_type char(20)
customer_demographics Datatype NULLs Primary Key Foreign Key Partition Key
cd_demo_sk identifier N Y
cd_gender char(1)
cd_marital_status char(1)
cd_education_status char(20)
cd_purchase_estimate integer
cd_credit_rating char(10)
cd_dep_count integer
cd_dep_employed_count integer
cd_dep_college_count integer
date_dim Datatype NULLs Primary Key Foreign Key Partition Key
d_date_sk identifier N Y
d_date_id (B) char(16) N U
d_date date U
d_month_seq integer
d_week_seq integer
d_quarter_seq integer
d_year integer
d_dow integer
d_moy integer
d_dom integer
d_qoy integer
d_fy_year integer
d_fy_quarter_seq integer
d_fy_week_seq integer
d_day_name char(9)
d_quarter_name char(6)
d_holiday char(1)
d_weekend char(1)
d_following_holiday char(1)
d_first_dom integer
d_last_dom integer
d_same_day_ly integer
d_same_day_lq integer
d_current_day char(1)
d_current_week char(1)
d_current_month char(1)
d_current_quarter char(1)
d_current_year char(1)
household_demographics Datatype NULLs Primary Key Foreign Key Partition Key
hd_demo_sk identifier N Y
hd_income_band_sk identifier ib_income_band_sk
hd_buy_potential char(15)
hd_dep_count integer
hd_vehicle_count integer
item Datatype NULLs Primary Key Foreign Key Partition Key
i_item_sk identifier N Y
i_item_id (B) char(16) N N
i_rec_start_date date
i_rec_end_date date
i_item_desc varchar(200)
i_current_price decimal(7,2)
i_wholesale_cost decimal(7,2)
i_brand_id integer
i_brand char(50)
i_class_id integer
i_class char(50)
i_category_id integer
i_category char(50)
i_manufact_id integer
i_manufact char(50)
i_size char(20)
i_formulation char(20)
i_color char(20)
i_units char(10)
i_container char(10)
i_manager_id integer
i_product_name char(50)
income_band Datatype NULLs Primary Key Foreign Key Partition Key
ib_income_band_sk identifier N Y
ib_lower_bound integer
ib_upper_bound integer
promotion Datatype NULLs Primary Key Foreign Key Partition Key
p_promo_sk identifier N Y
p_promo_id (B) char(16) N N
p_start_date_sk identifier d_date_sk
p_end_date_sk identifier d_date_sk
p_item_sk identifier i_item_sk
p_cost decimal(15,2)
p_response_target integer
p_promo_name char(50)
p_channel_dmail char(1)
p_channel_email char(1)
p_channel_catalog char(1)
p_channel_tv char(1)
p_channel_radio char(1)
p_channel_press char(1)
p_channel_event char(1)
p_channel_demo char(1)
p_channel_details varchar(100)
p_purpose char(15)
p_discount_active char(1)
reason Datatype NULLs Primary Key Foreign Key Partition Key
r_reason_sk identifier N Y
r_reason_id (B) char(16) N U
r_reason_desc char(100)
ship_mode Datatype NULLs Primary Key Foreign Key Partition Key
sm_ship_mode_sk identifier N Y
sm_ship_mode_id (B) char(16) N U
sm_type char(30)
sm_code char(10)
sm_carrier char(20)
sm_contract char(20)
time_dim Datatype NULLs Primary Key Foreign Key Partition Key
t_time_sk Identifier N Y
t_time_id (B) char(16) N U
t_time Integer
t_hour Integer
t_minute Integer
t_second Integer
t_am_pm char(2)
t_shift char(20)
t_sub_shift char(20)
t_meal_time char(20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment