@dvannoy
Last active June 25, 2016 21:11
Commands used for SQL on Hadoop: Getting Started. Run these commands to follow along with the slides from the presentation. They may not be perfect, but they should beat retyping what appears in the slides.
cd /home/cloudera/Downloads
hadoop fs -ls /
hadoop fs -mkdir -p /data/baseball/team/
hadoop fs -copyFromLocal baseball/team.csv /data/baseball/team/
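# optional sanity check (not in the slides): confirm the file landed in HDFS before defining the table
hadoop fs -ls /data/baseball/team/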
CREATE EXTERNAL TABLE team (
year string,
league_id string,
team_id string,
franchise_id string,
div_id string,
rank string,
g string,
ghome string,
w string,
l string,
div_win string,
wc_win string,
lg_win string,
ws_win string,
r string,
ab string,
h string,
`double` string,
triple string,
hr string,
bb string,
so string,
sb string,
cs string,
hbp string,
sf string,
ra string,
er string,
era string,
cg string,
sho string,
sv string,
ipouts string,
ha string,
hra string,
bba string,
soa string,
e string,
dp string,
fp string,
name string,
park string,
attendance string,
bpf string,
ppf string,
team_id_br string,
team_id_lahman45 string,
team_id_retro string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/data/baseball/team'
tblproperties ('skip.header.line.count'='1');
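-- optional: verify the table metadata, including the external location it points at
DESCRIBE FORMATTED team;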
select * from team limit 5;
select distinct name, league_id from team where league_id in ('NL', 'AL');
hadoop fs -mkdir -p /data/baseball/player/
hadoop fs -copyFromLocal baseball/player.csv /data/baseball/player/
CREATE TABLE player (
player_id string,
birth_year string,
birth_month string,
birth_day string,
birth_country string,
birth_state string,
birth_city string,
death_year string,
death_month string,
death_day string,
death_country string,
death_state string,
death_city string,
name_first string,
name_last string,
name_given string,
weight string,
height string,
bats string,
throws string,
debut string,
final_game string,
retro_id string,
bbref_id string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
tblproperties ('skip.header.line.count'='1');
LOAD DATA INPATH "/data/baseball/player" INTO TABLE player;
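# LOAD DATA INPATH moves (rather than copies) the file into the table's warehouse
# directory, so this listing should now come back empty
hadoop fs -ls /data/baseball/player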
hadoop fs -mkdir -p /data/baseball/batting/
hadoop fs -copyFromLocal baseball/batting.csv /data/baseball/batting/
CREATE EXTERNAL TABLE batting (
player_id string,
year string,
stint string,
team_id string,
league_id string,
g string,
ab string,
r string,
h string,
`double` string,
triple string,
hr string,
rbi string,
sb string,
cs string,
bb string,
so string,
ibb string,
hbp string,
sh string,
sf string,
g_idp string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/data/baseball/batting'
tblproperties ('skip.header.line.count'='1');
select * from batting limit 5;
select *
from player
join batting
on player.player_id = batting.player_id;
select count(1)
from player
left join batting
on player.player_id = batting.player_id;
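-- hypothetical follow-up to the left join above: count players with no batting rows at all
select count(1)
from player
left join batting
on player.player_id = batting.player_id
where batting.player_id is null;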
select hr, count(distinct batting.player_id) player_count
from player
join batting on player.player_id = batting.player_id
group by hr;
CREATE TABLE batting_full
STORED AS PARQUET
TBLPROPERTIES('parquet.compression'='SNAPPY')
AS
select batting.*, player.name_first, player.name_last,
player.weight, player.height, player.bats, player.birth_city,
player.birth_year
from player
join batting
on player.player_id = batting.player_id;
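# optional check: the CTAS should have written Snappy-compressed Parquet files under
# the warehouse path (assuming the default /user/hive/warehouse location)
hadoop fs -ls /user/hive/warehouse/batting_full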
select hr, count(distinct player_id) player_count
from batting_full
group by hr;
select hr, count(distinct player_id) player_count
from batting
group by hr;
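-- optional: the two queries above return the same aggregate from different storage formats;
-- EXPLAIN shows how each table is read (text scan vs. Parquet)
EXPLAIN select hr, count(distinct player_id) player_count from batting group by hr;
EXPLAIN select hr, count(distinct player_id) player_count from batting_full group by hr;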
hadoop fs -ls /
hadoop fs -mkdir -p /data/sd/crimedata
hadoop fs -copyFromLocal ARJISPublicCrime061516.txt /data/sd/crimedata/
create table crimedata (
agency string,
charge_description_orig string,
activity_date string,
block_address string,
zip_code string,
community string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES('separatorChar' = ',', 'quoteChar' = '"', 'escapeChar' = '\\')
STORED AS TEXTFILE
tblproperties ('skip.header.line.count'='1');
LOAD DATA INPATH '/data/sd/crimedata/ARJISPublicCrime061516.txt' INTO TABLE crimedata;
hadoop fs -ls /user/hive/warehouse/crimedata
hadoop fs -ls /data/sd/crimedata/ARJISPublicCrime061516.txt
-- notes:
-- it is important to use the CSV SerDe with this file so that quotes are treated as text qualifiers; otherwise the commas inside quoted values will throw off the parsing
-- LOAD DATA INPATH moves the file rather than copying it
-- since the table is not external, dropping it will also remove the data from HDFS
-- for example, the table below won't throw an error, but it also won't parse the quoted fields properly
--CREATE EXTERNAL TABLE crimedata_external (
--agency string,
--charge_description_orig string,
--activity_date timestamp,
--block_address string,
--zip_code string,
--community string
--)
--ROW FORMAT DELIMITED
--FIELDS TERMINATED BY ','
--STORED AS TEXTFILE
--LOCATION '/data/sd/crimedata/' ;
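-- for contrast, a sketch of an external table that would parse the file correctly by
-- combining the OpenCSVSerde with EXTERNAL + LOCATION, so dropping the table leaves the
-- data in HDFS (hypothetical table name; assumes the file has been re-copied to
-- /data/sd/crimedata/, since the LOAD DATA above moved it into the warehouse)
CREATE EXTERNAL TABLE crimedata_csv (
agency string,
charge_description_orig string,
activity_date string,
block_address string,
zip_code string,
community string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES('separatorChar' = ',', 'quoteChar' = '"', 'escapeChar' = '\\')
STORED AS TEXTFILE
LOCATION '/data/sd/crimedata/'
tblproperties ('skip.header.line.count'='1');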