Skip to content

Instantly share code, notes, and snippets.

@sambos
Last active May 11, 2018 17:24
Show Gist options
  • Save sambos/fbf28736af9e145a0bbf3aa95319904b to your computer and use it in GitHub Desktop.
Save sambos/fbf28736af9e145a0bbf3aa95319904b to your computer and use it in GitHub Desktop.
create Impala from Parquet

Demonstrates how to create impala table from parquet data

#!/bin/bash

# script that creates impala table from parquet file
# prerequisite: convert avro to parquet

mtype=default
db_name=user
table_name=$mtype_metrics
impala_daemon=impala-host.rsol.org
data_path=/data/uer/parquet

#take a smaple file for schema inference
file="$(hdfs dfs -ls $data_path/$mtype.parquet/part-r-00000-*.gz.parquet | awk -F " " '{print $8}' | head)"
echo using this $file for schema inference

impala-shell --ssl -k -V -i $impala_daemon <<EOF
create external table $db_name.$table_name LIKE PARQUET '$file' stored as PARQUET;
load data inpath '$data_path/$mtype.parquet' into table $db_name.$table_name;
EOF

impala-shell --ssl -k -V -i $impala_daemon -q "describe $db_name.$table_name;"

Create impala from Guardium csv data

echo "staging results to imapal staging directory"
hdfs dfs -cp $HDFS_RESULTS_DIR/part-00000 $IMPALA_STAGING_PATH

# Testing impala connection
impala-shell -k -i $SERVER_NAME <<EOF
select count(*) from $DB_NAME.$TABLE_NAME;
EOF

# Drop and recreate table with new data
impala-shell -k -i $SERVER_NAME <<EOF
DROP TABLE IF EXISTS $DB_NAME.$TABLE_NAME;


CREATE EXTERNAL TABLE $DB_NAME.$TABLE_NAME (
Log_Timestamp           String,
ClientIP                String,
OSUser                  String,
DBUserName              String,
ServerIP                String,
DBProtocol              String,
NetworkProtocol         String,
SourceProgram           String,
ServiceName             String,
ObjectName              String,
SQLVerb                 String,
FullSql                 String,
DatabaseName            String,
Succeeded               String,
ReturnedData            String,
SessionStart            String,
FullSQLID               String,
RecordsAffected         String,
RecordsAffected_desc    String,
ReturnedDataCount       String,
Log_Values              String,
Log_Timestamp2          String,
Log_SessionId           String,
AccessId                String,
ClientHostName          String,
AnalyzedClientIP        String,
UidChain                String,
ProcessID               String,
GlobalId                String,
BindVariablesValues     String
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LOCATION '$IMPALA_STAGING_PATH'
tblproperties ("skip.header.line.count"="1");

select count(*) from $DB_NAME.$TABLE_NAME;
EOF

exit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment