hive druid integration: a quick test to create a druid table from a hive table
generate sample data for the hive table
echo "generating sample data for hive table"
# timestamps going back in time at hourly, minute, and daily granularity (201421 rows in total)
echo {-1..-181451}hours | xargs -n1 date +"%Y-%m-%d %H:%M:%S" -d >> /tmp/dates.data
echo {-1..-18145}minutes | xargs -n1 date +"%Y-%m-%d %H:%M:%S" -d >> /tmp/dates.data
echo {-1..-1825}days | xargs -n1 date +"%Y-%m-%d %H:%M:%S" -d >> /tmp/dates.data
# append a random username and credit_rating to each timestamp: <timestamp>,user<N>,<rating>
cat /tmp/dates.data | while read -r LINE ; do echo "$LINE","user"$((1 + RANDOM % 10000)),$((1 + RANDOM % 1000)) >> /tmp/hive_user_table.data; done

create and populate the hive table

create table hive_user_table(`timecolumn` timestamp, `username` string, `credit_rating` int) row format delimited fields terminated by ',';
load data local inpath '/tmp/hive_user_table.data' into table hive_user_table;
select count(*) from hive_user_table;
-- returns 201421
select * from hive_user_table limit 5;
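
optionally sanity-check the time range before indexing (a minimal sketch; column and table names match the DDL above):

-- min/max of timecolumn define the interval druid will cover as __time
select min(`timecolumn`), max(`timecolumn`) from hive_user_table;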

default hive configuration to work with druid (typically set in hive-site.xml; a per-session SET example follows the list)

hive.druid.bitmap.type=roaring
hive.druid.broker.address.default=brokernode:8888
hive.druid.coordinator.address.default=coordinatornode:8081
hive.druid.http.numConnection=20
hive.druid.http.read.timeout=PT10M
hive.druid.indexer.memory.rownum.max=75000
hive.druid.indexer.partition.size.max=1000000
hive.druid.indexer.segments.granularity=DAY
hive.druid.maxTries=5
hive.druid.metadata.base=druid
hive.druid.metadata.db.type=mysql
hive.druid.metadata.password=druid
hive.druid.metadata.uri=jdbc:mysql://mysqlhost:3306/druid?createDatabaseIfNotExist=true
hive.druid.metadata.username=druid
hive.druid.passiveWaitTimeMs=30000
hive.druid.select.distribute=true
hive.druid.select.threshold=10000
hive.druid.sleep.time=PT10S
hive.druid.storage.storageDirectory=/apps/druid/warehouse
hive.druid.working.directory=/tmp/druid-indexing
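
for a quick test, some of these can also be overridden per session from beeline instead of hive-site.xml (a sketch using the placeholder hostnames above; depending on your hive.conf.restricted.list, some properties may only be settable in hive-site.xml):

set hive.druid.broker.address.default=brokernode:8888;
set hive.druid.coordinator.address.default=coordinatornode:8081;
set hive.druid.indexer.segments.granularity=DAY;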

create druid table from hive using CTAS

drop table druid_user_table;
CREATE TABLE druid_user_table STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "DAY", "druid.query.granularity" = "HOUR") 
AS select `timecolumn` as `__time`, `username`, `credit_rating` FROM hive_user_table;

-- since druid.segment.granularity is DAY, this will create roughly 1800 segments in druid
select * from druid_user_table limit 5;
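
aggregation queries are where the druid-backed table pays off; hive can push simple aggregations down to druid as native queries where possible (a sketch over the columns created above):

-- top users by total credit_rating
select `username`, sum(`credit_rating`) as `total_rating`
from druid_user_table
group by `username`
order by `total_rating` desc
limit 10;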

check the druid coordinator UI; you will see a new datasource named <database>.<druid_table_name> (here default.druid_user_table).
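
the datasource can also be confirmed from hive; describe formatted shows the storage handler and, depending on the hive version, a druid.datasource table property:

describe formatted druid_user_table;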
