hive druid integration: a quick test to create a druid table from a hive table
generate sample data for the hive table
echo "generating sample data for hive table"
# timestamps going back in time at hourly, minute, and daily granularity (201421 rows in total)
echo {-1..-181451}hours | xargs -n1 date +"%Y-%m-%d %H:%M:%S" -d >> /tmp/dates.data
echo {-1..-18145}minutes | xargs -n1 date +"%Y-%m-%d %H:%M:%S" -d >> /tmp/dates.data
echo {-1..-1825}days | xargs -n1 date +"%Y-%m-%d %H:%M:%S" -d >> /tmp/dates.data
# append a random username and credit_rating to each timestamp: <timestamp>,user<N>,<rating>
cat /tmp/dates.data | while read -r LINE ; do echo "$LINE","user"$((1 + RANDOM % 10000)),$((1 + RANDOM % 1000)) >> /tmp/hive_user_table.data; done

create and populate the hive table

create table hive_user_table(`timecolumn` timestamp, `username` string, `credit_rating` int) row format delimited fields terminated by ',';
load data local inpath '/tmp/hive_user_table.data' into table hive_user_table;
select count(*) from hive_user_table;
-- returns 201421
select * from hive_user_table limit 5;
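
optionally sanity-check the time range before indexing (a minimal sketch; column and table names match the DDL above):

-- min/max of timecolumn define the interval druid will cover as __time
select min(`timecolumn`), max(`timecolumn`) from hive_user_table;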

default hive configuration to work with druid (typically set in hive-site.xml; a per-session SET example follows the list)

hive.druid.bitmap.type=roaring
hive.druid.broker.address.default=brokernode:8888
hive.druid.coordinator.address.default=coordinatornode:8081
hive.druid.http.numConnection=20
hive.druid.http.read.timeout=PT10M
hive.druid.indexer.memory.rownum.max=75000
hive.druid.indexer.partition.size.max=1000000
hive.druid.indexer.segments.granularity=DAY
hive.druid.maxTries=5
hive.druid.metadata.base=druid
hive.druid.metadata.db.type=mysql
hive.druid.metadata.password=druid
hive.druid.metadata.uri=jdbc:mysql://mysqlhost:3306/druid?createDatabaseIfNotExist=true
hive.druid.metadata.username=druid
hive.druid.passiveWaitTimeMs=30000
hive.druid.select.distribute=true
hive.druid.select.threshold=10000
hive.druid.sleep.time=PT10S
hive.druid.storage.storageDirectory=/apps/druid/warehouse
hive.druid.working.directory=/tmp/druid-indexing
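
for a quick test, some of these can also be overridden per session from beeline instead of hive-site.xml (a sketch using the placeholder hostnames above; depending on your hive.conf.restricted.list, some properties may only be settable in hive-site.xml):

set hive.druid.broker.address.default=brokernode:8888;
set hive.druid.coordinator.address.default=coordinatornode:8081;
set hive.druid.indexer.segments.granularity=DAY;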

create druid table from hive using CTAS

drop table druid_user_table;
CREATE TABLE druid_user_table STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "DAY", "druid.query.granularity" = "HOUR") 
AS select `timecolumn` as `__time`, `username`, `credit_rating` FROM hive_user_table;

-- since druid.segment.granularity is DAY, this will create roughly 1800 segments in druid
select * from druid_user_table limit 5;
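
aggregation queries are where the druid-backed table pays off; hive can push simple aggregations down to druid as native queries where possible (a sketch over the columns created above):

-- top users by total credit_rating
select `username`, sum(`credit_rating`) as `total_rating`
from druid_user_table
group by `username`
order by `total_rating` desc
limit 10;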

check the druid coordinator UI; you will see a new datasource named <database>.<druid_table_name> (here default.druid_user_table).
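
the datasource can also be confirmed from hive; describe formatted shows the storage handler and, depending on the hive version, a druid.datasource table property:

describe formatted druid_user_table;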
