Skip to content

Instantly share code, notes, and snippets.

@linusmcm
Last active May 26, 2018 02:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linusmcm/2076056d78860ef9d004d9809e974a6b to your computer and use it in GitHub Desktop.
Save linusmcm/2076056d78860ef9d004d9809e974a6b to your computer and use it in GitHub Desktop.
hadoop_hbase_csv_import
#! /bin/sh -x
# Minimal ImportTsv invocation: load the comma-separated file
# /tmp/hbase/test.csv into the HBase table "tab". The first field of every
# line is used as the row key; the following two fields are written to
# colFamily:a and colFamily:b.
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
  "-Dimporttsv.columns=HBASE_ROW_KEY,colFamily:a,colFamily:b" \
  "-Dimporttsv.separator=," \
  "tab" \
  "/tmp/hbase/test.csv"
#! /bin/sh -x
########################################
### Prepare a local working directory, define the local and HDFS paths used
### by the rest of this script, and download the sample CSV.
########################################
### local working directory (relative to the current directory);
### -p keeps a re-run from failing when the directory already exists
mkdir -p ../tempDir || exit 1
### URL of the CSV to download
dataSource="http://samplecsvs.s3.amazonaws.com/SalesJan2009.csv"
### path of csv files on bigVM:
###   dataPath      - raw download
###   dataPathTemp  - download with the header line removed
###   dataPathClean - cleaned file that is uploaded to HDFS
dataPath="../tempDir/baseFile.csv"
dataPathTemp="../tempDir/tempFile.csv"
dataPathClean="../tempDir/cleaned.csv"
### input path on HDFS
inputHDFS="/tmp/hbase"
### hdfs path of csv file
hdfsPATH="/tmp/hbase/hbaseFile.csv"
########################################
### fetch the CSV; abort instead of carrying on with an empty or partial file
if ! wget "$dataSource" -O "$dataPath"; then
  echo "download failed: $dataSource" >&2
  exit 1
fi
########################################
### Strip the header row and normalise the fields of the downloaded CSV.
###
### _clean_csv_fields: filter, reads CSV lines on stdin, writes to stdout.
###   1. removes spaces that directly precede a comma
###   2. replaces every empty field between two commas with the word "empty".
###      The substitution is repeated (label/branch loop) until no ",," is
###      left, because one s/,,/.../g pass cannot match overlapping gaps and
###      would turn "a,,,b" into "a,empty,,b".
_clean_csv_fields() {
  sed -e 's/ *,/,/g' -e ':l' -e 's/,,/,empty,/' -e 'tl'
}
echo "clean up header od csv file___________________"
### drop line 1 (the header row)
tail -n +2 "$dataPath" > "$dataPathTemp"
#sed ':l;s/,,/,NA,/;tl; s/[[:blank:]]*//g' $dataPathTemp > $dataPathClean
#tail -n +2 SalesJan2009.csv > file3.csv
_clean_csv_fields < "$dataPathTemp" > "$dataPathClean"
#rm -rf $dataPath
########################################
### HBase column-family names, held in the single-letter variables a..l.
### They are used, in this order, both when the table is created and in the
### ImportTsv column list.
a='Transaction_date'
b='Product'
c='Price'
d='Payment_Type'
e='Name'
f='City'
g='State'
h='Country'
i='Account_Created'
j='Last_Login'
k='Latitude'
l='Longitude'
########################################
### name of the HBase table that receives the data
tableName='Sales'
########################################
### Upload the cleaned CSV to HDFS, create the HBase table, bulk-load the
### file with ImportTsv, then print the row count and a full scan.
echo "### put files in hadoop___________________"
### Make sure the HDFS input directory exists, then upload the file under the
### exact name ImportTsv reads below ($hdfsPATH). Uploading into $inputHDFS
### alone would store it under its local file name instead, and the import
### would not find its input. -f overwrites the copy left by an earlier run.
hadoop fs -mkdir -p "$inputHDFS"
hadoop fs -put -f "$dataPathClean" "$hdfsPATH" || exit 1
echo "### remove file from local directory___________________"
#rm -rf $dataPathClean
#echo "disable and drop table if it pre-exists: $tableName ___________________"
#echo "disable '$tableName'" | hbase shell
#echo "drop '$tableName'" | hbase shell
### create the table with one column family per name in $a..$l
echo "create '$tableName', '$a', '$b', '$c', '$d', '$e', '$f', '$g', '$h', '$i', '$j', '$k', '$l'" | hbase shell
### NOTE(review): HBASE_ROW_KEY consumes the first CSV field, so the families
### $a..$l receive fields 2..13 - confirm the CSV really has a leading key
### column, otherwise every value lands one family to the left.
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
  -Dimporttsv.columns="HBASE_ROW_KEY,$a,$b,$c,$d,$e,$f,$g,$h,$i,$j,$k,$l" \
  '-Dimporttsv.separator=,' \
  "$tableName" "$hdfsPATH"
#echo "deleteall '$tableName', '$a' " | hbase shell
echo "count number of rows in: $tableName ___________________"
echo "count '$tableName' " | hbase shell
echo "scan '$tableName' " | hbase shell
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment