timrobertson100/gist:2a5a23ed0494c35317e8

## gistfile1.md

      
    Raw
  

              gistfile1.md
            
          
    Step 1: Create a table as CSV (note: the fields are actually tab-delimited text, despite the "csv" name)
-- Target table for the delimiter-cleaning test export.
-- Despite the "csv" name, fields are separated by a TAB character ('\t'),
-- and the data is stored as plain text files under the LOCATION below.
-- EXTERNAL + IF NOT EXISTS: the statement can be re-run safely; an existing
-- table definition is left untouched.
CREATE EXTERNAL TABLE IF NOT EXISTS tim.delimiter_csv (
  gbifId INT,
  v_scientificName STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/tim/delimiter.csv';

Step 2: Populate the table

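1. Add the jar (/user/trobertson/occurrence-hive-0.22-SNAPSHOT-jar-with-dependencies.jar) to the Hive session, e.g. with `ADD JAR`.
2. Create the UDF named `cleanDelimiters`, backed by the class `org.gbif.occurrence.hive.udf.CleanDelimiterCharsUDF`, e.g. `CREATE TEMPORARY FUNCTION cleanDelimiters AS 'org.gbif.occurrence.hive.udf.CleanDelimiterCharsUDF';`
3. Run the SQL: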

-- Replace the contents of tim.delimiter_csv with up to 1,000,000 rows taken
-- from prod_b.occurrence_hdfs, passing the scientific name through the
-- cleanDelimiters UDF (which must already be registered in this session).
-- NOTE(review): presumably the UDF strips delimiter characters such as tabs
-- and newlines so each record stays on one line — confirm against the UDF source.
INSERT OVERWRITE TABLE tim.delimiter_csv
SELECT
  gbifId,
  cleanDelimiters(v_scientificName) AS v_scientificName
FROM prod_b.occurrence_hdfs
-- No ORDER BY: which rows make up the 1,000,000-row sample is not defined.
LIMIT 1000000;