Tim Robertson (timrobertson100)
timrobertson100 / notes.txt
Last active February 8, 2024 06:08
Spark 2.4 on CDH 5.12
Based on the ideas here, expanded to enable Hive support:
https://www.linkedin.com/pulse/running-spark-2xx-cloudera-hadoop-distro-cdh-deenar-toraskar-cfa/
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-without-hadoop.tgz
tar -xvzf spark-2.4.8-bin-without-hadoop.tgz
cd spark-2.4.8-bin-without-hadoop
cp -R /etc/spark2/conf/* conf/
cp /etc/hive/conf/hive-site.xml conf/
# The "without-hadoop" build must be pointed at the cluster's Hadoop jars
# (the standard step for Hadoop-free Spark builds, per the article above):
echo 'export SPARK_DIST_CLASSPATH=$(hadoop classpath)' >> conf/spark-env.sh
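
Once hive-site.xml is in conf/, a minimal smoke test (my sketch, not part of the gist) can confirm Hive support is wired up; build it against Spark 2.4 and run it with spark-submit:

import org.apache.spark.sql.SparkSession;

public class HiveCheck {
  public static void main(String[] args) {
    // With hive-site.xml on the conf path, enableHiveSupport() should
    // connect the session to the cluster's Hive metastore.
    SparkSession spark = SparkSession.builder()
        .appName("HiveCheck")
        .enableHiveSupport()
        .getOrCreate();
    spark.sql("SHOW DATABASES").show(); // lists the Hive databases if wired correctly
    spark.stop();
  }
}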
timrobertson100 / NameLookup.java
Last active October 12, 2022 13:59
Read the actual taxon match cache
package org.gbif.pipelines.ingest.java;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
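
The preview stops at the imports. A minimal sketch of how such a cache read might look follows; it uses the modern Table API rather than the imported HTable, and the Zookeeper quorum, table name, column family, qualifier, and key format are all assumptions, not the gist's actual values:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class NameLookupSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "zk1.example.org"); // assumed quorum
    try (Connection connection = ConnectionFactory.createConnection(conf);
         Table table = connection.getTable(TableName.valueOf("taxon_match_cache"))) { // assumed table
      Get get = new Get(Bytes.toBytes(args[0])); // assumed: row key is the lookup string
      Result result = table.get(get);
      byte[] value = result.getValue(Bytes.toBytes("v"), Bytes.toBytes("json")); // assumed cf:qualifier
      System.out.println(value == null ? "no match" : Bytes.toString(value));
    }
  }
}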
timrobertson100 / export-cluster.sql
Last active October 10, 2022 14:45
Export cluster example
-- wget https://repository.gbif.org/repository/gbif/org/gbif/occurrence/occurrence-hive/0.187/occurrence-hive-0.187-jar-with-dependencies.jar
-- hdfs dfs -put occurrence-hive-0.187-jar-with-dependencies.jar /tmp
-- wget https://repository.gbif.org/repository/central/com/klout/brickhouse/0.6.0/brickhouse-0.6.0.jar
-- hdfs dfs -put brickhouse-0.6.0.jar /tmp
ADD JAR hdfs:///tmp/occurrence-hive-0.187-jar-with-dependencies.jar;
ADD JAR hdfs:///tmp/brickhouse-0.6.0.jar;
CREATE TEMPORARY FUNCTION toLocalISO8601 AS 'org.gbif.occurrence.hive.udf.ToLocalISO8601UDF';
CREATE TEMPORARY FUNCTION joinArray AS 'brickhouse.udf.collect.JoinArrayUDF';
-- (Preview truncated; the export query that uses these functions follows in the full gist,
--  applying toLocalISO8601(...) to timestamp columns and joinArray(...) to array columns.)
timrobertson100 / datapackage.json
Created September 30, 2020 20:23
Example data package
{
  "id": "https://doi.org/10.5281/zenodo.3968687",
  "profile": "tabular-data-package",
  "resources": [
    {
      "name": "reference-data",
      "path": "https://zenodo.org/record/3968687/files/LBBG_ZEEBRUGGE-reference-data.csv",
      "profile": "tabular-data-resource",
      "schema": {
        "fields": [
timrobertson100 / split-avro.md
Created April 11, 2020 11:26
Splitting the single Avro file using Hive

bq load --source_format=AVRO dataset.table gs://mybucket/00/*.avro

Did a small download to get the schema:

avro-tools getschema ... > schema.avsc

hdfs dfs -put schema.avsc /tmp
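
If avro-tools is not at hand, the same schema dump is a few lines of Java (the local file name is an assumption):

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class GetSchemaSketch {
  public static void main(String[] args) throws Exception {
    // Open the downloaded Avro file and pretty-print its embedded schema.
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(new File("part-00000.avro"), new GenericDatumReader<>())) {
      System.out.println(reader.getSchema().toString(true));
    }
  }
}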

timrobertson100 / bloodhound.md
Last active February 19, 2020 15:56
A quick test to explore a bloodhound process

This is a quick test of a modified version of the Bloodhound Spark script, checking that it runs on the GBIF Cloudera cluster (CDH 5.16.2).

From the gateway, grab the file from HDFS (skipping HTTP for speed), unzip it (15-20 mins), and upload the contents back to HDFS:

hdfs dfs -getmerge /occurrence-download/prod-downloads/0002504-181003121212138.zip /mnt/auto/misc/bloodhound/data.zip
unzip /mnt/auto/misc/bloodhound/data.zip -d /mnt/auto/misc/bloodhound/data

hdfs dfs -rm /tmp/verbatim.txt
hdfs dfs -rm /tmp/occurrence.txt
timrobertson100 / pipelines-demo.md
Last active November 26, 2019 22:36
Example pipelines

Example running standalone GBIF pipelines

This example shows how a DwC-A is converted into interpreted Avro files.

git clone https://github.com/gbif/pipelines.git
cd pipelines
mvn clean package -DskipTests
timrobertson100 / ids.md
Created November 18, 2019 12:22
Diagnosing id duplication

Using the lookup tool on c5gateway-vh.gbif.org, we can get the keys for the id 1668748136:

12:06:39 UTC c5gateway-vh /usr/local/bin $ ./lookup-occurrence-key 1668748136
Lookup 1668748136 with dataset key from API 97bd086a-cf43-11e2-a9b3-00145eb45e9a
 27:97bd086a-cf43-11e2-a9b3-00145eb45e9a|JMRC|JMRCfungicoll|JMRC:FSU:02570 / 14837 / 750|null column=o:i, timestamp=1553909664771, value=\x00\x00\x00\x00cw\x13h
 73:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5004 column=o:i, timestamp=1563244584180, value=\x00\x00\x00\x00cw\x13h
 74:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5005 column=o:i, timestamp=1563244586420, value=\x00\x00\x00\x00cw\x13h
 75:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5006 column=o:i, timestamp=1553909265952, value=\x00\x00\x00\x00cw\x13h
 76:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5007 column=o:i, timestamp=1563244589868, value=\x00\x00\x00\x00cw\x13h
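
All five rows carry the same o:i value. That cell is the occurrence key stored as an 8-byte big-endian long: \x00\x00\x00\x00cw\x13h is 0x0000000063771368 (c = 0x63, w = 0x77, h = 0x68), which decodes back to 1668748136, confirming the lookup rows all point at the same record. A quick check in plain Java:

import java.nio.ByteBuffer;

public class DecodeKey {
  public static void main(String[] args) {
    // The o:i cell bytes as printed by the HBase shell: \x00\x00\x00\x00cw\x13h
    byte[] cell = {0x00, 0x00, 0x00, 0x00, 0x63, 0x77, 0x13, 0x68};
    // Big-endian long, matching HBase's Bytes.toLong convention.
    long key = ByteBuffer.wrap(cell).getLong();
    System.out.println(key); // prints 1668748136
  }
}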
{
  "_shards": {
    "total": 812,
    "successful": 812,
    "failed": 0
  },
  "_all": {
    "primaries": {},
    "total": {}
  },
occurrenceCount,
// verbatim fields in records
v_kingdom,
v_phylum,
v_class,
v_order,
v_family,
v_genus,
v_scientificName,