Tim Robertson (timrobertson100)
package org.gbif.hadoop.compress;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
/**
* Writes the custom fixed length footer to the stream.
*/
@Override
public void finish() throws IOException {
flush(); // make sure deflater flushes, and counts are accurate
// Push the custom footer to the output stream
ByteBuffer footer = ByteBuffer.allocate(26);
footer.put(FOOTER_CLOSE_DEFLATE); // 2 bytes: signals that the deflate stream can be read in isolation
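Only the first footer field appears above. For orientation, a minimal sketch of how a 26-byte fixed footer could be assembled with ByteBuffer follows; the marker value and the remaining field order (compressed length, uncompressed length, CRC-32) are assumptions for illustration, not necessarily the layout the gist actually uses.

import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.zip.CRC32;

/** Hypothetical 26-byte footer: a 2-byte marker followed by three longs (2 + 8 + 8 + 8 = 26). */
final class FooterSketch {

  // assumed 2-byte marker; the real value lives in the gist's constants
  private static final byte[] FOOTER_CLOSE_DEFLATE = {(byte) 0xD2, 0x02};

  static void writeFooter(OutputStream out, long compressedBytes, long uncompressedBytes, CRC32 crc)
      throws IOException {
    ByteBuffer footer = ByteBuffer.allocate(26);
    footer.put(FOOTER_CLOSE_DEFLATE);   // 2 bytes: deflate stream can be read in isolation
    footer.putLong(compressedBytes);    // 8 bytes: assumed count of bytes written after compression
    footer.putLong(uncompressedBytes);  // 8 bytes: assumed count of bytes before compression
    footer.putLong(crc.getValue());     // 8 bytes: CRC-32 of the uncompressed data
    out.write(footer.array());          // push the fixed-length footer to the underlying stream
  }
}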
/**
* An end to end test that writes some random files and ensures that when deflated separately, merged and inflated
* they represent the same byte sequence as a concatenation of the original files.
*/
@Test
public void testParallelCompress() throws IOException {
// generate the uncompressed files and create a merged version
List<File> parts = Lists.newArrayList();
for (int i = 0; i < NUMBER_PARTS; i++) {
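The rest of the test is not shown above. The sketch below illustrates the idea it exercises using only java.util.zip rather than the gist's D2 classes: each part is raw-deflated with a SYNC_FLUSH so it ends without a final block, the compressed parts are concatenated, a closing empty final block is appended, and the merged stream inflates back to the concatenation of the original parts. Part counts and buffer sizes are arbitrary choices for the example.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

/** Sketch of the deflate-separately, merge, inflate-once idea using only java.util.zip. */
public class ParallelDeflateSketch {

  public static void main(String[] args) throws IOException, DataFormatException {
    Random random = new Random(42);
    ByteArrayOutputStream original = new ByteArrayOutputStream(); // concatenation of the raw parts
    ByteArrayOutputStream merged = new ByteArrayOutputStream();   // concatenation of the deflated parts

    for (int i = 0; i < 3; i++) {
      byte[] part = new byte[64 * 1024];
      random.nextBytes(part);
      original.write(part);
      merged.write(deflateWithoutFinalBlock(part));
    }
    merged.write(finalEmptyBlock()); // terminate the combined stream with a single final block

    // inflate the merged stream and compare with the concatenated originals
    Inflater inflater = new Inflater(true); // raw deflate (nowrap)
    byte[] input = Arrays.copyOf(merged.toByteArray(), merged.size() + 1); // extra dummy byte required for nowrap
    inflater.setInput(input);
    ByteArrayOutputStream restored = new ByteArrayOutputStream();
    byte[] buf = new byte[8192];
    while (!inflater.finished()) {
      int n = inflater.inflate(buf);
      if (n == 0 && inflater.needsInput()) break;
      restored.write(buf, 0, n);
    }
    System.out.println("round trip ok: " + Arrays.equals(original.toByteArray(), restored.toByteArray()));
  }

  /** Deflates one part with SYNC_FLUSH so it ends on a byte boundary without a final block. */
  private static byte[] deflateWithoutFinalBlock(byte[] data) {
    Deflater deflater = new Deflater(Deflater.DEFAULT_COMPRESSION, true); // raw deflate
    deflater.setInput(data);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    byte[] buf = new byte[8192];
    int n;
    do {
      n = deflater.deflate(buf, 0, buf.length, Deflater.SYNC_FLUSH);
      out.write(buf, 0, n);
    } while (n == buf.length); // per the javadoc, call again if the buffer was filled
    deflater.end();
    return out.toByteArray();
  }

  /** Produces the few closing bytes: an empty deflate block with the final bit set. */
  private static byte[] finalEmptyBlock() {
    Deflater deflater = new Deflater(Deflater.DEFAULT_COMPRESSION, true);
    deflater.finish(); // no input: only the final empty block is emitted
    byte[] buf = new byte[64];
    int n = deflater.deflate(buf);
    deflater.end();
    return Arrays.copyOf(buf, n);
  }
}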
----
-- Compress data (2.8 million record result set)
-- Runtime: 37 secs
----
SET hive.exec.compress.output=true;
SET io.seqfile.compression.type=BLOCK;
SET mapred.output.compression.codec=org.gbif.hadoop.compress.d2.D2Codec;
CREATE TABLE tim.occurrence_tab_def2
1) get tab file (runtime 2 mins):
$ hadoop dfs -getmerge /user/hive/warehouse/tim.db/occurrence_tab occurrence.txt
-> problem #1: a 5.4GB file just pulled off Hadoop
2) zip the file on the local filesystem (runtime 90 secs)
$ zip local.zip occurrence.txt
# Detailed steps to distribute a new codec for Hadoop for use with Hive tab-delimited files.
##
# 1: Copy up the compress jar around the cluster
##
$ scp hadoop-compress-1.0-SNAPSHOT.jar root@c2n1.gbif.org:/usr/local/lib
$ scp hadoop-compress-1.0-SNAPSHOT.jar root@c2n2.gbif.org:/usr/local/lib
$ scp hadoop-compress-1.0-SNAPSHOT.jar root@c2n3.gbif.org:/usr/local/lib
##

Summary of the GBIF dev environment for the codec

This is what I did... Now we need to work out what was unnecessary! (A quick check that the codec is visible is sketched below.)

Copy Jar around the slaves

$ scp hadoop-compress-1.0-SNAPSHOT.jar root@c2n1.gbif.org:/usr/local/lib
$ scp hadoop-compress-1.0-SNAPSHOT.jar root@c2n2.gbif.org:/usr/local/lib
$ scp hadoop-compress-1.0-SNAPSHOT.jar root@c2n3.gbif.org:/usr/local/lib
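To help work out which steps matter, a small driver like the sketch below can confirm whether the codec actually resolves on a node. It assumes the standard Hadoop CompressionCodecFactory API and that io.compression.codecs is extended to list D2Codec; everything beyond the D2Codec class name is illustrative rather than taken from the GBIF setup.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

/** Sketch: check that D2Codec resolves on a node once the jar is on the classpath. */
public class CodecCheck {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // illustrative only: normally io.compression.codecs is set in the cluster configuration
    conf.set("io.compression.codecs",
        "org.apache.hadoop.io.compress.DefaultCodec,"
            + "org.apache.hadoop.io.compress.GzipCodec,"
            + "org.gbif.hadoop.compress.d2.D2Codec");
    try {
      CompressionCodecFactory factory = new CompressionCodecFactory(conf);
      CompressionCodec codec = factory.getCodecByClassName("org.gbif.hadoop.compress.d2.D2Codec");
      System.out.println(codec == null
          ? "D2Codec is not registered"
          : "D2Codec loaded, default extension: " + codec.getDefaultExtension());
    } catch (IllegalArgumentException e) {
      // thrown when a listed codec class cannot be loaded, i.e. the jar is not on the classpath
      System.out.println("D2Codec could not be loaded: " + e.getMessage());
    }
  }
}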

This explains the dip we are seeing on Plantae on http://oliver.gbif.org/global/

SELECT
  occ1.k, occ1.cnt, occ2.cnt, occ2.cnt - occ1.cnt AS increase
FROM
  (SELECT COALESCE(kingdom, 'UNKNOWN') AS k, count(*) AS cnt 
   FROM occurrence_20140908 GROUP BY kingdom) occ1 
JOIN
  (SELECT COALESCE(kingdom, 'UNKNOWN') AS k, count(*) AS cnt 
timrobertson100 / gist:1f0d68c8339e88b7c7de
Last active August 29, 2015 14:06
Reducing occurrence download widths to match content

Optimizing the downloads for users

GBIF.org delivers really wide tables, which are unmanageable for many users and slow to work with. By returning only the columns that carry actual values for any given query, users get narrower tables that are easier to manage.

Currently we have 441 fields in occurrence_hdfs. Across all records, only 347 of them are populated in at least one record.

We could consider

  1. creating occurrence_hdfs only as wide as it needs to be, e.g. skipping terms that are never populated (speeding up download MR jobs)
  2. running the same populated-column check before each download query, which will likely reduce the width further depending on the biases in the data returned (a sketch of such a check follows this list)
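As a rough illustration of the populated-column check behind both options, the sketch below scans a tab-delimited export and reports which columns carry a value in at least one record. The assumed file layout (a header row, tab separators, and an empty string or \N meaning "no value") is hypothetical and not taken from the actual download format.

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

/** Sketch: report which columns of a tab-delimited export hold a value in at least one record. */
public class PopulatedColumns {
  public static void main(String[] args) throws IOException {
    try (BufferedReader reader = Files.newBufferedReader(Paths.get(args[0]), StandardCharsets.UTF_8)) {
      String headerLine = reader.readLine();
      if (headerLine == null) {
        System.out.println("Empty file");
        return;
      }
      String[] header = headerLine.split("\t", -1);
      boolean[] populated = new boolean[header.length];
      String line;
      while ((line = reader.readLine()) != null) {
        String[] fields = line.split("\t", -1); // -1 keeps trailing empty columns
        for (int i = 0; i < fields.length && i < populated.length; i++) {
          if (!fields[i].isEmpty() && !"\\N".equals(fields[i])) {
            populated[i] = true; // column i carries a value in at least one record
          }
        }
      }
      List<String> keep = new ArrayList<>();
      for (int i = 0; i < header.length; i++) {
        if (populated[i]) {
          keep.add(header[i]);
        }
      }
      System.out.println(keep.size() + " of " + header.length + " columns are populated: " + keep);
    }
  }
}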
timrobertson100 / gist:cbbd1175bcfce8132746
Created October 13, 2014 15:16
Complete download log
Task Logs: 'attempt_201410110944_1171_m_000000_0'
stdout logs
Oozie Launcher starts
Heart beat
Starting the execution of prepare actions