Aki Atoji (akiatoji)
🕋 Currently traveling in time. • TARDIS
@akiatoji
akiatoji / spacey_hash.rb
Created February 11, 2014 16:01
How to handle those cases when you just can't deal with how Rails Hash.to_query uses '+' instead of '%20'
# Add this to an initializer.
#
# { foo: 'real foo' }.to_query        # => "foo=real+foo"
# { foo: 'real foo' }.to_spacey_query # => "foo=real%20foo"
require 'erb'

Hash.class_eval do
  def to_spacey_query
    collect do |key, value|
      "#{key.to_param}=#{ERB::Util.url_encode(value.to_param.to_s)}"
    end.sort * '&'
  end
end
-- Reimplemented coroutine.wrap, returning "nil, err" if the coroutine cannot
-- be resumed.
local co_wrap = function(func)
    local co = coroutine.create(func)
    ngx.log(ngx.DEBUG, "co created with status ", coroutine.status(co))
    return function(...)
        if coroutine.status(co) == "suspended" then
            return select(2, coroutine.resume(co, ...))
        else
            return nil, "can't resume a " .. coroutine.status(co) .. " coroutine"
        end
    end
end
@akiatoji
akiatoji / pig.rb
Last active August 29, 2015 14:02
Homebrew Pig 0.12.x with Hadoop 2.4
# If you installed Hadoop and Pig with Homebrew on OS X and get this error:
#
# ERROR org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl - Error while trying to run jobs.
# java.lang.IncompatibleClassChangeError: Found interface org.apache.hadoop.mapreduce.JobContext, but class was expected.
#
# then this formula is for you. It downloads the Pig source, then builds it for Hadoop >2.3.
#
require 'formula'
@akiatoji
akiatoji / basic_mapred_build.xml
Created September 1, 2014 15:22
Hadoop 2 MapReduce build.xml
<!--
Ever since Hadoop 2 deprecated hadoop-core.jar, building MapReduce jars isn't straightforward,
especially if you want to use newer bits.
I just use this template ant build.xml that includes all the Hadoop jars, plus all the variant
versions of the lib jars that the Hadoop jars depend on.
hadoop.home is set for OS X Homebrew; switch it to wherever your distribution lives.
-->
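A minimal sketch of what such a build.xml can look like. The Homebrew path, project name, and jar name are placeholders I've assumed, not values from the original gist:

```xml
<project name="basic-mapred" default="jar">
  <!-- Hypothetical Homebrew layout; point this at your own Hadoop install. -->
  <property name="hadoop.home" value="/usr/local/Cellar/hadoop/2.4.0/libexec"/>

  <!-- Pull in every Hadoop jar plus the shared lib jars they depend on. -->
  <path id="hadoop.classpath">
    <fileset dir="${hadoop.home}/share/hadoop">
      <include name="**/*.jar"/>
    </fileset>
  </path>

  <target name="compile">
    <mkdir dir="build/classes"/>
    <javac srcdir="src" destdir="build/classes" includeantruntime="false">
      <classpath refid="hadoop.classpath"/>
    </javac>
  </target>

  <target name="jar" depends="compile">
    <jar destfile="build/mapred-job.jar" basedir="build/classes"/>
  </target>
</project>
```

The recursive `**/*.jar` fileset is what sweeps in "all them variations" of dependency jars without naming them one by one.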
# Enable eth0
vi /etc/sysconfig/network-scripts/ifcfg-eth0 # change to ONBOOT=yes
reboot now
# Install dependencies for VBox Guest Additions
yum -y install kernel-devel-2.6.32-431.el6.x86_64
yum -y groupinstall "Development tools"
yum -y groupinstall "Desktop" "Desktop Platform" "X Window System" "Fonts"
vi /etc/inittab # Change default run level to 5
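One gotcha with hard-coding the kernel-devel version as above: it must match the running kernel, or the Guest Additions kernel module won't build. A sketch of a safer variant (the actual yum call is commented out so this is harmless to dry-run):

```shell
# Derive the kernel-devel package name from the kernel actually running,
# instead of hard-coding a version string.
pkg="kernel-devel-$(uname -r)"
echo "Would install: $pkg"
# yum -y install "$pkg"
```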
@akiatoji
akiatoji / kafka_connect_zookeeper.md
Created October 27, 2014 15:55
Why can't Kafka connect to zookeeper?

Kafka -> Zookeeper

Kafka seems to try an IPv6 address first when connecting to Zookeeper, but most Hadoop installations do not use IPv6 (CDH5, anyway).

So you might get a half-working Kafka installation like the following:

[cdh5@cdh5 kafka_2.10-0.8.1.1]$ bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
Created topic "test".
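One workaround (my suggestion, not from the original gist) is to force the broker's JVM onto IPv4. Kafka's launch scripts pass extra JVM flags through the `KAFKA_OPTS` environment variable:

```shell
# Force Kafka's JVM to prefer the IPv4 stack when resolving Zookeeper.
export KAFKA_OPTS="-Djava.net.preferIPv4Stack=true"
echo "$KAFKA_OPTS"
# bin/kafka-server-start.sh config/server.properties
```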

Complex Avro Objects in Hadoop

I've been working with Avro objects that are fairly complex, generated by heterogeneous systems (i.e. C#/Java).

The objects have arrays of maps, maps with arrays, maps with complex values, maps with complex values that have arrays of maps... and so on.

In trying to query these objects, I ran into a surprising number of issues that took some time and effort to investigate.

Hive

@akiatoji
akiatoji / README.md
Created June 25, 2015 15:32
Hadoop pseudo cluster on OS X

Hadoop Config on OS X

Config to apply once Hadoop (2.7.0 as of now) is installed on OS X

Here are some thoughts on the config:

  • Namenode = localhost means Hadoop will be accessed from the same host. If accessing from another host, change localhost to the actual host name. If the namenode name in core-site.xml doesn't match what the client uses to connect, you'll get the dreaded connection-refused error.

  • The system has plenty (16G) of RAM. YARN's memory accounting doesn't reflect actual usage, so it's better to overcommit: set the maximum allocation even higher, to something like 24G. Otherwise your YARN jobs can get stuck in wait states.
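The overcommit advice above lands in yarn-site.xml. A sketch, assuming 24G ceilings; the demo writes to /tmp so it's safe to run as-is, whereas on a Homebrew install the real file typically lives under the Hadoop libexec/etc/hadoop directory:

```shell
# Sketch: raise YARN's memory ceilings in yarn-site.xml.
# conf_dir is a demo path; substitute your actual Hadoop conf directory.
conf_dir=/tmp/hadoop-conf-demo
mkdir -p "$conf_dir"
cat > "$conf_dir/yarn-site.xml" <<'EOF'
<?xml version="1.0"?>
<configuration>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>24576</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>24576</value>
  </property>
</configuration>
EOF
grep -c '<property>' "$conf_dir/yarn-site.xml"
```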

@akiatoji
akiatoji / Parallel R.r
Created July 7, 2015 22:24
Benchmark parallel R backends
# Quick and dirty comparison of parallelism in R

# Import parallel backend packages
library(doMC)
library(doMPI)
library(foreach)
library(doParallel)

# Number of iterations and procs
iters <- 1e4
@akiatoji
akiatoji / SparkR_OSX_HomeBrew.R
Last active August 29, 2015 14:24
Loading and using SparkR, OSX Homebrew Spark 1.4
# SparkR RULES!
# Load libraries
library("rJava")
library(SparkR, lib.loc="/usr/local/Cellar/apache-spark/1.4.0/libexec/R/lib")
sc <- sparkR.init(sparkHome = "/usr/local/Cellar/apache-spark/1.4.0/libexec")
sqc <- sparkRSQL.init(sc)
# Create SparkR dataframe from R dataframe