asw456 / gist:fa309f0967a2cfa2c4e2
Last active August 27, 2015 04:30 — forked from need4spd/gist:4584416
hadoop multiple outputs map/reduce sample
//mapper
package com.tistory.devyongsik.hadoop.mapre;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
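// (the gist body is cut off after the imports; what follows is a minimal
// sketch of a mapper consistent with them - the class name, tab-separated
// input layout, and emitted types are assumptions, not the original code)
public class MultipleOutputsMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // assumed input: "<category>\t<score>" per line
        String[] fields = value.toString().split("\t");
        context.write(new Text(fields[0]),
                new DoubleWritable(Double.parseDouble(fields[1])));
    }
}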
asw456 / mo_example.java
Last active August 27, 2015 04:30 — forked from rstrickland/mo_example.java
MultipleOutputs Example
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import org.apache.cassandra.db.IColumn;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.hadoop.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
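// (the gist body is cut off after the imports; a hedged driver-side sketch of
// wiring MultipleOutputs to Cassandra - the keyspace, column family, address,
// and named-output name are illustrative assumptions, not the original code;
// note MultipleOutputs itself lives in org.apache.hadoop.mapreduce.lib.output)
Job job = new Job(new Configuration(), "mo-example");
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setOutputPartitioner(job.getConfiguration(),
        "org.apache.cassandra.dht.RandomPartitioner");
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "Keyspace1", "cf1");
MultipleOutputs.addNamedOutput(job, "cf1", ColumnFamilyOutputFormat.class,
        ByteBuffer.class, List.class);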
asw456 / ann.py
Created March 25, 2014 08:28 — forked from arngarden/ann.py
import theano
from pylearn2.models import mlp
from pylearn2.train_extensions import best_params
from pylearn2.training_algorithms import sgd, learning_rule
from pylearn2.utils import serial
from pylearn2.termination_criteria import MonitorBased
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from sklearn.preprocessing import StandardScaler
import numpy as np
from random import randint
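# (the gist body is cut off after the imports; a hedged sketch of wiring them
# together - the layer sizes, channel name, and random data are illustrative
# assumptions, not the original code)
X = StandardScaler().fit_transform(np.random.rand(500, 20))
y = np.zeros((500, 2))
y[np.arange(500), [randint(0, 1) for _ in range(500)]] = 1  # one-hot labels
dataset = DenseDesignMatrix(X=X, y=y)

model = mlp.MLP(nvis=20, layers=[
    mlp.Sigmoid(layer_name='hidden', dim=50, irange=0.1),
    mlp.Softmax(layer_name='output', n_classes=2, irange=0.1),
])
trainer = sgd.SGD(learning_rate=0.05, batch_size=10,
                  learning_rule=learning_rule.Momentum(init_momentum=0.5),
                  monitoring_dataset={'valid': dataset},
                  termination_criterion=MonitorBased(
                      channel_name='valid_output_misclass',
                      prop_decrease=0.0, N=10))
trainer.setup(model, dataset)
while True:
    trainer.train(dataset=dataset)
    model.monitor.report_epoch()
    model.monitor()
    if not trainer.continue_learning(model):
        break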
### MATPLOTLIBRC FORMAT
# This is a sample matplotlib configuration file - you can find a copy
# of it on your system in
# site-packages/matplotlib/mpl-data/matplotlibrc. If you edit it
# there, please note that it will be overridden in your next install.
# If you want to keep a permanent local copy that will not be
# over-written, place it in HOME/.matplotlib/matplotlibrc (unix/linux
# like systems) and C:\Documents and Settings\yourname\.matplotlib
# (win32 systems).
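# The settings themselves are plain "key : value" lines; two illustrative
# entries in that syntax (example values, not this copy's actual defaults):
# lines.linewidth : 1.0    # line width in points
# figure.figsize  : 8, 6   # figure size in inches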
import numpy as np
from sklearn.feature_extraction import image
from sklearn.cluster import MiniBatchKMeans
from sklearn import cross_validation, svm, datasets
from sklearn.datasets import fetch_olivetti_faces, fetch_mldata
from matplotlib import pylab as pl
def HIK_kernel(X, Y):
    # histogram intersection kernel: Gram matrix of pairwise sums of
    # element-wise minima between the rows of X and the rows of Y
    return np.array([[np.sum(np.minimum(x, y)) for y in Y] for x in X])
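# (usage sketch, not from the gist: scikit-learn accepts a callable kernel
# that returns the Gram matrix; the faces dataset mirrors the import above)
faces = fetch_olivetti_faces()
clf = svm.SVC(kernel=HIK_kernel)
scores = cross_validation.cross_val_score(clf, faces.data, faces.target, cv=3)
print(scores.mean())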
asw456 / igamma.py
Created April 28, 2014 03:15 — forked from sergeyf/igamma.py
from scipy.stats import rv_continuous
from scipy.special import gammaln, gammaincinv, gammainc
from numpy import log,exp
class igamma_gen(rv_continuous):
    # inverse-gamma distribution with shape a and scale b:
    # pdf(x) = b**a / Gamma(a) * x**-(a+1) * exp(-b/x)
    def _pdf(self, x, a, b):
        return exp(self._logpdf(x, a, b))

    def _logpdf(self, x, a, b):
        return a*log(b) - gammaln(a) - (a+1)*log(x) - b/x

    def _cdf(self, x, a, b):
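        # (completion; the gist is truncated here - this line follows from the
        # math rather than the original: the inverse-gamma CDF is the
        # regularized upper incomplete gamma function evaluated at b/x)
        return 1.0 - gammainc(a, b/x)

    def _ppf(self, q, a, b):
        # inverting the CDF is presumably why gammaincinv is imported above
        return b / gammaincinv(a, 1.0 - q)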
Scan scan = new Scan();
scan.setFilter(new MyFilter(appId)); // get only rows for the app with appId
HTable table = new HTable(config, Bytes.toBytes(tableName)); // for this table
ResultScanner results = table.getScanner(scan); // apply the scan
// variant: the same scan with the custom filter wrapped in a ProxyFilter
Scan scan = new Scan();
scan.setFilter(new ProxyFilter(new MyFilter(appId)));
public class ExampleRowKey
{
    long userId;
    String applicationId;

    // serialize the composite key (user id, then application id) to bytes
    public byte[] getBytes() throws IOException
    {
        ByteArrayOutputStream byteOutput = new ByteArrayOutputStream();
        DataOutputStream data = new DataOutputStream(byteOutput);
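        // (completion sketch; the gist is truncated here - writing the fields
        // in declaration order and returning the buffer is an assumption)
        data.writeLong(userId);
        data.writeUTF(applicationId);
        return byteOutput.toByteArray();
    }
}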
********************************
Gist
********************************
Motivation
-----------
A typical MapReduce job writes files named with the prefix "part-", followed
by "m" or "r" depending on whether the output comes from a map or a reduce
task, and then the part number. There are scenarios where we may instead want
to create separate files based on some criterion over the data's keys and/or
values. Enter the "MultipleOutputs" functionality.
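A hedged sketch of the pattern (the "errors" output name and the
word-count-style types are illustrative, not taken from the gists above): the
driver first registers a named output alongside the default one,

MultipleOutputs.addNamedOutput(job, "errors", TextOutputFormat.class,
        Text.class, LongWritable.class);

and the reducer then routes each record either to the default "part-r-*"
files or to the named output:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class RoutingReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private MultipleOutputs<Text, LongWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, LongWritable>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        if (key.toString().startsWith("ERR")) {
            mos.write("errors", key, new LongWritable(sum)); // -> errors-r-00000
        } else {
            context.write(key, new LongWritable(sum));       // -> part-r-00000
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // without this the named-output files may come up empty
    }
}

The close() call in cleanup() matters: MultipleOutputs keeps its own record
writers, and skipping it can leave the named-output files empty.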