asw456 / gist:fa309f0967a2cfa2c4e2
Last active August 27, 2015 04:30 — forked from need4spd/gist:4584416
hadoop multiple outputs map/reduce sample
//mapper
package com.tistory.devyongsik.hadoop.mapre;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
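// (the gist body is cut off after the imports; what follows is a minimal
// sketch of a mapper consistent with them - the class name, tab-separated
// input layout, and emitted types are assumptions, not the original code)
public class MultipleOutputsMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // assumed input: "<category>\t<score>" per line
        String[] fields = value.toString().split("\t");
        context.write(new Text(fields[0]),
                new DoubleWritable(Double.parseDouble(fields[1])));
    }
}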
asw456 / mo_example.java
Last active August 27, 2015 04:30 — forked from rstrickland/mo_example.java
MultipleOutputs Example
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import org.apache.cassandra.db.IColumn;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.hadoop.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
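// (the gist body is cut off after the imports; a hedged driver-side sketch of
// wiring MultipleOutputs to Cassandra - the keyspace, column family, address,
// and named-output name are illustrative assumptions, not the original code;
// note MultipleOutputs itself lives in org.apache.hadoop.mapreduce.lib.output)
Job job = new Job(new Configuration(), "mo-example");
ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
ConfigHelper.setOutputPartitioner(job.getConfiguration(),
        "org.apache.cassandra.dht.RandomPartitioner");
ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "Keyspace1", "cf1");
MultipleOutputs.addNamedOutput(job, "cf1", ColumnFamilyOutputFormat.class,
        ByteBuffer.class, List.class);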
asw456 / ann.py
Created March 25, 2014 08:28 — forked from arngarden/ann.py
import theano
from pylearn2.models import mlp
from pylearn2.train_extensions import best_params
from pylearn2.training_algorithms import sgd, learning_rule
from pylearn2.utils import serial
from pylearn2.termination_criteria import MonitorBased
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from sklearn.preprocessing import StandardScaler
import numpy as np
from random import randint
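# (the gist body is cut off after the imports; a hedged sketch of wiring them
# together - the layer sizes, channel name, and random data are illustrative
# assumptions, not the original code)
X = StandardScaler().fit_transform(np.random.rand(500, 20))
y = np.zeros((500, 2))
y[np.arange(500), [randint(0, 1) for _ in range(500)]] = 1  # one-hot labels
dataset = DenseDesignMatrix(X=X, y=y)

model = mlp.MLP(nvis=20, layers=[
    mlp.Sigmoid(layer_name='hidden', dim=50, irange=0.1),
    mlp.Softmax(layer_name='output', n_classes=2, irange=0.1),
])
trainer = sgd.SGD(learning_rate=0.05, batch_size=10,
                  learning_rule=learning_rule.Momentum(init_momentum=0.5),
                  monitoring_dataset={'valid': dataset},
                  termination_criterion=MonitorBased(
                      channel_name='valid_output_misclass',
                      prop_decrease=0.0, N=10))
trainer.setup(model, dataset)
while True:
    trainer.train(dataset=dataset)
    model.monitor.report_epoch()
    model.monitor()
    if not trainer.continue_learning(model):
        break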
### MATPLOTLIBRC FORMAT
# This is a sample matplotlib configuration file - you can find a copy
# of it on your system in
# site-packages/matplotlib/mpl-data/matplotlibrc. If you edit it
# there, please note that it will be overridden in your next install.
# If you want to keep a permanent local copy that will not be
# over-written, place it in HOME/.matplotlib/matplotlibrc (unix/linux
# like systems) and C:\Documents and Settings\yourname\.matplotlib
# (win32 systems).
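# The settings themselves are plain "key : value" lines; two illustrative
# entries in that syntax (example values, not this copy's actual defaults):
# lines.linewidth : 1.0    # line width in points
# figure.figsize  : 8, 6   # figure size in inches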
import numpy as np
from sklearn.feature_extraction import image
from sklearn.cluster import MiniBatchKMeans
from sklearn import cross_validation, svm, datasets
from sklearn.datasets import fetch_olivetti_faces, fetch_mldata
from matplotlib import pylab as pl
def HIK_kernel(X, Y):
    # histogram intersection kernel: Gram matrix of pairwise sums of
    # element-wise minima between the rows of X and the rows of Y
    return np.array([[np.sum(np.minimum(x, y)) for y in Y] for x in X])
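# (usage sketch, not from the gist: scikit-learn accepts a callable kernel
# that returns the Gram matrix; the faces dataset mirrors the import above)
faces = fetch_olivetti_faces()
clf = svm.SVC(kernel=HIK_kernel)
scores = cross_validation.cross_val_score(clf, faces.data, faces.target, cv=3)
print(scores.mean())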
asw456 / igamma.py
Created April 28, 2014 03:15 — forked from sergeyf/igamma.py
from scipy.stats import rv_continuous
from scipy.special import gammaln, gammaincinv, gammainc
from numpy import log,exp
class igamma_gen(rv_continuous):
    # inverse-gamma distribution with shape a and scale b:
    # pdf(x) = b**a / Gamma(a) * x**-(a+1) * exp(-b/x)
    def _pdf(self, x, a, b):
        return exp(self._logpdf(x, a, b))

    def _logpdf(self, x, a, b):
        return a*log(b) - gammaln(a) - (a+1)*log(x) - b/x

    def _cdf(self, x, a, b):
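        # (completion; the gist is truncated here - this line follows from the
        # math rather than the original: the inverse-gamma CDF is the
        # regularized upper incomplete gamma function evaluated at b/x)
        return 1.0 - gammainc(a, b/x)

    def _ppf(self, q, a, b):
        # inverting the CDF is presumably why gammaincinv is imported above
        return b / gammaincinv(a, 1.0 - q)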
Scan scan = new Scan();
scan.setFilter(new MyFilter(appId)); // get only rows for the app with appId
HTable table = new HTable(config, Bytes.toBytes(tableName)); // for this table
ResultScanner results = table.getScanner(scan); // apply the scan
// variant: the same scan with the custom filter wrapped in a ProxyFilter
Scan scan = new Scan();
scan.setFilter(new ProxyFilter(new MyFilter(appId)));
public class ExampleRowKey
{
    long userId;
    String applicationId;

    // serialize the composite key (user id, then application id) to bytes
    public byte[] getBytes() throws IOException
    {
        ByteArrayOutputStream byteOutput = new ByteArrayOutputStream();
        DataOutputStream data = new DataOutputStream(byteOutput);
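        // (completion sketch; the gist is truncated here - writing the fields
        // in declaration order and returning the buffer is an assumption)
        data.writeLong(userId);
        data.writeUTF(applicationId);
        return byteOutput.toByteArray();
    }
}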
********************************
Gist
********************************
Motivation
-----------
A typical MapReduce job writes files named with the prefix "part-", followed
by "m" or "r" depending on whether the output comes from a map or a reduce
task, and then the part number. There are scenarios where we may instead want
to create separate files based on some criterion over the data's keys and/or
values. Enter the "MultipleOutputs" functionality.
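A hedged sketch of the pattern (the "errors" output name and the
word-count-style types are illustrative, not taken from the gists above): the
driver first registers a named output alongside the default one,

MultipleOutputs.addNamedOutput(job, "errors", TextOutputFormat.class,
        Text.class, LongWritable.class);

and the reducer then routes each record either to the default "part-r-*"
files or to the named output:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class RoutingReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private MultipleOutputs<Text, LongWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, LongWritable>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        if (key.toString().startsWith("ERR")) {
            mos.write("errors", key, new LongWritable(sum)); // -> errors-r-00000
        } else {
            context.write(key, new LongWritable(sum));       // -> part-r-00000
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // without this the named-output files may come up empty
    }
}

The close() call in cleanup() matters: MultipleOutputs keeps its own record
writers, and skipping it can leave the named-output files empty.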