geofferyzh / iterator_to_arraylist.java
Created October 5, 2012 14:12
Hadoop 101 - Reducer Iterator to ArrayList
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.util.List;
import java.util.ArrayList;
public class CosineReducer2 extends Reducer<Text, Text, Text, IntWritable> {
// Reduce task ----------------------------------------------------------------
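The preview cuts off before the reduce() method. A minimal sketch of the pattern the title describes, with an illustrative class name and the cosine logic omitted: the single-pass Iterable is copied into an ArrayList, deep-copying each Text because Hadoop reuses the same object across iterations.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CopyValuesReducer extends Reducer<Text, Text, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // The Iterable can only be traversed once, so buffer the values first.
        List<Text> cached = new ArrayList<Text>();
        for (Text val : values) {
            cached.add(new Text(val));   // copy: the iterator reuses its Text instance
        }
        // cached can now be iterated as many times as needed (e.g. pairwise comparisons).
        context.write(key, new IntWritable(cached.size()));
    }
}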
geofferyzh / chainjob.java
Created October 5, 2012 13:16
Hadoop 101 - Chaining Multiple Jobs
// Make sure you set output of job1 as input to job 2
public class ChainJob extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
//conf.addResource("CosineSim.xml");
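The rest of run() is truncated. A hedged sketch of the chaining pattern, with placeholder stage names and the mapper/reducer wiring left out: job 2 is only submitted once job 1 has finished, and job 1's output directory is reused as job 2's input.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ChainJobSketch extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Path input = new Path(args[0]);
        Path intermediate = new Path(args[1]);   // output of job 1 == input of job 2
        Path output = new Path(args[2]);

        Job job1 = new Job(conf, "stage 1");
        job1.setJarByClass(ChainJobSketch.class);
        // job1.setMapperClass(...) / setReducerClass(...) omitted for brevity
        FileInputFormat.setInputPaths(job1, input);
        FileOutputFormat.setOutputPath(job1, intermediate);
        if (!job1.waitForCompletion(true)) {
            return 1;                            // stop the chain if stage 1 fails
        }

        Job job2 = new Job(conf, "stage 2");
        job2.setJarByClass(ChainJobSketch.class);
        FileInputFormat.setInputPaths(job2, intermediate);   // chained input
        FileOutputFormat.setOutputPath(job2, output);
        return job2.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new ChainJobSketch(), args));
    }
}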
geofferyzh / descendingsort.java
Created October 5, 2012 13:08
Hadoop 101 - Descending Sort
// Set Sort Comparator Class in the Driver code
job.setSortComparatorClass(SortFloatComparator.class);
// Write a Sort Comparator and save as "SortFloatComparator.java"
import java.io.*;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
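The comparator body is not shown in the preview. A sketch of what SortFloatComparator might contain (the exact body is an assumption): the default FloatWritable comparison with its sign flipped, so the shuffle sorts keys from largest to smallest.

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SortFloatComparator extends WritableComparator {
    protected SortFloatComparator() {
        super(FloatWritable.class, true);   // true => deserialize keys for compare()
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        FloatWritable fa = (FloatWritable) a;
        FloatWritable fb = (FloatWritable) b;
        return -fa.compareTo(fb);           // negate to invert the ascending default
    }
}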
geofferyzh / log4j.java
Created October 2, 2012 15:47
Hadoop 101 - Debug using Log4J.Logger
// Get a handle on a Logger by putting something like this in your class
import org.apache.log4j.Logger;
...
public class Foo {
private static final Logger sLogger = Logger.getLogger(Foo.class);
...
}
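A hedged usage example: a logger obtained this way can be called from anywhere in a mapper or reducer, and the messages end up in the task's syslog output, viewable through the JobTracker web UI. The mapper class and message below are illustrative.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

public class LoggingMapper extends Mapper<LongWritable, Text, Text, Text> {
    private static final Logger sLogger = Logger.getLogger(LoggingMapper.class);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        sLogger.debug("processing record at byte offset " + key.get());
        context.write(new Text("line"), value);
    }
}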
geofferyzh / bufferreader.java
Created October 2, 2012 15:34
Hadoop 101 - Read a local file using BufferedReader
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
public static final String InitialActivationList = "/.../Initial_Activation_List.txt";
String toBeActivated;
public void loadInitialActivation(String InitialActivationList, int linenum_index) throws IOException {
String line;
int linenum = 0;
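The loop body is cut off. A self-contained sketch of the same idea (the path argument and tab-delimited layout are assumptions): open the local file with a BufferedReader, read it line by line, and keep the column of interest.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class LocalFileLoader {
    public List<String> loadColumn(String path, int columnIndex) throws IOException {
        List<String> values = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t");      // assumes tab-delimited rows
                if (fields.length > columnIndex) {
                    values.add(fields[columnIndex]);
                }
            }
        } finally {
            reader.close();                              // always release the handle
        }
        return values;
    }
}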
geofferyzh / GenericOptionsParser.java
Created October 2, 2012 14:11
GenericOptionsParser's -files flag to pass metadata to task nodes
// Pass the file to the task nodes using GenericOptionsParser's -files flag
// $ hadoop jar MyJob.jar -conf /path/to/cluster-conf.xml -files /path/to/local-file.csv data/input data/output
public static class TheMapper extends Mapper<LongWritable, Text, Text, Text> {
@Override
public void setup(Context context) throws IOException, InterruptedException {
CsvReader csv = new CsvReader(new File("local-file.csv"));
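The preview stops inside setup(). A sketch of the same pattern using a plain BufferedReader in place of the gist's CsvReader: because the file was shipped with -files, the framework drops it into the task's working directory under its bare name. The file name, delimiter, and the lookup done in map() are illustrative.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SideFileMapper extends Mapper<LongWritable, Text, Text, Text> {
    private final Map<String, String> lookup = new HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // "-files /path/to/local-file.csv" makes the file appear as ./local-file.csv
        BufferedReader reader = new BufferedReader(new FileReader("local-file.csv"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split(",", 2);     // key,value per row (assumed)
                if (parts.length == 2) {
                    lookup.put(parts[0], parts[1]);
                }
            }
        } finally {
            reader.close();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String joined = lookup.get(value.toString());    // join each record against the side file
        context.write(value, new Text(joined == null ? "" : joined));
    }
}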
geofferyzh / SideDataDistribution.java
Created October 2, 2012 02:49
Hadoop 101 - Side Data Distribution using Job Configuration
// Set the side data info in job configuration file
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
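Only the imports survive in the preview. A minimal, self-contained sketch of the pattern the title describes, with a made-up property name ("side.data.threshold") standing in for whatever the gist actually passes: small side data is stashed in the Configuration by the driver and read back in setup() on the task node.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SideDataSketch extends Configured implements Tool {

    public static class ThresholdMapper extends Mapper<LongWritable, Text, Text, Text> {
        private float threshold;

        @Override
        protected void setup(Context context) {
            // Read the side data back out of the job configuration on the task node.
            threshold = context.getConfiguration().getFloat("side.data.threshold", 0.5f);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value.getLength() > threshold * 100) {   // toy use of the side data
                context.write(new Text("long"), value);
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        conf.set("side.data.threshold", "0.75");         // stash the side data before creating the Job
        Job job = new Job(conf, "side data demo");
        job.setJarByClass(SideDataSketch.class);
        job.setMapperClass(ThresholdMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new SideDataSketch(), args));
    }
}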
geofferyzh / testdistcache.java
Created October 2, 2012 02:22
Hadoop 101 - Distributed Cache
// In the job driver code, specify cache file location, add file to distributed cache
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.filecache.DistributedCache;
public class testdistcache extends Configured implements Tool {
public static final String localCacheFile= "localpath/localCacheFile.txt";
public static final String hdfsCacheFile = "hdfspath/hdfsCacheFile.txt";
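The preview shows only the driver-side constants. A hedged sketch of both halves of the pattern, keeping the gist's placeholder paths as placeholders: the driver copies the file into HDFS and registers it with the cache, and each task reads the local copy the framework has already pulled down.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class DistCacheMapperSketch extends Mapper<LongWritable, Text, Text, Text> {

    // Driver side (called from run() before submitting the job).
    public static void registerCacheFile(Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path hdfsPath = new Path("hdfspath/hdfsCacheFile.txt");           // placeholder paths
        fs.copyFromLocalFile(new Path("localpath/localCacheFile.txt"), hdfsPath);
        DistributedCache.addCacheFile(hdfsPath.toUri(), conf);
    }

    // Task side: the framework has copied the file to the local node before setup() runs.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (cached != null && cached.length > 0) {
            BufferedReader reader = new BufferedReader(new FileReader(cached[0].toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // load the cached lookup data into memory here
                }
            } finally {
                reader.close();
            }
        }
    }
}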
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
// Compares the composite key
public class CompositeKeyComparator extends WritableComparator {
/** Constructor. */
protected CompositeKeyComparator() {
super(Text.class, true);
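}
// The preview cuts off here. A hedged continuation showing what compare() might
// look like, assuming the composite key is a Text of the form "naturalKey|numericValue"
// (the key format is an assumption, not taken from the gist).
@Override
public int compare(WritableComparable a, WritableComparable b) {
String[] left = a.toString().split("\\|");
String[] right = b.toString().split("\\|");
int cmp = left[0].compareTo(right[0]);   // group by the natural key first
if (cmp != 0) {
return cmp;
}
// then order by the numeric part of the key
return Double.compare(Double.parseDouble(left[1]), Double.parseDouble(right[1]));
}
}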
geofferyzh / Driver.java
Created August 27, 2012 14:55
PYMK Stage2 C
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.conf.Configured;