Created February 16, 2018 20:44
ML models like vectors or sparse vectors as input. Creating a pivot table in MapReduce is one way to build a sparse count vector per key: the job below counts country occurrences per hostname and emits each hostname's counts as one row.
Sample output (one row of counts per hostname):
A (countryA = 1)(countryE = 1)(countryD = 1)(countryC = 1)(countryB = 2)
B (countryF = 1)(countryA = 1)(countryE = 1)(countryD = 4)(countryC = 1)(countryB = 1)
C (countryG = 1)(countryA = 1)(countryE = 1)(countryD = 4)(countryC = 3)(countryB = 1)
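Each row above is already a sparse vector in disguise: the hostname is the row key and each (country = count) pair is a non-zero entry. As a rough sketch of a downstream step (the class name, regex, and parsing approach here are illustrative assumptions, not part of the gist), one of these rows could be turned back into a count map:

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical helper: parse one output row, e.g.
//   "A (countryA = 1)(countryE = 1)(countryB = 2)"
// into a sparse count vector keyed by country name.
public class PivotRowParser {
    private static final Pattern ENTRY = Pattern.compile("\\((\\w+) = (\\d+)\\)");

    public static Map<String, Integer> parse(String row) {
        // The hostname and the encoded counts are separated by whitespace.
        String encoded = row.split("\\s+", 2)[1];
        Map<String, Integer> counts = new LinkedHashMap<String, Integer>();
        Matcher m = ENTRY.matcher(encoded);
        while (m.find()) {
            counts.put(m.group(1), Integer.parseInt(m.group(2)));
        }
        return counts;
    }
}

Parsing the first row above would give {countryA=1, countryE=1, countryD=1, countryC=1, countryB=2}.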
pivotdata.txt (input, one "hostname country" pair per line):
A countryA
A countryB
A countryB
A countryC
A countryD
A countryE
B countryA
B countryB
B countryC
B countryD
B countryD
B countryD
B countryD
B countryE
B countryF
C countryA
C countryB
C countryC
C countryC
C countryC
C countryD
C countryD
C countryD
C countryD
C countryE
C countryG
PivotTableV2.java:
/**
 * gradle clean
 * gradle build
 *
 * hadoop jar build/libs/pivot-table-1.0-SNAPSHOT.jar com.dvidr.PivotTableV2 src/main/resources/pivotdata.txt src/main/resources/output
 *
 */
package com.dvidr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
public class PivotTableV2 extends Configured implements Tool {

    public static class AMap extends Mapper<LongWritable, Text, Text, Text> {
        private Text word = new Text();
        private Text word2 = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input schema: "hostname country", whitespace separated.
            String[] parts = value.toString().split("\\s+");
            word.set(parts[0]);   // key: hostname
            word2.set(parts[1]);  // value: country
            context.write(word, word2);
        }
    }
    public static class AReduce extends Reducer<Text, Text, Text, Text> {
        private Text word = new Text();
        private Map<String, Integer> map = new HashMap<String, Integer>();

        // Render the counts as "(country = count)(country = count)...".
        // Entries are removed while iterating, so the shared map is empty
        // again before the next key is reduced.
        private static String stringMap(Map<String, Integer> mp) {
            Iterator<Map.Entry<String, Integer>> it = mp.entrySet().iterator();
            StringBuilder sb = new StringBuilder();
            while (it.hasNext()) {
                Map.Entry<String, Integer> pair = it.next();
                sb.append("(").append(pair.getKey()).append(" = ").append(pair.getValue()).append(")");
                it.remove();
            }
            return sb.toString();
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Count how many times each country appears for this hostname.
            for (Text value : values) {
                String factor = value.toString();
                Integer i = map.get(factor);
                if (i == null) {
                    i = 0;
                }
                map.put(factor, i + 1);
            }
            word.set(stringMap(map));
            context.write(key, word);
        }
    }
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PivotTableV2.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(AMap.class);
        job.setReducerClass(AReduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setNumReduceTasks(1);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return (job.waitForCompletion(true) ? 0 : 1);
    }
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new PivotTableV2(), args);
        System.exit(exitCode);
    }
}
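To feed these counts into an ML library, each country name still has to be mapped to a numeric feature index. Below is a minimal sketch of that last step, assuming a fixed country-to-index vocabulary built elsewhere; the class and method names are illustrative, not part of the gist.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// Hypothetical downstream step: turn one hostname's count map into
// parallel (index, value) arrays, the usual sparse vector layout.
public class SparseVectorBuilder {

    public static int[][] toIndexValue(Map<String, Integer> counts, Map<String, Integer> vocabulary) {
        List<Integer> indices = new ArrayList<Integer>();
        List<Integer> values = new ArrayList<Integer>();
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            Integer index = vocabulary.get(e.getKey());
            if (index != null) {            // skip countries missing from the vocabulary
                indices.add(index);
                values.add(e.getValue());
            }
        }
        int[] idx = new int[indices.size()];
        int[] val = new int[values.size()];
        for (int i = 0; i < idx.length; i++) {
            idx[i] = indices.get(i);
            val[i] = values.get(i);
        }
        return new int[][] { idx, val };
    }
}

Combined with the parser sketched earlier, this takes a line of the job's part-r-00000 output all the way to an (indices, values) pair that most sparse-vector APIs accept.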