Skip to content

Instantly share code, notes, and snippets.

@jorisbertomeu
Created April 27, 2017 08:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorisbertomeu/df105d75c2e3712c5119c10eceff4424 to your computer and use it in GitHub Desktop.
Save jorisbertomeu/df105d75c2e3712c5119c10eceff4424 to your computer and use it in GitHub Desktop.
Hadoop CSV Map task
package hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import utils.Utils;
/**
 * Map task that filters semicolon-separated records by group and searches them for a query.
 *
 * <p>Job configuration keys read on the first record:
 * <ul>
 *   <li>{@code q} - the search query; lower-cased, accent-stripped and split on single spaces
 *       into search terms.</li>
 *   <li>{@code idgroup} - only records whose column 8 (comma-separated list) contains this value
 *       are processed.</li>
 *   <li>{@code type} - three flag characters; a {@code '1'} at index 0 enables the title search,
 *       at index 1 the tag search, at index 2 the content search.</li>
 * </ul>
 *
 * <p>Columns used from each record (0-based, split on {@code ';'}): 1 = content id used to group
 * consecutive rows, 2 = integer output key, 3 = title, 5 = tags, 6 = content, 8 = groups.
 *
 * <p>For every matching record the mapper emits {@code <column 2, record + ";" + matches>} where
 * {@code matches} is a list of {@code label:score,} entries ({@code titre}, {@code contenu},
 * {@code tag}).
 */
public class CSVMapper extends Mapper<Object, Text, IntWritable, Text> {
    /** Reusable output value. */
    private Text word = new Text();
    /** Accent-stripped search terms, loaded from the configuration on the first record. */
    String[] search = null;
    /** Lower-cased query as configured; its length is the numerator of the score. */
    String originalSearch = null;
    /** Group id a record must belong to in order to be processed. */
    String idGroup = null;
    /** When true, per-record failures are emitted as records under key -1 instead of dropped. */
    Boolean showError = false;
    /** True until the configuration has been loaded successfully. */
    Boolean first = true;
    /** Raw search-type flags from the configuration. */
    String type;
    Boolean byTitre = false, byTag = false, byContenu = false;
    /** Raw rows seen so far for the content id currently being processed. */
    List<String> contenuList = new ArrayList<String>();
    /** {@code currentContenu} is the content id of the buffered rows; {@code contentToWrite} is not used in this class. */
    Integer currentContenu = -1, contentToWrite = -1;
    /** True once at least one row of the current content id has been emitted as a match. */
    Boolean contenuWritten = false;

    /**
     * Processes one CSV record.
     *
     * @param key     input key, not used
     * @param value   one semicolon-separated record
     * @param context task context used for configuration lookup and output
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        try {
            String line = value.toString();
            String[] elems = line.split(";");
            String[] groups = elems[8].split(",");

            if (this.first) {
                // Lazy one-time load; 'first' is only cleared once everything was read successfully.
                this.originalSearch = context.getConfiguration().get("q").toLowerCase();
                this.search = Utils.stripAccents(this.originalSearch).split(" ");
                this.idGroup = context.getConfiguration().get("idgroup");
                this.type = context.getConfiguration().get("type");
                if (this.type.charAt(0) == '1') {
                    this.byTitre = true;
                }
                if (this.type.charAt(1) == '1') {
                    this.byTag = true;
                }
                if (this.type.charAt(2) == '1') {
                    this.byContenu = true;
                }
                this.first = false;
            }

            if (!Arrays.asList(groups).contains(this.idGroup)) {
                return;
            }

            int contenuId = Integer.parseInt(elems[1]);
            if (contenuId != this.currentContenu) {
                this.currentContenu = contenuId;
                if (this.contenuWritten) {
                    // Re-emit every buffered row of the previous content id with an empty match list.
                    // NOTE(review): the key comes from the CURRENT row (elems[2]), not from the
                    // buffered rows, and the last buffered group is never flushed - confirm intent.
                    for (String buffered : this.contenuList) {
                        word.set(buffered + ";" + "");
                        context.write(new IntWritable(Integer.parseInt(elems[2])), word);
                    }
                }
                this.contenuWritten = false;
                this.contenuList = new ArrayList<String>();
            }
            this.contenuList.add(line);

            StringBuilder subFound = new StringBuilder();
            if (this.byTitre) { // Search on title: every term must match
                subFound.append(scoreEntry("titre", elems[3], true));
            }
            if (this.byContenu) { // Search on content: every term must match
                subFound.append(scoreEntry("contenu", elems[6], true));
            }
            if (this.byTag) { // Search on tags: one matching term is enough
                subFound.append(scoreEntry("tag", elems[5], false));
            }

            if (subFound.length() > 0) {
                word.set(line + ";" + subFound);
                context.write(new IntWritable(Integer.parseInt(elems[2])), word);
                this.contenuWritten = true;
            }
        } catch (IOException e) {
            // Output failures are not per-record data problems; let the framework handle them.
            throw e;
        } catch (InterruptedException e) {
            throw e;
        } catch (Exception e) {
            // Best effort: a malformed record must not abort the whole task.
            if (this.showError) {
                word.set(e.toString() + " => " + value.toString());
                context.write(new IntWritable(-1), word);
            }
        }
    }

    /**
     * Builds the {@code label:score,} entry for one field, or an empty string when it does not match.
     *
     * <p>The score is {@code originalSearch.length() * 100 / field.length()} using integer division.
     * An empty field never matches, which also avoids a division by zero.
     *
     * @param label           entry label written before the score
     * @param field           raw field text to search in
     * @param requireAllTerms true if every search term must be found, false if one is enough
     */
    private String scoreEntry(String label, String field, boolean requireAllTerms) {
        if (field.isEmpty()) {
            return "";
        }
        int found = countMatches(field);
        boolean matched = requireAllTerms ? found == this.search.length : found > 0;
        if (!matched) {
            return "";
        }
        return label + ":" + (this.originalSearch.length() * 100 / field.length()) + ",";
    }

    /** Counts how many search terms occur in the lower-cased, accent-stripped field. */
    private int countMatches(String field) {
        String normalized = Utils.stripAccents(field.toLowerCase());
        int found = 0;
        for (String term : this.search) {
            if (normalized.contains(term)) {
                found++;
            }
        }
        return found;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment