jorisbertomeu/CSVMapper.java

## CSVMapper.java
package hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import utils.Utils;

public class CSVMapper extends Mapper<Object, Text, IntWritable, Text> {
	private Text word = new Text();
	String[] search = null;
	String originalSearch = null;
	String idGroup = null;
	Boolean showError = false;
	Boolean first = true;
	String type;
	Boolean byTitre = false, byTag = false, byContenu = false;
	List<String> contenuList = new ArrayList<String>();
	Integer currentContenu = -1, contentToWrite = -1;
	Boolean contenuWritten = false;

	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		try {
			String[] elems = value.toString().split(";");
			String[] groups = elems[8].split(",");
			String subFound = "";

			if (this.first) {
				this.originalSearch = context.getConfiguration().get("q").toLowerCase();
				this.search = Utils.stripAccents(this.originalSearch).split(" ");
				this.idGroup = context.getConfiguration().get("idgroup");
				this.type = context.getConfiguration().get("type");

				if (this.type.charAt(0) == '1') { this.byTitre = true;}
				if (this.type.charAt(1) == '1') { this.byTag = true;}
				if (this.type.charAt(2) == '1') { this.byContenu = true;}
				this.first = false;
			}

			if (Arrays.asList(groups).contains(this.idGroup)) {
				if (Integer.parseInt(elems[1]) != this.currentContenu) {
					this.currentContenu = Integer.parseInt(elems[1]);
					if (this.contenuWritten == true) {
						for (int i = 0; i < this.contenuList.size(); i++) {
							word.set(this.contenuList.get(i) + ";" + "");
							context.write(new IntWritable(Integer.parseInt(elems[2])), word);
						}
					}
					this.contenuWritten = false;
					this.contenuList = new ArrayList<String>();
					this.contenuList.add(value.toString());
				} else {
					this.contenuList.add(value.toString());
				}
				if (this.byTitre) { // Search on title
					int found = 0;

					for (int i = 0; i < this.search.length; i++) {
						if (Utils.stripAccents(elems[3].toLowerCase()).contains(this.search[i]))
							found++;
					}
					if (found == this.search.length) {
						subFound += "titre:" + (Integer) (this.originalSearch.length() * 100 / elems[3].length()) + ",";
					}
				}
				if (this.byContenu) { // Search on content
					int found = 0;

					for (int i = 0; i < this.search.length; i++) {
						if (Utils.stripAccents(elems[6].toLowerCase()).contains(this.search[i]))
							found++;
					}
					if (found == this.search.length) {
						subFound += "contenu:" + (Integer) (this.originalSearch.length() * 100 / elems[6].length()) + ",";
					}
				}
				if (this.byTag) { // Search on tags
					int found = 0;

					for (int i = 0; i < this.search.length; i++) {
						if (Utils.stripAccents(elems[5].toLowerCase()).contains(this.search[i]))
							found++;
					}
					if (found > 0) {
						subFound += "tag:" + (Integer) (this.originalSearch.length() * 100 / elems[5].length()) + ",";
					}
				}
				if (subFound.length() > 0) {
					word.set(value + ";" + subFound);
					context.write(new IntWritable(Integer.parseInt(elems[2])), word);
					this.contenuWritten = true;
				}
			}
		} catch (Exception e) {
			if (this.showError) {
				word.set(e.toString() + " => " + value.toString());
				context.write(new IntWritable(-1), word);
			}
		}
	}
}
	package hadoop;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;

	import org.apache.hadoop.io.IntWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Mapper;

	import utils.Utils;

	public class CSVMapper extends Mapper<Object, Text, IntWritable, Text> {
	private Text word = new Text();
	String[] search = null;
	String originalSearch = null;
	String idGroup = null;
	Boolean showError = false;
	Boolean first = true;
	String type;
	Boolean byTitre = false, byTag = false, byContenu = false;
	List<String> contenuList = new ArrayList<String>();
	Integer currentContenu = -1, contentToWrite = -1;
	Boolean contenuWritten = false;

	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
	try {
	String[] elems = value.toString().split(";");
	String[] groups = elems[8].split(",");
	String subFound = "";

	if (this.first) {
	this.originalSearch = context.getConfiguration().get("q").toLowerCase();
	this.search = Utils.stripAccents(this.originalSearch).split(" ");
	this.idGroup = context.getConfiguration().get("idgroup");
	this.type = context.getConfiguration().get("type");

	if (this.type.charAt(0) == '1') { this.byTitre = true;}
	if (this.type.charAt(1) == '1') { this.byTag = true;}
	if (this.type.charAt(2) == '1') { this.byContenu = true;}
	this.first = false;
	}

	if (Arrays.asList(groups).contains(this.idGroup)) {
	if (Integer.parseInt(elems[1]) != this.currentContenu) {
	this.currentContenu = Integer.parseInt(elems[1]);
	if (this.contenuWritten == true) {
	for (int i = 0; i < this.contenuList.size(); i++) {
	word.set(this.contenuList.get(i) + ";" + "");
	context.write(new IntWritable(Integer.parseInt(elems[2])), word);
	}
	}
	this.contenuWritten = false;
	this.contenuList = new ArrayList<String>();
	this.contenuList.add(value.toString());
	} else {
	this.contenuList.add(value.toString());
	}
	if (this.byTitre) { // Search on title
	int found = 0;

	for (int i = 0; i < this.search.length; i++) {
	if (Utils.stripAccents(elems[3].toLowerCase()).contains(this.search[i]))
	found++;
	}
	if (found == this.search.length) {
	subFound += "titre:" + (Integer) (this.originalSearch.length() * 100 / elems[3].length()) + ",";
	}
	}
	if (this.byContenu) { // Search on content
	int found = 0;

	for (int i = 0; i < this.search.length; i++) {
	if (Utils.stripAccents(elems[6].toLowerCase()).contains(this.search[i]))
	found++;
	}
	if (found == this.search.length) {
	subFound += "contenu:" + (Integer) (this.originalSearch.length() * 100 / elems[6].length()) + ",";
	}
	}
	if (this.byTag) { // Search on tags
	int found = 0;

	for (int i = 0; i < this.search.length; i++) {
	if (Utils.stripAccents(elems[5].toLowerCase()).contains(this.search[i]))
	found++;
	}
	if (found > 0) {
	subFound += "tag:" + (Integer) (this.originalSearch.length() * 100 / elems[5].length()) + ",";
	}
	}
	if (subFound.length() > 0) {
	word.set(value + ";" + subFound);
	context.write(new IntWritable(Integer.parseInt(elems[2])), word);
	this.contenuWritten = true;
	}
	}
	} catch (Exception e) {
	if (this.showError) {
	word.set(e.toString() + " => " + value.toString());
	context.write(new IntWritable(-1), word);
	}
	}
	}
	}