Skip to content

Instantly share code, notes, and snippets.

@amalgjose
Created November 24, 2014 09:18
Embed
What would you like to do?
MapReduce program for removing stop words from the given text files. The Hadoop distributed cache and counters are used in this program.
package com.hadoop.skipper;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map-only task that drops stop words from its input.
 *
 * <p>During {@code setup} every file returned by
 * {@code context.getLocalCacheFiles()} is read line by line into an in-memory
 * set of stop words. {@code map} then splits each input line on whitespace,
 * emits every token that is not in that set as the output key (with a
 * {@link NullWritable} value), and increments the
 * {@code StopWordSkipper.COUNTERS} counters for skipped and emitted words.
 */
public class SkipMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    /** Reused output key, so no new Text is allocated per token. */
    private Text word = new Text();

    /** Stop words loaded in setup; tokens found here are not emitted. */
    private Set<String> stopWordList = new HashSet<String>();

    /**
     * Loads the stop words from every locally cached file.
     *
     * <p>Failures are reported on stderr and otherwise ignored (best effort):
     * the task then runs with whatever stop words were loaded so far.
     *
     * @param context task context used to look up the cached files
     * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @SuppressWarnings("deprecation")
    protected void setup(Context context) throws java.io.IOException,
            InterruptedException {
        try {
            Path[] stopWordFiles = context.getLocalCacheFiles();
            // Guard before touching the array: nothing to load when no
            // cache file is available.
            if (stopWordFiles == null || stopWordFiles.length == 0) {
                return;
            }
            for (Path stopWordFile : stopWordFiles) {
                System.out.println("Reading stop word file: " + stopWordFile);
                readStopWordFile(stopWordFile);
            }
        } catch (IOException e) {
            System.err.println("Exception reading stop word file: " + e);
        }
    }

    /**
     * Reads one stop word file and adds each non-blank line, trimmed, to the
     * stop word set. I/O errors are reported on stderr and not rethrown.
     *
     * @param stopWordFile local path of the stop word file
     */
    private void readStopWordFile(Path stopWordFile) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(stopWordFile.toString()));
            String stopWord;
            while ((stopWord = reader.readLine()) != null) {
                // Tokens produced in map() never contain whitespace, so an
                // untrimmed or blank entry could never match anything.
                String trimmed = stopWord.trim();
                if (trimmed.length() > 0) {
                    stopWordList.add(trimmed);
                }
            }
        } catch (IOException ioe) {
            System.err.println("Exception while reading stop word file '"
                    + stopWordFile + "' : " + ioe.toString());
        } finally {
            // Always release the file handle, even when reading failed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
    }

    /**
     * Tokenizes the input line on whitespace and emits every token that is
     * not a stop word, counting skipped and emitted tokens.
     *
     * @param key     input key (not used)
     * @param value   one line of input text
     * @param context task context used for counters and output
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN,
     *      org.apache.hadoop.mapreduce.Mapper.Context)
     */
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (stopWordList.contains(token)) {
                context.getCounter(StopWordSkipper.COUNTERS.STOPWORDS)
                        .increment(1L);
            } else {
                context.getCounter(StopWordSkipper.COUNTERS.GOODWORDS)
                        .increment(1L);
                word.set(token);
                // Emit the NullWritable singleton rather than a null
                // reference so the value matches the declared output type.
                context.write(word, NullWritable.get());
            }
        }
    }
}
package com.hadoop.skipper;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Driver for the stop word removal job.
 *
 * <p>Usage: {@code StopWordSkipper [generic options] [-skip <stopWordFile>]...
 * <input> <output>}. Each {@code -skip} path is registered with the
 * distributed cache; the first two remaining arguments are used as the input
 * and output paths. The job runs {@code SkipMapper} with zero reducers and
 * prints the GOODWORDS / STOPWORDS counter values when it finishes.
 */
@SuppressWarnings("deprecation")
public class StopWordSkipper {
    /** Job counters incremented by the mapper for each token it sees. */
    public enum COUNTERS {
        STOPWORDS,
        GOODWORDS
    }

    /**
     * Configures, submits and waits for the job.
     *
     * @param args command line arguments, see the class description
     * @throws IllegalArgumentException if {@code -skip} has no path after it
     *         or the input/output paths are missing
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        args = parser.getRemainingArgs();
        Job job = new Job(conf, "StopWordSkipper");
        job.setOutputKeyClass(Text.class);
        // SkipMapper is declared with a NullWritable output value, so the
        // job's output value class has to agree with it.
        job.setOutputValueClass(NullWritable.class);
        job.setJarByClass(StopWordSkipper.class);
        job.setMapperClass(SkipMapper.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Split the arguments: the value after each -skip option is a stop
        // word file for the distributed cache, everything else is positional.
        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < args.length; i++) {
            if ("-skip".equals(args[i])) {
                if (i + 1 >= args.length) {
                    throw new IllegalArgumentException(
                            "-skip requires a stop word file path");
                }
                DistributedCache.addCacheFile(new Path(args[++i]).toUri(),
                        job.getConfiguration());
            } else {
                otherArgs.add(args[i]);
            }
        }
        if (otherArgs.size() < 2) {
            throw new IllegalArgumentException(
                    "Usage: StopWordSkipper [-skip <stopWordFile>]... <input> <output>");
        }

        FileInputFormat.setInputPaths(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
        boolean succeeded = job.waitForCompletion(true);
        Counters counters = job.getCounters();
        System.out.printf("Good Words: %d, Stop Words: %d\n",
                counters.findCounter(COUNTERS.GOODWORDS).getValue(),
                counters.findCounter(COUNTERS.STOPWORDS).getValue());
        // Surface a failed job to the calling shell instead of exiting 0.
        if (!succeeded) {
            System.exit(1);
        }
    }
}
@Farheen2302
Copy link

When I try to run the above program, I get the following error. How can I fix it?

The method getLocalCacheFiles() is undefined for the type Mapper<LongWritable,Text,Text,NullWritable>.Context

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment