Skip to content

Instantly share code, notes, and snippets.

@dbasch
Created November 30, 2012 16:18
Show Gist options
  • Save dbasch/4176756 to your computer and use it in GitHub Desktop.
Save dbasch/4176756 to your computer and use it in GitHub Desktop.
Quick and dirty text indexer
import java.io.File;
import java.util.regex.Pattern;
import java.util.Scanner;
import com.google.common.collect.ArrayListMultimap;
/**
 * Creates in-memory mappings from words to the files that contain them.
 *
 * <p>Words are obtained by lower-casing a file's contents and splitting them on runs of
 * whitespace and the punctuation characters {@code # & ! : , ; . \ + -}. The index keeps one
 * entry per <em>occurrence</em>, so a file name appears under a word as many times as the word
 * occurs in that file.
 */
public class Indexer {

    /** Runs of whitespace or of the characters {@code # & ! : , ; . \ + -} separate words. */
    private static final Pattern WORD_SEPARATORS = Pattern.compile("[\\s#&!:,;\\.\\\\+-]+");

    /**
     * Indexes every regular file directly inside {@code dirName} (sub-directories are skipped,
     * not descended into).
     *
     * @param dirName path of the directory whose files should be indexed
     * @return a multimap from each lower-cased word to the names (not full paths) of the files
     *         containing it, one entry per occurrence; empty if the directory holds no words
     * @throws java.io.IOException if {@code dirName} is not a readable directory or a file
     *         cannot be read
     */
    public static ArrayListMultimap<String,String> buildIndex(String dirName) throws java.io.IOException {
        ArrayListMultimap<String,String> map = ArrayListMultimap.create();

        // listFiles() returns null (rather than throwing) when the path is not a directory
        // or cannot be listed; report that instead of failing with a NullPointerException.
        File[] files = new File(dirName).listFiles();
        if (files == null) {
            throw new java.io.FileNotFoundException(dirName + " is not a readable directory");
        }

        for (File f : files) {
            // Sub-directories and other non-regular entries cannot be opened by Scanner.
            if (!f.isFile()) {
                continue;
            }
            // Locale.ROOT keeps the index keys identical regardless of the default locale.
            String contents = readContents(f).toLowerCase(java.util.Locale.ROOT);
            for (String word : WORD_SEPARATORS.split(contents)) {
                // split() yields a leading "" when the text starts with a separator.
                if (!word.isEmpty()) {
                    map.put(word, f.getName());
                }
            }
        }
        return map;
    }

    /** Reads the whole file as one string; returns "" for an empty file. Always closes the file. */
    private static String readContents(File f) throws java.io.IOException {
        Scanner scanner = new Scanner(f);
        try {
            // "\\Z" (end of input) as delimiter makes the first token span the entire file.
            scanner.useDelimiter("\\Z");
            String contents = scanner.hasNext() ? scanner.next() : "";
            // Scanner swallows read errors; surface the last one instead of indexing partial data.
            java.io.IOException failure = scanner.ioException();
            if (failure != null) {
                throw failure;
            }
            return contents;
        } finally {
            scanner.close();
        }
    }

    /**
     * For testing: indexes the directory {@code args[0]} and prints the files containing the
     * word {@code args[1]}.
     */
    public static void main(String args[]) throws java.io.IOException {
        if (args.length < 2) {
            System.err.println("usage: java Indexer <directory> <word>");
            System.exit(1);
        }
        ArrayListMultimap<String,String> index = Indexer.buildIndex(args[0]);
        // Index keys are lower-cased, so normalise the query the same way.
        System.out.println(index.get(args[1].toLowerCase(java.util.Locale.ROOT)));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment