Skip to content

Instantly share code, notes, and snippets.

@daramcq
Created June 6, 2012 18:03
Show Gist options
  • Save daramcq/2883635 to your computer and use it in GitHub Desktop.
Save daramcq/2883635 to your computer and use it in GitHub Desktop.
RecDep Information Retrieval - Search documents based on keywords
/***********************************************
* RecDep - Information Retrieval System *
* @author - Dara McHugh *
* dara.mchugh22@student.computing.dcu.ie *
* *
* @version - 1.6 *
* @since - 2012 - 05 - 8 *
Screencast Available at: *
http://student.computing.dcu.ie/~mchugd22/java_ir.ogv
* *
* NB - additional DocRank and DocEntry classes
* are included at bottom for reference *
* *
***********************************************/
import java.util.*;
import java.io.*;
import org.tartarus.snowball.ext.*;
import org.tartarus.snowball.*;
public class RecDep
{
/**
Create a new Term Index - empty
*/
public static HashMap <String, ArrayList<DocEntry>> createTermIndex()
{
// Create a Hash Map to store input in format <Key = String, Value = ArrayList <DocEntry> >
HashMap <String, ArrayList <DocEntry>> termIndex = new HashMap<String, ArrayList <DocEntry>>();
return termIndex;
}
/**
Create a Document Index - empty
*/
public static HashMap <String, Integer> createDocIndex()
{
// Create a HashMap to store input in format Title (String), Wordcount (Integer)
HashMap <String, Integer> docIndex = new HashMap <String, Integer>();
return docIndex;
}
/**
Preprocess terms - Convert to lowercase, remove punctuation and stem
@param term term to be preprocessed
@return term term after being processed
*/
public static String preprocess(String term)
{
term = term.toLowerCase();
term = term.replaceAll("\\p{Punct}", "");
// term = stemTerm(term);
return term;
}
/**
Add a term into the TermList.
@param termList the Index of terms to be added to
@param term the term to be added
@param parentDoc the name of the term origin document
@param pos the position of the term within origin document
*/
public static void addTerm(HashMap <String, ArrayList<DocEntry> >termList, String term, String parentDoc, int pos)
{
// Check if word is in the Term Index already
if (termList.containsKey(term))
{
// Get the DocEntry ArrayList so it can be searched
ArrayList <DocEntry> postingList = termList.get(term);
// In each one, check if the title element of the DocEntry equals that we are given
for (int i=0; i<postingList.size(); i++)
{
// If it does, add the position of the term to the WDP Array & Exit
if (postingList.get(i).getParentDoc().equals(parentDoc))
{
postingList.get(i).addPosition(pos);
// Put the modified DocEntry Array into the correct bucket
termList.put(term, postingList);
return;
}
}
// If Document is not present in DocEntry ArrayList, create a new DocEntry
ArrayList <Integer> positions = new ArrayList<Integer>();
positions.add(pos);
DocEntry newDoc = new DocEntry(parentDoc, positions);
// Add this DocEntry to the PostingList
postingList.add(newDoc);
// Add modified PostingList to TermList and Exit
termList.put(term,postingList);
return;
}
else
{
// If it is not, create a DocEntry for this file (Title, Term Position)
ArrayList <Integer> positions = new ArrayList<Integer>();
positions.add(pos);
DocEntry newDoc = new DocEntry(parentDoc, positions);
// Create an array of DocEntries & Add first DocEntry
ArrayList<DocEntry> postingList = new ArrayList<DocEntry>();
postingList.add(newDoc);
// Put this array as new value into TermList
termList.put(term,postingList);
}
}
/**
Add a new document to the Term Index
@param termList the index of terms
@param docIndex the index of parent documents
@param textFile the document to be added
*/
public static void addDocument(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, File textFile)
{
try
{
// Start Scanning the document
Scanner sc = new Scanner(textFile);
int posCount = 0;
String parentDoc = textFile.getName();
while (sc.hasNext())
{
// Preprocess and add term to TermList
String term = sc.next();
term = preprocess(term);
addTerm(termList, term, parentDoc, posCount);
posCount++;
}
docIndex.put(parentDoc, posCount);
// If DocIndex contains Total value, increment it
if (docIndex.containsKey("total"))
{
int currentTotal = docIndex.get("total");
int newTotal = currentTotal + posCount;
docIndex.put("total",newTotal);
}
// Otherwise, create a new one and initialise at posCount
else
docIndex.put("total", posCount);
}
catch(Exception e)
{
}
}
/**
Add an array of documents to the Term Index
@param termList the index of terms
@param docIndex the index of parent documents
@param textFiles the array of documents to be added
*/
public static void addDocumentArray(HashMap<String, ArrayList<DocEntry>> termList, HashMap<String, Integer> docIndex ,File[] textFiles)
{
for (int i=0; i<textFiles.length ; i++)
{
addDocument(termList, docIndex, textFiles[i]);
}
}
public static String stemTerm(String term)
{
try
{
// This implementation of Porter2 (Snowball) is based on: http://stackoverflow.com/questions/4397107/is-there-a-java-implementation-of-porter2-stemmer
String lang = "english";
Class extClass = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer");
SnowballStemmer stemmer = (SnowballStemmer) extClass.newInstance();
stemmer.setCurrent(term);
stemmer.stem();
term = stemmer.getCurrent();
}
catch (Exception e)
{
String s = e.getMessage();
if (s != null)
{
System.out.println(s);
}
System.err.print("F3x3rd");
}
return term;
}
/**
Modify a DocRanking List based on word frequency search of Posting List for a single term
@param termList the index of terms to be checked
@param term the term to be checked
@param docRanking the Ranking List to be modified
@return ArrayList<DocRank> modified Array List
*/
public static ArrayList <DocRank> freqDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking)
{
// If the term isn't found return negative message
if (!termList.containsKey(term))
System.out.println(term+ " not found");
else
{
ArrayList <DocEntry> postingList = termList.get(term);
boolean contains = false;
// We will iterate through all the Doc Entries of the Posting List
for (int i = 0; i < postingList.size(); i++)
{
int j=0;
// We will check if each Doc Entry already exists in Doc Ranking ArrayList
while (!contains && j<docRanking.size())
{
// Checking based on name
if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
contains=true;
else
j++;
}
// If DocRanking list contains corresponding document, increase its Word Score
if (contains)
{
int increase = postingList.get(i).getTermFrequency();
docRanking.get(j).increaseWordScore(increase);
}
// Otherwise, create a new Doc Rank object
else
{
// Get the name of the Parent Document and term frequency
String parentDoc = postingList.get(i).getParentDoc();
int wdf = postingList.get(i).getTermFrequency();
// Create a new DocRank object with these variables & add to ArrayList
DocRank dr = new DocRank(parentDoc, wdf);
docRanking.add(i, dr);
}
}
}
return docRanking;
}
/**
Modify a DocRanking List based on occurrence search of Posting List for a single term
@param termList the index of terms to be checked
@param term the term to be checked
@param docRanking the Ranking List to be modified
@return ArrayList<DocRank> modified Array List
*/
public static ArrayList <DocRank> occDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking)
{
// If the term isn't found return negative message
if (!termList.containsKey(term))
System.out.println(term+ " not found");
else
{
ArrayList <DocEntry> postingList = termList.get(term);
boolean contains = false;
// We will iterate through all the Doc Entries of the Posting List
for (int i = 0; i < postingList.size(); i++)
{
int j=0;
// We will check if each Doc Entry already exists in Doc Ranking ArrayList
while (!contains && j<docRanking.size())
{
// Checking based on name
if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
contains=true;
else
j++;
}
// If DocRanking list contains corresponding document, increase its Word Score
if (contains)
{
docRanking.get(j).increaseWordScore(1);
}
// Otherwise, create a new Doc Rank object
else
{
// Get the name of the Parent Document and term frequency
String parentDoc = postingList.get(i).getParentDoc();
// Create a new DocRank object with these variables & add to ArrayList
DocRank dr = new DocRank(parentDoc, 1);
docRanking.add(i, dr);
}
}
}
return docRanking;
}
/**
Modify a DocRanking List based on weighted search of Posting List for a single term
@param termList the index of terms to be checked
@param term the term to be checked
@param docRanking the Ranking List to be modified
@return ArrayList<DocRank> modified Array List
*/
public static ArrayList <DocRank> weightedDocRank(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String term, ArrayList<DocRank> docRanking)
{
// If the term isn't found return negative message
if (!termList.containsKey(term))
System.out.println(term+ " not found");
else
{
ArrayList <DocEntry> postingList = termList.get(term);
boolean contains = false;
// We will iterate through all the Doc Entries of the Posting List
for (int i = 0; i < postingList.size(); i++)
{
int j=0;
// We will check if each Doc Entry already exists in Doc Ranking ArrayList
while (!contains && j<docRanking.size())
{
// Checking based on name
if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
contains=true;
else
j++;
}
// If DocRanking list contains corresponding document, increase its Word Score
if (contains)
{
// Get the Collection Frequency Weight of the term
// CFW(t) = log(N/n)
double totalDocs;
totalDocs= docIndex.size();
double postingSize;
postingSize = postingList.size();
double prelogWeight = totalDocs/postingSize;
double weight = Math.log(prelogWeight);
// Get the frequency of the term, divide by normalised document length
int wdf = postingList.get(i).getTermFrequency();
double avgLength;
avgLength = docIndex.get("total")/(docIndex.size()-1);
String title = docRanking.get(j).getParentDoc();
int wordLength = docIndex.get(title);
double ndl = wordLength/avgLength;
double termWeight = (wdf/ndl)*weight;
docRanking.get(j).increaseWordScore(termWeight);
}
// Otherwise, create a new Doc Rank object
else
{
// Get the Collection Frequency Weight of the term
// CFW(t) = log(N/n) i.e. Total Number of docs / Size of Posting List
double totalDocs;
totalDocs= docIndex.size();
double postingSize;
postingSize = postingList.size();
double prelogWeight = totalDocs/postingSize;
double weight = Math.log(prelogWeight);
// Get the name of the Parent Document and term frequency
String parentDoc = postingList.get(i).getParentDoc();
int wdf = postingList.get(i).getTermFrequency();
// Calculate the average length of documents in Doc Index
double avgLength;
avgLength = docIndex.get("total")/(docIndex.size()-1);
int wordLength = docIndex.get(parentDoc);
double ndl = wordLength/avgLength;
// Weight of this term's occurence = (freq/normalised doc length) * weight of this term
double termWeight = (wdf/ndl)*weight;
// Create a new DocRank object with these variables & add to ArrayList
DocRank dr = new DocRank(parentDoc, termWeight);
docRanking.add(i, dr);
}
}
}
return docRanking;
}
/** A Comparator for DocRank objects.
This is adapted from a similar method found here:
http://thejavablog.wordpress.com/2010/02/16/how-to-sort-a-list-of-objects/
@param dr1 ranking to be compared
@param dr2 ranking to be compared
@return compare result of comparison
*/
private static Comparator<DocRank> rankSorter = new Comparator<DocRank>()
{
public int compare(DocRank dr1, DocRank dr2)
{
double result = dr1.getWordScore()- dr2.getWordScore();
if (result>0)
return 1;
else if (result==0)
return 0;
else
return -1;
}
};
/**
Get Doc Ranking for array of terms
@param termList the Term Index to be searched
@param docIndex the Index of Documents that corresponds
@param terms the array of terms to be searched for
@param searchType the type of search to be done
@return a string listing of document names
*/
public static String search(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String[] terms, String searchType)
{
ArrayList <DocRank> docRanking = new ArrayList<DocRank>();
// Do a Doc Ranking process for each of the terms in the array
for (int i=0; i<terms.length; i++)
{
String term = preprocess(terms[i]);
// If the search type selected is wordfreq, do this search
if (searchType.equals("wordFreq"))
{
docRanking = freqDocRank(termList, term, docRanking);
}
// Otherwise, do a word occurrence search
else if (searchType.equals("wordOcc"))
{
docRanking = occDocRank(termList,term,docRanking);
}
else
{
docRanking = weightedDocRank(termList, docIndex, term, docRanking);
}
}
// Sort the completed DocRanking array
Collections.sort(docRanking, rankSorter);
String result ="";
// Print out the various terms in descending order
for (int i=docRanking.size()-1; i>0; i--)
{
result+= docRanking.get(i).getParentDoc() +"\n";
}
return result;
}
public static void main(String[] args)
{
HashMap <String, ArrayList<DocEntry>> termIndex = createTermIndex();
HashMap <String, Integer> docIndex = createDocIndex();
File f1 = new File("BBC_2012_5_1_NATO_Afghanistan.txt");
File f2 = new File("BBC_2012_5_1_News_Corp.txt");
File f3 = new File("ATIMES_2012_4_28_BRIC_Future.txt");
File f4 = new File("ATIMES_2012_5_2_Afghanistan_peace.txt");
File f5 = new File("ATIMES_2012_5_2_Uzbekistan_Angren.txt");
File f6 = new File("BBC_2012_3_2_NATO_Afghanistan.txt");
File f7 = new File("BBC_2012_5_1_US_Afghanistan.txt");
File f8 = new File("Guardian_2012_5_1_Al_Qaida_Yemen.txt");
File f9 = new File("Guardian_2012_5_1_Germany_Spain_austerity.txt");
File f10 = new File("Guardian_2012_5_1_News_Corp.txt");
File f11 = new File("Guardian_2012_5_1_stock_exchange_protest.txt");
File f12 = new File("Independent_IE_2012_4_28_NAMA_Dalymount.txt");
File f13 = new File("Independent_IE_2012_5_1_Referendum_threats.txt");
File f14 = new File( "RTE_2009_9_29_OECD_unemployment.txt");
File f15 = new File("RTE_2012_5_1_Rupert_Murdoch.txt");
File f16 = new File("RTE_2012_5_1_FF_Referendum.txt");
File[] documents = { f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16 };
// File cranfield = new File("Cranfield");
// File[] testDocs = cranfield.listFiles();
addDocumentArray(termIndex, docIndex, documents);
// addDocumentArray(termIndex, docIndex, testDocs);
String[] searchTerm;
String searchType ="";
// If user wants to do a special search, identify it and remove term
if (args[0].equals("wordOcc") || args[0].equals("wordFreq"))
{
if (args[0].equals("wordOcc"))
{
searchType = "wordOcc";
}
else
{
searchType = "wordFreq";
}
searchTerm = new String[args.length-1];
for (int i = 0; i<args.length-1; i++)
{
searchTerm[i]=args[i+1];
}
}
else
{
searchTerm=args;
}
System.out.println(search(termIndex, docIndex, searchTerm, searchType));
}
}
/**************************************
DocEntry Class
- A class for storing information about term positions in documents
* @author - Dara McHugh *
* dara.mchugh22@student.computing.dcu.ie *
* *
* @version - 1.6 *
* @since - 2012 - 05 - 8
*/
import java.util.*;
public class DocEntry
{
private String parentDoc; // Title String of parent document
private ArrayList <Integer> withinDocPos; // Int Array List of positions in doc where term appears
/**
Create a new DocEntry object
@param parentDoc the name of the origin document
@param withinDocPos the list of positions term occurs in
@return DocEntry object
*/
public DocEntry(String parentDoc, ArrayList <Integer> withinDocPos)
{
this.parentDoc = parentDoc;
this.withinDocPos = withinDocPos;
}
/**
Get name of origin document
@return Origin document name
*/
public String getParentDoc()
{
return this.parentDoc;
}
/**
Get the list of positions
@return list of positions
*/
public ArrayList<Integer> getPositions()
{
return this.withinDocPos;
}
/**
Add a new term position to list
@param pos the position to be added
*/
public void addPosition(int pos)
{
withinDocPos.add(pos);
}
/**
Get the term frequency for this document
@return size of positions list
*/
public int getTermFrequency()
{
return this.withinDocPos.size();
}
}
/***********************************
* DocRank class
* A class for storing information about
* ranking of documents
* @author - Dara McHugh *
* dara.mchugh22@student.computing.dcu.ie *
* *
* @version - 1.6 *
* @since - 2012 - 05 - 8
***********************************/
import java.util.*;
public class DocRank
{
private String parentDoc;
private double wordScore;
/**
Create a new Doc Rank object
@param parentDoc name of origin document
@param wordScore value of ranking score
*/
public DocRank(String parentDoc, double wordScore)
{
this.parentDoc = parentDoc;
this.wordScore = wordScore;
}
/**
Create a new Doc Rank object
@param parentDoc name of origin document
*/
public DocRank(String parentDoc)
{
this.parentDoc = parentDoc;
this.wordScore = 0;
}
/**
Get the name of the origin document
@return name of origin document
*/
public String getParentDoc()
{
return parentDoc;
}
/**
Get the Ranking Score of Entry
@return Ranking Score
*/
public double getWordScore()
{
return wordScore;
}
/**
Increase the Ranking Score of Entry
@param increase the integer amount to increase by
*/
public void increaseWordScore(int increase)
{
double doubInc = (double)increase;
this.wordScore += doubInc;
}
/**
Increase the Ranking Score of Entry
@param increase the double amount to increase by
*/
public void increaseWordScore(double increase)
{
this.wordScore += increase;
}
/**
A comparable method. Deprecated
@param dr Doc Rank to compare with
@return 1 if current object is greater, 0 if equal, -1 if smaller
*/
public int compareTo(DocRank dr)
{
if (this.getWordScore()>dr.getWordScore())
return 1;
else if (this.getWordScore()==dr.getWordScore())
return 0;
else
return -1;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment