Created
June 6, 2012 18:03
-
-
Save daramcq/2883635 to your computer and use it in GitHub Desktop.
RecDep Information Retrieval - Search documents based on keywords
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*********************************************** | |
* RecDep - Information Retrieval System * | |
* @author - Dara McHugh * | |
* dara.mchugh22@student.computing.dcu.ie * | |
* * | |
* @version - 1.6 * | |
* @since - 2012 - 05 - 8 * | |
Screencast Available at: * | |
http://student.computing.dcu.ie/~mchugd22/java_ir.ogv | |
* * | |
* NB - additional DocRank and DocEntry classes | |
* are included at bottom for reference * | |
* * | |
***********************************************/ | |
import java.util.*; | |
import java.io.*; | |
import org.tartarus.snowball.ext.*; | |
import org.tartarus.snowball.*; | |
public class RecDep | |
{ | |
/** | |
Create a new Term Index - empty | |
*/ | |
public static HashMap <String, ArrayList<DocEntry>> createTermIndex() | |
{ | |
// Create a Hash Map to store input in format <Key = String, Value = ArrayList <DocEntry> > | |
HashMap <String, ArrayList <DocEntry>> termIndex = new HashMap<String, ArrayList <DocEntry>>(); | |
return termIndex; | |
} | |
/** | |
Create a Document Index - empty | |
*/ | |
public static HashMap <String, Integer> createDocIndex() | |
{ | |
// Create a HashMap to store input in format Title (String), Wordcount (Integer) | |
HashMap <String, Integer> docIndex = new HashMap <String, Integer>(); | |
return docIndex; | |
} | |
/** | |
Preprocess terms - Convert to lowercase, remove punctuation and stem | |
@param term term to be preprocessed | |
@return term term after being processed | |
*/ | |
public static String preprocess(String term) | |
{ | |
term = term.toLowerCase(); | |
term = term.replaceAll("\\p{Punct}", ""); | |
// term = stemTerm(term); | |
return term; | |
} | |
/** | |
Add a term into the TermList. | |
@param termList the Index of terms to be added to | |
@param term the term to be added | |
@param parentDoc the name of the term origin document | |
@param pos the position of the term within origin document | |
*/ | |
public static void addTerm(HashMap <String, ArrayList<DocEntry> >termList, String term, String parentDoc, int pos) | |
{ | |
// Check if word is in the Term Index already | |
if (termList.containsKey(term)) | |
{ | |
// Get the DocEntry ArrayList so it can be searched | |
ArrayList <DocEntry> postingList = termList.get(term); | |
// In each one, check if the title element of the DocEntry equals that we are given | |
for (int i=0; i<postingList.size(); i++) | |
{ | |
// If it does, add the position of the term to the WDP Array & Exit | |
if (postingList.get(i).getParentDoc().equals(parentDoc)) | |
{ | |
postingList.get(i).addPosition(pos); | |
// Put the modified DocEntry Array into the correct bucket | |
termList.put(term, postingList); | |
return; | |
} | |
} | |
// If Document is not present in DocEntry ArrayList, create a new DocEntry | |
ArrayList <Integer> positions = new ArrayList<Integer>(); | |
positions.add(pos); | |
DocEntry newDoc = new DocEntry(parentDoc, positions); | |
// Add this DocEntry to the PostingList | |
postingList.add(newDoc); | |
// Add modified PostingList to TermList and Exit | |
termList.put(term,postingList); | |
return; | |
} | |
else | |
{ | |
// If it is not, create a DocEntry for this file (Title, Term Position) | |
ArrayList <Integer> positions = new ArrayList<Integer>(); | |
positions.add(pos); | |
DocEntry newDoc = new DocEntry(parentDoc, positions); | |
// Create an array of DocEntries & Add first DocEntry | |
ArrayList<DocEntry> postingList = new ArrayList<DocEntry>(); | |
postingList.add(newDoc); | |
// Put this array as new value into TermList | |
termList.put(term,postingList); | |
} | |
} | |
/** | |
Add a new document to the Term Index | |
@param termList the index of terms | |
@param docIndex the index of parent documents | |
@param textFile the document to be added | |
*/ | |
public static void addDocument(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, File textFile) | |
{ | |
try | |
{ | |
// Start Scanning the document | |
Scanner sc = new Scanner(textFile); | |
int posCount = 0; | |
String parentDoc = textFile.getName(); | |
while (sc.hasNext()) | |
{ | |
// Preprocess and add term to TermList | |
String term = sc.next(); | |
term = preprocess(term); | |
addTerm(termList, term, parentDoc, posCount); | |
posCount++; | |
} | |
docIndex.put(parentDoc, posCount); | |
// If DocIndex contains Total value, increment it | |
if (docIndex.containsKey("total")) | |
{ | |
int currentTotal = docIndex.get("total"); | |
int newTotal = currentTotal + posCount; | |
docIndex.put("total",newTotal); | |
} | |
// Otherwise, create a new one and initialise at posCount | |
else | |
docIndex.put("total", posCount); | |
} | |
catch(Exception e) | |
{ | |
} | |
} | |
/** | |
Add an array of documents to the Term Index | |
@param termList the index of terms | |
@param docIndex the index of parent documents | |
@param textFiles the array of documents to be added | |
*/ | |
public static void addDocumentArray(HashMap<String, ArrayList<DocEntry>> termList, HashMap<String, Integer> docIndex ,File[] textFiles) | |
{ | |
for (int i=0; i<textFiles.length ; i++) | |
{ | |
addDocument(termList, docIndex, textFiles[i]); | |
} | |
} | |
public static String stemTerm(String term) | |
{ | |
try | |
{ | |
// This implementation of Porter2 (Snowball) is based on: http://stackoverflow.com/questions/4397107/is-there-a-java-implementation-of-porter2-stemmer | |
String lang = "english"; | |
Class extClass = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer"); | |
SnowballStemmer stemmer = (SnowballStemmer) extClass.newInstance(); | |
stemmer.setCurrent(term); | |
stemmer.stem(); | |
term = stemmer.getCurrent(); | |
} | |
catch (Exception e) | |
{ | |
String s = e.getMessage(); | |
if (s != null) | |
{ | |
System.out.println(s); | |
} | |
System.err.print("F3x3rd"); | |
} | |
return term; | |
} | |
/** | |
Modify a DocRanking List based on word frequency search of Posting List for a single term | |
@param termList the index of terms to be checked | |
@param term the term to be checked | |
@param docRanking the Ranking List to be modified | |
@return ArrayList<DocRank> modified Array List | |
*/ | |
public static ArrayList <DocRank> freqDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking) | |
{ | |
// If the term isn't found return negative message | |
if (!termList.containsKey(term)) | |
System.out.println(term+ " not found"); | |
else | |
{ | |
ArrayList <DocEntry> postingList = termList.get(term); | |
boolean contains = false; | |
// We will iterate through all the Doc Entries of the Posting List | |
for (int i = 0; i < postingList.size(); i++) | |
{ | |
int j=0; | |
// We will check if each Doc Entry already exists in Doc Ranking ArrayList | |
while (!contains && j<docRanking.size()) | |
{ | |
// Checking based on name | |
if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc())) | |
contains=true; | |
else | |
j++; | |
} | |
// If DocRanking list contains corresponding document, increase its Word Score | |
if (contains) | |
{ | |
int increase = postingList.get(i).getTermFrequency(); | |
docRanking.get(j).increaseWordScore(increase); | |
} | |
// Otherwise, create a new Doc Rank object | |
else | |
{ | |
// Get the name of the Parent Document and term frequency | |
String parentDoc = postingList.get(i).getParentDoc(); | |
int wdf = postingList.get(i).getTermFrequency(); | |
// Create a new DocRank object with these variables & add to ArrayList | |
DocRank dr = new DocRank(parentDoc, wdf); | |
docRanking.add(i, dr); | |
} | |
} | |
} | |
return docRanking; | |
} | |
/** | |
Modify a DocRanking List based on occurrence search of Posting List for a single term | |
@param termList the index of terms to be checked | |
@param term the term to be checked | |
@param docRanking the Ranking List to be modified | |
@return ArrayList<DocRank> modified Array List | |
*/ | |
public static ArrayList <DocRank> occDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking) | |
{ | |
// If the term isn't found return negative message | |
if (!termList.containsKey(term)) | |
System.out.println(term+ " not found"); | |
else | |
{ | |
ArrayList <DocEntry> postingList = termList.get(term); | |
boolean contains = false; | |
// We will iterate through all the Doc Entries of the Posting List | |
for (int i = 0; i < postingList.size(); i++) | |
{ | |
int j=0; | |
// We will check if each Doc Entry already exists in Doc Ranking ArrayList | |
while (!contains && j<docRanking.size()) | |
{ | |
// Checking based on name | |
if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc())) | |
contains=true; | |
else | |
j++; | |
} | |
// If DocRanking list contains corresponding document, increase its Word Score | |
if (contains) | |
{ | |
docRanking.get(j).increaseWordScore(1); | |
} | |
// Otherwise, create a new Doc Rank object | |
else | |
{ | |
// Get the name of the Parent Document and term frequency | |
String parentDoc = postingList.get(i).getParentDoc(); | |
// Create a new DocRank object with these variables & add to ArrayList | |
DocRank dr = new DocRank(parentDoc, 1); | |
docRanking.add(i, dr); | |
} | |
} | |
} | |
return docRanking; | |
} | |
/** | |
Modify a DocRanking List based on weighted search of Posting List for a single term | |
@param termList the index of terms to be checked | |
@param term the term to be checked | |
@param docRanking the Ranking List to be modified | |
@return ArrayList<DocRank> modified Array List | |
*/ | |
public static ArrayList <DocRank> weightedDocRank(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String term, ArrayList<DocRank> docRanking) | |
{ | |
// If the term isn't found return negative message | |
if (!termList.containsKey(term)) | |
System.out.println(term+ " not found"); | |
else | |
{ | |
ArrayList <DocEntry> postingList = termList.get(term); | |
boolean contains = false; | |
// We will iterate through all the Doc Entries of the Posting List | |
for (int i = 0; i < postingList.size(); i++) | |
{ | |
int j=0; | |
// We will check if each Doc Entry already exists in Doc Ranking ArrayList | |
while (!contains && j<docRanking.size()) | |
{ | |
// Checking based on name | |
if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc())) | |
contains=true; | |
else | |
j++; | |
} | |
// If DocRanking list contains corresponding document, increase its Word Score | |
if (contains) | |
{ | |
// Get the Collection Frequency Weight of the term | |
// CFW(t) = log(N/n) | |
double totalDocs; | |
totalDocs= docIndex.size(); | |
double postingSize; | |
postingSize = postingList.size(); | |
double prelogWeight = totalDocs/postingSize; | |
double weight = Math.log(prelogWeight); | |
// Get the frequency of the term, divide by normalised document length | |
int wdf = postingList.get(i).getTermFrequency(); | |
double avgLength; | |
avgLength = docIndex.get("total")/(docIndex.size()-1); | |
String title = docRanking.get(j).getParentDoc(); | |
int wordLength = docIndex.get(title); | |
double ndl = wordLength/avgLength; | |
double termWeight = (wdf/ndl)*weight; | |
docRanking.get(j).increaseWordScore(termWeight); | |
} | |
// Otherwise, create a new Doc Rank object | |
else | |
{ | |
// Get the Collection Frequency Weight of the term | |
// CFW(t) = log(N/n) i.e. Total Number of docs / Size of Posting List | |
double totalDocs; | |
totalDocs= docIndex.size(); | |
double postingSize; | |
postingSize = postingList.size(); | |
double prelogWeight = totalDocs/postingSize; | |
double weight = Math.log(prelogWeight); | |
// Get the name of the Parent Document and term frequency | |
String parentDoc = postingList.get(i).getParentDoc(); | |
int wdf = postingList.get(i).getTermFrequency(); | |
// Calculate the average length of documents in Doc Index | |
double avgLength; | |
avgLength = docIndex.get("total")/(docIndex.size()-1); | |
int wordLength = docIndex.get(parentDoc); | |
double ndl = wordLength/avgLength; | |
// Weight of this term's occurence = (freq/normalised doc length) * weight of this term | |
double termWeight = (wdf/ndl)*weight; | |
// Create a new DocRank object with these variables & add to ArrayList | |
DocRank dr = new DocRank(parentDoc, termWeight); | |
docRanking.add(i, dr); | |
} | |
} | |
} | |
return docRanking; | |
} | |
/** A Comparator for DocRank objects. | |
This is adapted from a similar method found here: | |
http://thejavablog.wordpress.com/2010/02/16/how-to-sort-a-list-of-objects/ | |
@param dr1 ranking to be compared | |
@param dr2 ranking to be compared | |
@return compare result of comparison | |
*/ | |
private static Comparator<DocRank> rankSorter = new Comparator<DocRank>() | |
{ | |
public int compare(DocRank dr1, DocRank dr2) | |
{ | |
double result = dr1.getWordScore()- dr2.getWordScore(); | |
if (result>0) | |
return 1; | |
else if (result==0) | |
return 0; | |
else | |
return -1; | |
} | |
}; | |
/** | |
Get Doc Ranking for array of terms | |
@param termList the Term Index to be searched | |
@param docIndex the Index of Documents that corresponds | |
@param terms the array of terms to be searched for | |
@param searchType the type of search to be done | |
@return a string listing of document names | |
*/ | |
public static String search(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String[] terms, String searchType) | |
{ | |
ArrayList <DocRank> docRanking = new ArrayList<DocRank>(); | |
// Do a Doc Ranking process for each of the terms in the array | |
for (int i=0; i<terms.length; i++) | |
{ | |
String term = preprocess(terms[i]); | |
// If the search type selected is wordfreq, do this search | |
if (searchType.equals("wordFreq")) | |
{ | |
docRanking = freqDocRank(termList, term, docRanking); | |
} | |
// Otherwise, do a word occurrence search | |
else if (searchType.equals("wordOcc")) | |
{ | |
docRanking = occDocRank(termList,term,docRanking); | |
} | |
else | |
{ | |
docRanking = weightedDocRank(termList, docIndex, term, docRanking); | |
} | |
} | |
// Sort the completed DocRanking array | |
Collections.sort(docRanking, rankSorter); | |
String result =""; | |
// Print out the various terms in descending order | |
for (int i=docRanking.size()-1; i>0; i--) | |
{ | |
result+= docRanking.get(i).getParentDoc() +"\n"; | |
} | |
return result; | |
} | |
public static void main(String[] args) | |
{ | |
HashMap <String, ArrayList<DocEntry>> termIndex = createTermIndex(); | |
HashMap <String, Integer> docIndex = createDocIndex(); | |
File f1 = new File("BBC_2012_5_1_NATO_Afghanistan.txt"); | |
File f2 = new File("BBC_2012_5_1_News_Corp.txt"); | |
File f3 = new File("ATIMES_2012_4_28_BRIC_Future.txt"); | |
File f4 = new File("ATIMES_2012_5_2_Afghanistan_peace.txt"); | |
File f5 = new File("ATIMES_2012_5_2_Uzbekistan_Angren.txt"); | |
File f6 = new File("BBC_2012_3_2_NATO_Afghanistan.txt"); | |
File f7 = new File("BBC_2012_5_1_US_Afghanistan.txt"); | |
File f8 = new File("Guardian_2012_5_1_Al_Qaida_Yemen.txt"); | |
File f9 = new File("Guardian_2012_5_1_Germany_Spain_austerity.txt"); | |
File f10 = new File("Guardian_2012_5_1_News_Corp.txt"); | |
File f11 = new File("Guardian_2012_5_1_stock_exchange_protest.txt"); | |
File f12 = new File("Independent_IE_2012_4_28_NAMA_Dalymount.txt"); | |
File f13 = new File("Independent_IE_2012_5_1_Referendum_threats.txt"); | |
File f14 = new File( "RTE_2009_9_29_OECD_unemployment.txt"); | |
File f15 = new File("RTE_2012_5_1_Rupert_Murdoch.txt"); | |
File f16 = new File("RTE_2012_5_1_FF_Referendum.txt"); | |
File[] documents = { f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16 }; | |
// File cranfield = new File("Cranfield"); | |
// File[] testDocs = cranfield.listFiles(); | |
addDocumentArray(termIndex, docIndex, documents); | |
// addDocumentArray(termIndex, docIndex, testDocs); | |
String[] searchTerm; | |
String searchType =""; | |
// If user wants to do a special search, identify it and remove term | |
if (args[0].equals("wordOcc") || args[0].equals("wordFreq")) | |
{ | |
if (args[0].equals("wordOcc")) | |
{ | |
searchType = "wordOcc"; | |
} | |
else | |
{ | |
searchType = "wordFreq"; | |
} | |
searchTerm = new String[args.length-1]; | |
for (int i = 0; i<args.length-1; i++) | |
{ | |
searchTerm[i]=args[i+1]; | |
} | |
} | |
else | |
{ | |
searchTerm=args; | |
} | |
System.out.println(search(termIndex, docIndex, searchTerm, searchType)); | |
} | |
} | |
/************************************** | |
DocEntry Class | |
- A class for storing information about term positions in documents | |
* @author - Dara McHugh * | |
* dara.mchugh22@student.computing.dcu.ie * | |
* * | |
* @version - 1.6 * | |
* @since - 2012 - 05 - 8 | |
*/ | |
import java.util.*; | |
public class DocEntry | |
{ | |
private String parentDoc; // Title String of parent document | |
private ArrayList <Integer> withinDocPos; // Int Array List of positions in doc where term appears | |
/** | |
Create a new DocEntry object | |
@param parentDoc the name of the origin document | |
@param withinDocPos the list of positions term occurs in | |
@return DocEntry object | |
*/ | |
public DocEntry(String parentDoc, ArrayList <Integer> withinDocPos) | |
{ | |
this.parentDoc = parentDoc; | |
this.withinDocPos = withinDocPos; | |
} | |
/** | |
Get name of origin document | |
@return Origin document name | |
*/ | |
public String getParentDoc() | |
{ | |
return this.parentDoc; | |
} | |
/** | |
Get the list of positions | |
@return list of positions | |
*/ | |
public ArrayList<Integer> getPositions() | |
{ | |
return this.withinDocPos; | |
} | |
/** | |
Add a new term position to list | |
@param pos the position to be added | |
*/ | |
public void addPosition(int pos) | |
{ | |
withinDocPos.add(pos); | |
} | |
/** | |
Get the term frequency for this document | |
@return size of positions list | |
*/ | |
public int getTermFrequency() | |
{ | |
return this.withinDocPos.size(); | |
} | |
} | |
/*********************************** | |
* DocRank class | |
* A class for storing information about | |
* ranking of documents | |
* @author - Dara McHugh * | |
* dara.mchugh22@student.computing.dcu.ie * | |
* * | |
* @version - 1.6 * | |
* @since - 2012 - 05 - 8 | |
***********************************/ | |
import java.util.*; | |
public class DocRank | |
{ | |
private String parentDoc; | |
private double wordScore; | |
/** | |
Create a new Doc Rank object | |
@param parentDoc name of origin document | |
@param wordScore value of ranking score | |
*/ | |
public DocRank(String parentDoc, double wordScore) | |
{ | |
this.parentDoc = parentDoc; | |
this.wordScore = wordScore; | |
} | |
/** | |
Create a new Doc Rank object | |
@param parentDoc name of origin document | |
*/ | |
public DocRank(String parentDoc) | |
{ | |
this.parentDoc = parentDoc; | |
this.wordScore = 0; | |
} | |
/** | |
Get the name of the origin document | |
@return name of origin document | |
*/ | |
public String getParentDoc() | |
{ | |
return parentDoc; | |
} | |
/** | |
Get the Ranking Score of Entry | |
@return Ranking Score | |
*/ | |
public double getWordScore() | |
{ | |
return wordScore; | |
} | |
/** | |
Increase the Ranking Score of Entry | |
@param increase the integer amount to increase by | |
*/ | |
public void increaseWordScore(int increase) | |
{ | |
double doubInc = (double)increase; | |
this.wordScore += doubInc; | |
} | |
/** | |
Increase the Ranking Score of Entry | |
@param increase the double amount to increase by | |
*/ | |
public void increaseWordScore(double increase) | |
{ | |
this.wordScore += increase; | |
} | |
/** | |
A comparable method. Deprecated | |
@param dr Doc Rank to compare with | |
@return 1 if current object is greater, 0 if equal, -1 if smaller | |
*/ | |
public int compareTo(DocRank dr) | |
{ | |
if (this.getWordScore()>dr.getWordScore()) | |
return 1; | |
else if (this.getWordScore()==dr.getWordScore()) | |
return 0; | |
else | |
return -1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment