daramcq/RecDep.java

## RecDep.java
/***********************************************
* RecDep - Information Retrieval System        *
* @author - Dara McHugh                        *
* dara.mchugh22@student.computing.dcu.ie       *
*                                              *
* @version - 1.6                               *
* @since - 2012 - 05 - 8                       *
Screencast Available at:                       *
http://student.computing.dcu.ie/~mchugd22/java_ir.ogv
*                                              *
*   NB - additional DocRank and DocEntry classes
*    are included at bottom for reference      *
*                                              *
***********************************************/

import java.util.*;
import java.io.*;
import org.tartarus.snowball.ext.*;
import org.tartarus.snowball.*;


public class RecDep

{

    /**
        Create a new Term Index - empty
    */
    public static HashMap <String, ArrayList<DocEntry>> createTermIndex()
    {
        // Create a Hash Map to store input in format <Key = String, Value = ArrayList <DocEntry> >
        HashMap <String, ArrayList <DocEntry>> termIndex = new HashMap<String, ArrayList <DocEntry>>();

        return termIndex;
    }

    /**
     Create a Document Index - empty
     */
    public static HashMap <String, Integer> createDocIndex()
    {
        // Create a HashMap to store input in format Title (String), Wordcount (Integer)
        HashMap <String, Integer> docIndex = new HashMap <String, Integer>();
        return docIndex;
    }
    /**
    Preprocess terms - Convert to lowercase, remove punctuation and stem
        @param term term to be preprocessed
        @return term term after being processed
     */
    public static String preprocess(String term)
    {
        term = term.toLowerCase();
        term = term.replaceAll("\\p{Punct}", "");
       // term = stemTerm(term);

        return term;
    }

    /**
    Add a term into the TermList.
    @param termList the Index of terms to be added to
    @param term the term to be added
    @param parentDoc the name of the term origin document
    @param pos the position of the term within origin document

    */
    public static void addTerm(HashMap <String, ArrayList<DocEntry> >termList, String term, String parentDoc, int pos)
    {
        // Check if word is in the Term Index already
        if (termList.containsKey(term))
        {
            // Get the DocEntry ArrayList so it can be searched
            ArrayList <DocEntry> postingList = termList.get(term);

            // In each one, check if the title element of the DocEntry equals that we are given
            for (int i=0; i<postingList.size(); i++)
            {
                // If it does, add the position of the term to the WDP Array & Exit
                if (postingList.get(i).getParentDoc().equals(parentDoc))
                {
                    postingList.get(i).addPosition(pos);

                    // Put the modified DocEntry Array into the correct bucket
                    termList.put(term, postingList);
                    return;
                }
            }
            // If Document is not present in DocEntry ArrayList, create a new DocEntry
            ArrayList <Integer> positions = new ArrayList<Integer>();
            positions.add(pos);
            DocEntry newDoc = new DocEntry(parentDoc, positions);

            // Add this DocEntry to the PostingList
            postingList.add(newDoc);

            // Add modified PostingList to TermList and Exit
            termList.put(term,postingList);
            return;
         }


       else
       {
            // If it is not, create a DocEntry for this file (Title, Term Position)
            ArrayList <Integer> positions = new ArrayList<Integer>();
            positions.add(pos);
            DocEntry newDoc = new DocEntry(parentDoc, positions);

            // Create an array of DocEntries & Add first DocEntry
            ArrayList<DocEntry> postingList = new ArrayList<DocEntry>();
            postingList.add(newDoc);

            // Put this array as new value into TermList
            termList.put(term,postingList);
       }

    }

    /**
        Add a new document to the Term Index
        @param termList the index of terms
        @param docIndex the index of parent documents
        @param textFile the document to be added
    */
    public static void addDocument(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, File textFile)
    {
        try
        {
            // Start Scanning the document
            Scanner sc = new Scanner(textFile);
            int posCount = 0;
            String parentDoc = textFile.getName();

            while (sc.hasNext())
            {
                // Preprocess and add term to TermList
                String term = sc.next();
                term = preprocess(term);
                addTerm(termList, term, parentDoc, posCount);
                posCount++;
            }
            docIndex.put(parentDoc, posCount);

            // If DocIndex contains Total value, increment it
            if (docIndex.containsKey("total"))
            {
                int currentTotal = docIndex.get("total");
                int newTotal = currentTotal + posCount;
                docIndex.put("total",newTotal);
            }
            // Otherwise, create a new one and initialise at posCount
            else
                docIndex.put("total", posCount);


        }
        catch(Exception e)
        {
        }
    }
    /**
        Add an array of documents to the Term Index
        @param termList the index of terms
        @param docIndex the index of parent documents
        @param textFiles the array of documents to be added
    */
    public static void addDocumentArray(HashMap<String, ArrayList<DocEntry>> termList, HashMap<String, Integer> docIndex ,File[] textFiles)
    {

          for (int i=0; i<textFiles.length ; i++)
          {
            addDocument(termList, docIndex, textFiles[i]);
          }
    }

    public static String stemTerm(String term)
    {

        try
        {
        // This implementation of Porter2 (Snowball) is based on: http://stackoverflow.com/questions/4397107/is-there-a-java-implementation-of-porter2-stemmer

            String lang = "english";

            Class extClass = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer");
            SnowballStemmer stemmer = (SnowballStemmer) extClass.newInstance();
            stemmer.setCurrent(term);
            stemmer.stem();
            term = stemmer.getCurrent();

        }

        catch (Exception e)
        {
           String s = e.getMessage();
           if (s != null)
           {
               System.out.println(s);
           }
            System.err.print("F3x3rd");
        }
        return term;
    }

    /**
        Modify a DocRanking List based on word frequency search of Posting List for a single term
        @param termList  the index of terms to be checked
        @param term  the term to be checked
        @param docRanking the Ranking List to be modified
        @return ArrayList<DocRank> modified Array List
    */
    public static ArrayList <DocRank> freqDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking)
    {
        // If the term isn't found return negative message
        if (!termList.containsKey(term))
            System.out.println(term+ " not found");

        else
        {
            ArrayList <DocEntry> postingList = termList.get(term);
            boolean contains = false;

            // We will iterate through all the Doc Entries of the Posting List
            for (int i = 0; i < postingList.size(); i++)
            {
                int j=0;

                // We will check if each Doc Entry already exists in Doc Ranking ArrayList
                while (!contains && j<docRanking.size())
                {
                    // Checking based on name
                    if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
                            contains=true;
                     else
                        j++;
                }
                // If DocRanking list contains corresponding document, increase its Word Score
                if (contains)
                {
                    int increase = postingList.get(i).getTermFrequency();
                    docRanking.get(j).increaseWordScore(increase);
                }
                // Otherwise, create a new Doc Rank object
                else
                {
                    // Get the name of the Parent Document and term frequency
                    String parentDoc = postingList.get(i).getParentDoc();
                    int wdf = postingList.get(i).getTermFrequency();

                    // Create a new DocRank object with these variables & add to ArrayList
                    DocRank dr = new DocRank(parentDoc, wdf);
                    docRanking.add(i, dr);
                }


            }

       }
         return docRanking;
    }

     /**
        Modify a DocRanking List based on occurrence search of Posting List for a single term
        @param termList  the index of terms to be checked
        @param term  the term to be checked
        @param docRanking the Ranking List to be modified
        @return ArrayList<DocRank> modified Array List
    */
    public static ArrayList <DocRank> occDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking)
    {
        // If the term isn't found return negative message
        if (!termList.containsKey(term))
            System.out.println(term+ " not found");

        else
        {
            ArrayList <DocEntry> postingList = termList.get(term);
            boolean contains = false;

            // We will iterate through all the Doc Entries of the Posting List
            for (int i = 0; i < postingList.size(); i++)
            {
                int j=0;

                // We will check if each Doc Entry already exists in Doc Ranking ArrayList
                while (!contains && j<docRanking.size())
                {
                    // Checking based on name
                    if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
                            contains=true;
                     else
                        j++;
                }
                // If DocRanking list contains corresponding document, increase its Word Score
                if (contains)
                {
                    docRanking.get(j).increaseWordScore(1);
                }
                // Otherwise, create a new Doc Rank object
                else
                {
                    // Get the name of the Parent Document and term frequency
                    String parentDoc = postingList.get(i).getParentDoc();

                    // Create a new DocRank object with these variables & add to ArrayList
                    DocRank dr = new DocRank(parentDoc, 1);
                    docRanking.add(i, dr);
                }


            }

       }
         return docRanking;
    }

      /**
        Modify a DocRanking List based on weighted search of Posting List for a single term
        @param termList  the index of terms to be checked
        @param term  the term to be checked
        @param docRanking the Ranking List to be modified
        @return ArrayList<DocRank> modified Array List
    */
    public static ArrayList <DocRank> weightedDocRank(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String term, ArrayList<DocRank> docRanking)
    {
        // If the term isn't found return negative message
        if (!termList.containsKey(term))
            System.out.println(term+ " not found");

        else
        {
            ArrayList <DocEntry> postingList = termList.get(term);
            boolean contains = false;

            // We will iterate through all the Doc Entries of the Posting List
            for (int i = 0; i < postingList.size(); i++)
            {
                int j=0;

                // We will check if each Doc Entry already exists in Doc Ranking ArrayList
                while (!contains && j<docRanking.size())
                {
                    // Checking based on name
                    if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
                            contains=true;
                     else
                        j++;
                }
                // If DocRanking list contains corresponding document, increase its Word Score
                if (contains)
                {
                    // Get the Collection Frequency Weight of the term
                    // CFW(t) = log(N/n)
                    double totalDocs;
                    totalDocs= docIndex.size();
                    double postingSize;
                    postingSize = postingList.size();
                    double prelogWeight = totalDocs/postingSize;
                    double weight = Math.log(prelogWeight);

                    // Get the frequency of the term, divide by normalised document length
                    int wdf = postingList.get(i).getTermFrequency();

                    double avgLength;
                    avgLength = docIndex.get("total")/(docIndex.size()-1);
                    String title = docRanking.get(j).getParentDoc();
                    int wordLength = docIndex.get(title);
                    double ndl = wordLength/avgLength;

                    double termWeight = (wdf/ndl)*weight;

                    docRanking.get(j).increaseWordScore(termWeight);
                }
                // Otherwise, create a new Doc Rank object
                else
                {
                    // Get the Collection Frequency Weight of the term
                    // CFW(t) = log(N/n) i.e. Total Number of docs / Size of Posting List
                    double totalDocs;
                    totalDocs= docIndex.size();
                    double postingSize;
                    postingSize = postingList.size();
                    double prelogWeight = totalDocs/postingSize;
                    double weight = Math.log(prelogWeight);

                    // Get the name of the Parent Document and term frequency
                    String parentDoc = postingList.get(i).getParentDoc();
                    int wdf = postingList.get(i).getTermFrequency();

                    // Calculate the average length of documents in Doc Index
                    double avgLength;
                    avgLength = docIndex.get("total")/(docIndex.size()-1);

                    int wordLength = docIndex.get(parentDoc);
                    double ndl = wordLength/avgLength;

                    // Weight of this term's occurence = (freq/normalised doc length) * weight of this term
                    double termWeight = (wdf/ndl)*weight;

                    // Create a new DocRank object with these variables & add to ArrayList
                    DocRank dr = new DocRank(parentDoc, termWeight);
                    docRanking.add(i, dr);
                }


            }

       }
         return docRanking;
    }

    /** A Comparator for DocRank objects.
    This is adapted from a similar method found here:
     http://thejavablog.wordpress.com/2010/02/16/how-to-sort-a-list-of-objects/
     @param dr1 ranking to be compared
     @param dr2 ranking to be compared
     @return compare result of comparison
     */
    private static Comparator<DocRank> rankSorter = new Comparator<DocRank>()
    {
        public int compare(DocRank dr1, DocRank dr2)
        {
            double result = dr1.getWordScore()- dr2.getWordScore();

            if (result>0)
                return 1;
            else if (result==0)
                return 0;
            else
                return -1;
        }

    };


   /**
    Get Doc Ranking for array of terms
    @param termList the Term Index to be searched
    @param docIndex the Index of Documents that corresponds
    @param terms the array of terms to be searched for
    @param searchType the type of search to be done
    @return a string listing of document names
   */
    public static String search(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String[] terms, String searchType)
    {

           ArrayList <DocRank> docRanking = new ArrayList<DocRank>();

           // Do a Doc Ranking process for each of the terms in the array
           for (int i=0; i<terms.length; i++)
           {
                String term = preprocess(terms[i]);

                // If the search type selected is wordfreq, do this search
                if (searchType.equals("wordFreq"))
                {
                    docRanking = freqDocRank(termList, term, docRanking);
                }
                // Otherwise, do a word occurrence search
                else if (searchType.equals("wordOcc"))
                {
                    docRanking = occDocRank(termList,term,docRanking);
                }
                else
                {
                    docRanking = weightedDocRank(termList, docIndex, term, docRanking);
                }
           }

           // Sort the completed DocRanking array
           Collections.sort(docRanking, rankSorter);

           String result ="";

           // Print out the various terms in descending order
           for (int i=docRanking.size()-1; i>0; i--)
           {
                result+= docRanking.get(i).getParentDoc() +"\n";
           }

           return result;

    }

    public static void main(String[] args)
    {
        HashMap <String, ArrayList<DocEntry>> termIndex = createTermIndex();
        HashMap <String, Integer> docIndex = createDocIndex();

        File f1 = new File("BBC_2012_5_1_NATO_Afghanistan.txt");
        File f2 = new File("BBC_2012_5_1_News_Corp.txt");
        File f3 = new File("ATIMES_2012_4_28_BRIC_Future.txt");
        File f4 = new File("ATIMES_2012_5_2_Afghanistan_peace.txt");
        File f5 = new File("ATIMES_2012_5_2_Uzbekistan_Angren.txt");
        File f6 = new File("BBC_2012_3_2_NATO_Afghanistan.txt");
        File f7 = new File("BBC_2012_5_1_US_Afghanistan.txt");
        File f8 = new File("Guardian_2012_5_1_Al_Qaida_Yemen.txt");
        File f9 = new File("Guardian_2012_5_1_Germany_Spain_austerity.txt");
        File f10 = new File("Guardian_2012_5_1_News_Corp.txt");
        File f11 = new File("Guardian_2012_5_1_stock_exchange_protest.txt");
        File f12 = new File("Independent_IE_2012_4_28_NAMA_Dalymount.txt");
        File f13 = new File("Independent_IE_2012_5_1_Referendum_threats.txt");
        File f14 = new File( "RTE_2009_9_29_OECD_unemployment.txt");
        File f15 = new File("RTE_2012_5_1_Rupert_Murdoch.txt");
        File f16 = new File("RTE_2012_5_1_FF_Referendum.txt");
        File[] documents = { f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16 };

  //      File cranfield = new File("Cranfield");
  //      File[] testDocs = cranfield.listFiles();

        addDocumentArray(termIndex, docIndex, documents);


  //      addDocumentArray(termIndex, docIndex, testDocs);

        String[] searchTerm;
        String searchType ="";

        // If user wants to do a special search, identify it and remove term
        if (args[0].equals("wordOcc") || args[0].equals("wordFreq"))
        {
            if (args[0].equals("wordOcc"))
            {
                searchType = "wordOcc";
            }
            else
            {
                searchType = "wordFreq";
            }

            searchTerm = new String[args.length-1];
            for (int i = 0; i<args.length-1; i++)
            {
                searchTerm[i]=args[i+1];
            }
        }

        else
        {
            searchTerm=args;
        }

       System.out.println(search(termIndex, docIndex, searchTerm, searchType));

    }
}

/**************************************
    DocEntry Class
    - A class for storing information about term positions in documents
* @author - Dara McHugh                        *
* dara.mchugh22@student.computing.dcu.ie       *
*                                              *
* @version - 1.6                               *
* @since - 2012 - 05 - 8
*/


import java.util.*;

public class DocEntry
{
    private String parentDoc;       // Title String of parent document
    private ArrayList <Integer> withinDocPos;     // Int Array List of positions in doc where term appears

    /**
    Create a new DocEntry object
        @param parentDoc the name of the origin document
        @param withinDocPos the list of positions term occurs in
        @return DocEntry object
    */
    public DocEntry(String parentDoc, ArrayList <Integer> withinDocPos)
    {
        this.parentDoc = parentDoc;
        this.withinDocPos = withinDocPos;
    }
    /**
        Get name of origin document
        @return Origin document name
    */
    public String getParentDoc()
    {
        return this.parentDoc;
    }
    /**
    Get the list of positions
        @return list of positions
    */
    public ArrayList<Integer> getPositions()
    {
        return this.withinDocPos;
    }
    /**
        Add a new term position to list
        @param pos the position to be added
    */
    public void addPosition(int pos)
    {
        withinDocPos.add(pos);
    }
    /**
        Get the term frequency for this document
        @return size of positions list
    */
    public int getTermFrequency()
    {
        return this.withinDocPos.size();
    }

 }
 /***********************************
* DocRank class
* A class for storing information about
* ranking of documents
* @author - Dara McHugh                        *
* dara.mchugh22@student.computing.dcu.ie       *
*                                              *
* @version - 1.6                               *
* @since - 2012 - 05 - 8
***********************************/

import java.util.*;

public class DocRank
{
    private String parentDoc;
    private double wordScore;

    /**
        Create a new Doc Rank object
        @param parentDoc name of origin document
        @param wordScore value of ranking score
    */
    public DocRank(String parentDoc, double wordScore)
    {
        this.parentDoc = parentDoc;
        this.wordScore = wordScore;
    }
    /**
        Create a new Doc Rank object
        @param parentDoc name of origin document
    */
    public DocRank(String parentDoc)
    {
        this.parentDoc = parentDoc;
        this.wordScore = 0;
    }
    /**
        Get the name of the origin document
        @return name of origin document
    */
    public String getParentDoc()
    {
        return parentDoc;
    }
    /**
        Get the Ranking Score of Entry
        @return Ranking Score
    */
    public double getWordScore()
    {
        return wordScore;
    }
    /**
        Increase the Ranking Score of Entry
        @param increase the integer amount to increase by

    */
    public void increaseWordScore(int increase)
    {
        double doubInc = (double)increase;
        this.wordScore += doubInc;
    }
     /**
        Increase the Ranking Score of Entry
        @param increase the double amount to increase by

    */
    public void increaseWordScore(double increase)
    {
        this.wordScore += increase;
    }
    /**
        A comparable method. Deprecated
        @param dr Doc Rank to compare with
        @return 1 if current object is greater, 0 if equal, -1 if smaller
    */
    public int compareTo(DocRank dr)
    {
        if (this.getWordScore()>dr.getWordScore())
            return 1;
        else if (this.getWordScore()==dr.getWordScore())
            return 0;
        else
            return -1;
    }

}
	/***********************************************
	* RecDep - Information Retrieval System *
	* @author - Dara McHugh *
	* dara.mchugh22@student.computing.dcu.ie *
	* *
	* @version - 1.6 *
	* @since - 2012 - 05 - 8 *
	Screencast Available at: *
	http://student.computing.dcu.ie/~mchugd22/java_ir.ogv
	* *
	* NB - additional DocRank and DocEntry classes
	* are included at bottom for reference *
	* *
	***********************************************/

	import java.util.*;
	import java.io.*;
	import org.tartarus.snowball.ext.*;
	import org.tartarus.snowball.*;


	public class RecDep

	{

	/**
	Create a new Term Index - empty
	*/
	public static HashMap <String, ArrayList<DocEntry>> createTermIndex()
	{
	// Create a Hash Map to store input in format <Key = String, Value = ArrayList <DocEntry> >
	HashMap <String, ArrayList <DocEntry>> termIndex = new HashMap<String, ArrayList <DocEntry>>();

	return termIndex;
	}

	/**
	Create a Document Index - empty
	*/
	public static HashMap <String, Integer> createDocIndex()
	{
	// Create a HashMap to store input in format Title (String), Wordcount (Integer)
	HashMap <String, Integer> docIndex = new HashMap <String, Integer>();
	return docIndex;
	}
	/**
	Preprocess terms - Convert to lowercase, remove punctuation and stem
	@param term term to be preprocessed
	@return term term after being processed
	*/
	public static String preprocess(String term)
	{
	term = term.toLowerCase();
	term = term.replaceAll("\\p{Punct}", "");
	// term = stemTerm(term);

	return term;
	}

	/**
	Add a term into the TermList.
	@param termList the Index of terms to be added to
	@param term the term to be added
	@param parentDoc the name of the term origin document
	@param pos the position of the term within origin document

	*/
	public static void addTerm(HashMap <String, ArrayList<DocEntry> >termList, String term, String parentDoc, int pos)
	{
	// Check if word is in the Term Index already
	if (termList.containsKey(term))
	{
	// Get the DocEntry ArrayList so it can be searched
	ArrayList <DocEntry> postingList = termList.get(term);

	// In each one, check if the title element of the DocEntry equals that we are given
	for (int i=0; i<postingList.size(); i++)
	{
	// If it does, add the position of the term to the WDP Array & Exit
	if (postingList.get(i).getParentDoc().equals(parentDoc))
	{
	postingList.get(i).addPosition(pos);

	// Put the modified DocEntry Array into the correct bucket
	termList.put(term, postingList);
	return;
	}
	}
	// If Document is not present in DocEntry ArrayList, create a new DocEntry
	ArrayList <Integer> positions = new ArrayList<Integer>();
	positions.add(pos);
	DocEntry newDoc = new DocEntry(parentDoc, positions);

	// Add this DocEntry to the PostingList
	postingList.add(newDoc);

	// Add modified PostingList to TermList and Exit
	termList.put(term,postingList);
	return;
	}


	else
	{
	// If it is not, create a DocEntry for this file (Title, Term Position)
	ArrayList <Integer> positions = new ArrayList<Integer>();
	positions.add(pos);
	DocEntry newDoc = new DocEntry(parentDoc, positions);

	// Create an array of DocEntries & Add first DocEntry
	ArrayList<DocEntry> postingList = new ArrayList<DocEntry>();
	postingList.add(newDoc);

	// Put this array as new value into TermList
	termList.put(term,postingList);
	}

	}

	/**
	Add a new document to the Term Index
	@param termList the index of terms
	@param docIndex the index of parent documents
	@param textFile the document to be added
	*/
	public static void addDocument(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, File textFile)
	{
	try
	{
	// Start Scanning the document
	Scanner sc = new Scanner(textFile);
	int posCount = 0;
	String parentDoc = textFile.getName();

	while (sc.hasNext())
	{
	// Preprocess and add term to TermList
	String term = sc.next();
	term = preprocess(term);
	addTerm(termList, term, parentDoc, posCount);
	posCount++;
	}
	docIndex.put(parentDoc, posCount);

	// If DocIndex contains Total value, increment it
	if (docIndex.containsKey("total"))
	{
	int currentTotal = docIndex.get("total");
	int newTotal = currentTotal + posCount;
	docIndex.put("total",newTotal);
	}
	// Otherwise, create a new one and initialise at posCount
	else
	docIndex.put("total", posCount);


	}
	catch(Exception e)
	{
	}
	}
	/**
	Add an array of documents to the Term Index
	@param termList the index of terms
	@param docIndex the index of parent documents
	@param textFiles the array of documents to be added
	*/
	public static void addDocumentArray(HashMap<String, ArrayList<DocEntry>> termList, HashMap<String, Integer> docIndex ,File[] textFiles)
	{

	for (int i=0; i<textFiles.length ; i++)
	{
	addDocument(termList, docIndex, textFiles[i]);
	}
	}

	public static String stemTerm(String term)
	{

	try
	{
	// This implementation of Porter2 (Snowball) is based on: http://stackoverflow.com/questions/4397107/is-there-a-java-implementation-of-porter2-stemmer

	String lang = "english";

	Class extClass = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer");
	SnowballStemmer stemmer = (SnowballStemmer) extClass.newInstance();
	stemmer.setCurrent(term);
	stemmer.stem();
	term = stemmer.getCurrent();

	}

	catch (Exception e)
	{
	String s = e.getMessage();
	if (s != null)
	{
	System.out.println(s);
	}
	System.err.print("F3x3rd");
	}
	return term;
	}

	/**
	Modify a DocRanking List based on word frequency search of Posting List for a single term
	@param termList the index of terms to be checked
	@param term the term to be checked
	@param docRanking the Ranking List to be modified
	@return ArrayList<DocRank> modified Array List
	*/
	public static ArrayList <DocRank> freqDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking)
	{
	// If the term isn't found return negative message
	if (!termList.containsKey(term))
	System.out.println(term+ " not found");

	else
	{
	ArrayList <DocEntry> postingList = termList.get(term);
	boolean contains = false;

	// We will iterate through all the Doc Entries of the Posting List
	for (int i = 0; i < postingList.size(); i++)
	{
	int j=0;

	// We will check if each Doc Entry already exists in Doc Ranking ArrayList
	while (!contains && j<docRanking.size())
	{
	// Checking based on name
	if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
	contains=true;
	else
	j++;
	}
	// If DocRanking list contains corresponding document, increase its Word Score
	if (contains)
	{
	int increase = postingList.get(i).getTermFrequency();
	docRanking.get(j).increaseWordScore(increase);
	}
	// Otherwise, create a new Doc Rank object
	else
	{
	// Get the name of the Parent Document and term frequency
	String parentDoc = postingList.get(i).getParentDoc();
	int wdf = postingList.get(i).getTermFrequency();

	// Create a new DocRank object with these variables & add to ArrayList
	DocRank dr = new DocRank(parentDoc, wdf);
	docRanking.add(i, dr);
	}


	}

	}
	return docRanking;
	}

	/**
	Modify a DocRanking List based on occurrence search of Posting List for a single term
	@param termList the index of terms to be checked
	@param term the term to be checked
	@param docRanking the Ranking List to be modified
	@return ArrayList<DocRank> modified Array List
	*/
	public static ArrayList <DocRank> occDocRank(HashMap<String, ArrayList<DocEntry>> termList, String term, ArrayList<DocRank> docRanking)
	{
	// If the term isn't found return negative message
	if (!termList.containsKey(term))
	System.out.println(term+ " not found");

	else
	{
	ArrayList <DocEntry> postingList = termList.get(term);
	boolean contains = false;

	// We will iterate through all the Doc Entries of the Posting List
	for (int i = 0; i < postingList.size(); i++)
	{
	int j=0;

	// We will check if each Doc Entry already exists in Doc Ranking ArrayList
	while (!contains && j<docRanking.size())
	{
	// Checking based on name
	if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
	contains=true;
	else
	j++;
	}
	// If DocRanking list contains corresponding document, increase its Word Score
	if (contains)
	{
	docRanking.get(j).increaseWordScore(1);
	}
	// Otherwise, create a new Doc Rank object
	else
	{
	// Get the name of the Parent Document and term frequency
	String parentDoc = postingList.get(i).getParentDoc();

	// Create a new DocRank object with these variables & add to ArrayList
	DocRank dr = new DocRank(parentDoc, 1);
	docRanking.add(i, dr);
	}


	}

	}
	return docRanking;
	}

	/**
	Modify a DocRanking List based on weighted search of Posting List for a single term
	@param termList the index of terms to be checked
	@param term the term to be checked
	@param docRanking the Ranking List to be modified
	@return ArrayList<DocRank> modified Array List
	*/
	public static ArrayList <DocRank> weightedDocRank(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String term, ArrayList<DocRank> docRanking)
	{
	// If the term isn't found return negative message
	if (!termList.containsKey(term))
	System.out.println(term+ " not found");

	else
	{
	ArrayList <DocEntry> postingList = termList.get(term);
	boolean contains = false;

	// We will iterate through all the Doc Entries of the Posting List
	for (int i = 0; i < postingList.size(); i++)
	{
	int j=0;

	// We will check if each Doc Entry already exists in Doc Ranking ArrayList
	while (!contains && j<docRanking.size())
	{
	// Checking based on name
	if (postingList.get(i).getParentDoc().equalsIgnoreCase(docRanking.get(j).getParentDoc()))
	contains=true;
	else
	j++;
	}
	// If DocRanking list contains corresponding document, increase its Word Score
	if (contains)
	{
	// Get the Collection Frequency Weight of the term
	// CFW(t) = log(N/n)
	double totalDocs;
	totalDocs= docIndex.size();
	double postingSize;
	postingSize = postingList.size();
	double prelogWeight = totalDocs/postingSize;
	double weight = Math.log(prelogWeight);

	// Get the frequency of the term, divide by normalised document length
	int wdf = postingList.get(i).getTermFrequency();

	double avgLength;
	avgLength = docIndex.get("total")/(docIndex.size()-1);
	String title = docRanking.get(j).getParentDoc();
	int wordLength = docIndex.get(title);
	double ndl = wordLength/avgLength;

	double termWeight = (wdf/ndl)*weight;

	docRanking.get(j).increaseWordScore(termWeight);
	}
	// Otherwise, create a new Doc Rank object
	else
	{
	// Get the Collection Frequency Weight of the term
	// CFW(t) = log(N/n) i.e. Total Number of docs / Size of Posting List
	double totalDocs;
	totalDocs= docIndex.size();
	double postingSize;
	postingSize = postingList.size();
	double prelogWeight = totalDocs/postingSize;
	double weight = Math.log(prelogWeight);

	// Get the name of the Parent Document and term frequency
	String parentDoc = postingList.get(i).getParentDoc();
	int wdf = postingList.get(i).getTermFrequency();

	// Calculate the average length of documents in Doc Index
	double avgLength;
	avgLength = docIndex.get("total")/(docIndex.size()-1);

	int wordLength = docIndex.get(parentDoc);
	double ndl = wordLength/avgLength;

	// Weight of this term's occurence = (freq/normalised doc length) * weight of this term
	double termWeight = (wdf/ndl)*weight;

	// Create a new DocRank object with these variables & add to ArrayList
	DocRank dr = new DocRank(parentDoc, termWeight);
	docRanking.add(i, dr);
	}


	}

	}
	return docRanking;
	}

	/** A Comparator for DocRank objects.
	This is adapted from a similar method found here:
	http://thejavablog.wordpress.com/2010/02/16/how-to-sort-a-list-of-objects/
	@param dr1 ranking to be compared
	@param dr2 ranking to be compared
	@return compare result of comparison
	*/
	private static Comparator<DocRank> rankSorter = new Comparator<DocRank>()
	{
	public int compare(DocRank dr1, DocRank dr2)
	{
	double result = dr1.getWordScore()- dr2.getWordScore();

	if (result>0)
	return 1;
	else if (result==0)
	return 0;
	else
	return -1;
	}

	};


	/**
	Get Doc Ranking for array of terms
	@param termList the Term Index to be searched
	@param docIndex the Index of Documents that corresponds
	@param terms the array of terms to be searched for
	@param searchType the type of search to be done
	@return a string listing of document names
	*/
	public static String search(HashMap<String, ArrayList<DocEntry>> termList, HashMap <String, Integer> docIndex, String[] terms, String searchType)
	{

	ArrayList <DocRank> docRanking = new ArrayList<DocRank>();

	// Do a Doc Ranking process for each of the terms in the array
	for (int i=0; i<terms.length; i++)
	{
	String term = preprocess(terms[i]);

	// If the search type selected is wordfreq, do this search
	if (searchType.equals("wordFreq"))
	{
	docRanking = freqDocRank(termList, term, docRanking);
	}
	// Otherwise, do a word occurrence search
	else if (searchType.equals("wordOcc"))
	{
	docRanking = occDocRank(termList,term,docRanking);
	}
	else
	{
	docRanking = weightedDocRank(termList, docIndex, term, docRanking);
	}
	}

	// Sort the completed DocRanking array
	Collections.sort(docRanking, rankSorter);

	String result ="";

	// Print out the various terms in descending order
	for (int i=docRanking.size()-1; i>0; i--)
	{
	result+= docRanking.get(i).getParentDoc() +"\n";
	}

	return result;

	}

	public static void main(String[] args)
	{
	HashMap <String, ArrayList<DocEntry>> termIndex = createTermIndex();
	HashMap <String, Integer> docIndex = createDocIndex();

	File f1 = new File("BBC_2012_5_1_NATO_Afghanistan.txt");
	File f2 = new File("BBC_2012_5_1_News_Corp.txt");
	File f3 = new File("ATIMES_2012_4_28_BRIC_Future.txt");
	File f4 = new File("ATIMES_2012_5_2_Afghanistan_peace.txt");
	File f5 = new File("ATIMES_2012_5_2_Uzbekistan_Angren.txt");
	File f6 = new File("BBC_2012_3_2_NATO_Afghanistan.txt");
	File f7 = new File("BBC_2012_5_1_US_Afghanistan.txt");
	File f8 = new File("Guardian_2012_5_1_Al_Qaida_Yemen.txt");
	File f9 = new File("Guardian_2012_5_1_Germany_Spain_austerity.txt");
	File f10 = new File("Guardian_2012_5_1_News_Corp.txt");
	File f11 = new File("Guardian_2012_5_1_stock_exchange_protest.txt");
	File f12 = new File("Independent_IE_2012_4_28_NAMA_Dalymount.txt");
	File f13 = new File("Independent_IE_2012_5_1_Referendum_threats.txt");
	File f14 = new File( "RTE_2009_9_29_OECD_unemployment.txt");
	File f15 = new File("RTE_2012_5_1_Rupert_Murdoch.txt");
	File f16 = new File("RTE_2012_5_1_FF_Referendum.txt");
	File[] documents = { f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16 };

	// File cranfield = new File("Cranfield");
	// File[] testDocs = cranfield.listFiles();

	addDocumentArray(termIndex, docIndex, documents);


	// addDocumentArray(termIndex, docIndex, testDocs);

	String[] searchTerm;
	String searchType ="";

	// If user wants to do a special search, identify it and remove term
	if (args[0].equals("wordOcc") \|\| args[0].equals("wordFreq"))
	{
	if (args[0].equals("wordOcc"))
	{
	searchType = "wordOcc";
	}
	else
	{
	searchType = "wordFreq";
	}

	searchTerm = new String[args.length-1];
	for (int i = 0; i<args.length-1; i++)
	{
	searchTerm[i]=args[i+1];
	}
	}

	else
	{
	searchTerm=args;
	}

	System.out.println(search(termIndex, docIndex, searchTerm, searchType));

	}
	}

	/**************************************
	DocEntry Class
	- A class for storing information about term positions in documents
	* @author - Dara McHugh *
	* dara.mchugh22@student.computing.dcu.ie *
	* *
	* @version - 1.6 *
	* @since - 2012 - 05 - 8
	*/


	import java.util.*;

	public class DocEntry
	{
	private String parentDoc; // Title String of parent document
	private ArrayList <Integer> withinDocPos; // Int Array List of positions in doc where term appears

	/**
	Create a new DocEntry object
	@param parentDoc the name of the origin document
	@param withinDocPos the list of positions term occurs in
	@return DocEntry object
	*/
	public DocEntry(String parentDoc, ArrayList <Integer> withinDocPos)
	{
	this.parentDoc = parentDoc;
	this.withinDocPos = withinDocPos;
	}
	/**
	Get name of origin document
	@return Origin document name
	*/
	public String getParentDoc()
	{
	return this.parentDoc;
	}
	/**
	Get the list of positions
	@return list of positions
	*/
	public ArrayList<Integer> getPositions()
	{
	return this.withinDocPos;
	}
	/**
	Add a new term position to list
	@param pos the position to be added
	*/
	public void addPosition(int pos)
	{
	withinDocPos.add(pos);
	}
	/**
	Get the term frequency for this document
	@return size of positions list
	*/
	public int getTermFrequency()
	{
	return this.withinDocPos.size();
	}

	}
	/***********************************
	* DocRank class
	* A class for storing information about
	* ranking of documents
	* @author - Dara McHugh *
	* dara.mchugh22@student.computing.dcu.ie *
	* *
	* @version - 1.6 *
	* @since - 2012 - 05 - 8
	***********************************/

	import java.util.*;

	public class DocRank
	{
	private String parentDoc;
	private double wordScore;

	/**
	Create a new Doc Rank object
	@param parentDoc name of origin document
	@param wordScore value of ranking score
	*/
	public DocRank(String parentDoc, double wordScore)
	{
	this.parentDoc = parentDoc;
	this.wordScore = wordScore;
	}
	/**
	Create a new Doc Rank object
	@param parentDoc name of origin document
	*/
	public DocRank(String parentDoc)
	{
	this.parentDoc = parentDoc;
	this.wordScore = 0;
	}
	/**
	Get the name of the origin document
	@return name of origin document
	*/
	public String getParentDoc()
	{
	return parentDoc;
	}
	/**
	Get the Ranking Score of Entry
	@return Ranking Score
	*/
	public double getWordScore()
	{
	return wordScore;
	}
	/**
	Increase the Ranking Score of Entry
	@param increase the integer amount to increase by

	*/
	public void increaseWordScore(int increase)
	{
	double doubInc = (double)increase;
	this.wordScore += doubInc;
	}
	/**
	Increase the Ranking Score of Entry
	@param increase the double amount to increase by

	*/
	public void increaseWordScore(double increase)
	{
	this.wordScore += increase;
	}
	/**
	A comparable method. Deprecated
	@param dr Doc Rank to compare with
	@return 1 if current object is greater, 0 if equal, -1 if smaller
	*/
	public int compareTo(DocRank dr)
	{
	if (this.getWordScore()>dr.getWordScore())
	return 1;
	else if (this.getWordScore()==dr.getWordScore())
	return 0;
	else
	return -1;
	}

	}