@johnconroy
Created September 27, 2011 14:59
Term frequency - inverse document frequency (tf-idf) in Java
package tfidf;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
public class makeTfidfLongvectors {

    public static int numDocuments;

    public ArrayList loadUserDocs() throws IOException{
        //nb returns an arraylist; each element is a String array of size 2: [username, document]
        ArrayList userDocs = new ArrayList();
        // one document per line. format: [username**MARK**document content.....]
        String docPath = "/path/to/documentsFile";
        BufferedReader br = new BufferedReader(new FileReader(new File(docPath)));
        String line;
        String doc;
        String user;
        String[] userAndDoc;
        int countLine = 0;
        int parseErrs = 0;
        while ((line = br.readLine()) != null){
            try{
                //each line contains the user's name, then their document, separated by "**MARK**"
                userAndDoc = line.split("\\*\\*MARK\\*\\*");
                user = userAndDoc[0];
                doc = userAndDoc[1];
                if (doc.length() > 3){
                    userDocs.add(userAndDoc);
                }
                countLine++;
            }catch (Exception e){parseErrs++;}
        }
        System.out.println("Parse errors: " + parseErrs);
        System.out.println("Num lines: " + countLine);
        numDocuments = userDocs.size(); //static field, so no 'this.'
        System.out.println("num docs: " + numDocuments);
        return userDocs;
    }
    public HashMap loadVocabMap() throws IOException{
        //contains each unique word in the corpus, plus the number of documents it's found in.
        //format: [word frequency]
        //returned as a word:frequency map
        String vocabFilePath = "/path/to/docFreqs.data";
        HashMap<String,Integer> vocabCount = new HashMap();
        String line;
        BufferedReader br = new BufferedReader(new FileReader(new File(vocabFilePath)));
        String[] thisWordAndFreq;
        String key;
        Integer value;
        while ((line = br.readLine()) != null){
            thisWordAndFreq = line.split(" ");
            key = thisWordAndFreq[0];
            value = Integer.parseInt(thisWordAndFreq[1]);
            if (thisWordAndFreq[0].length() > 2){ //i.e. if a word is actually there and not whitespace etc.
                vocabCount.put(key, value);
            }
        }
        return vocabCount;
    }
    public static void main(String[] args) throws IOException{
        //nb the class is named makeTfidfLongvectors; 'make_tfidf_longvectors' would not compile
        makeTfidfLongvectors mtl = new makeTfidfLongvectors();
        HashMap vocabAndFreq = mtl.loadVocabMap();
        ArrayList vocabList = mtl.makeVocabList(); //fixes the order of vector positions
        System.out.println("vocab list size: " + vocabList.size());
        ArrayList documents = mtl.loadUserDocs(); //rem that each elem is [uname, doc]
        ArrayList<Double> initDocMatrix;
        ArrayList<Double> tfidfLongMatrix;
        for (int i = 0; i < documents.size(); i++){
            initDocMatrix = mtl.initialiseDocMatrix(vocabList);
            String[] thisDocList = (String[]) documents.get(i);
            String user = thisDocList[0];
            String userDoc = thisDocList[1];
            tfidfLongMatrix = makeTfidfMatrix(userDoc, vocabAndFreq, initDocMatrix, vocabList);
            mtl.writeLine(user, tfidfLongMatrix);
            if (i % 500 == 0){
                System.out.println(i + " of " + documents.size() + " written");
            }
        }
    }
    private void writeLine(String user, ArrayList<Double> tfidfLongMatrix) throws IOException {
        //appends one tf-idf weighted vector per line: username, then the scores
        String matrixFilePath = "/destinationFolder/tfidfVectors.data";
        FileWriter fw = new FileWriter(matrixFilePath, true);
        fw.write(user + " ");
        DecimalFormat fiveDForm = new DecimalFormat("#.#####"); //up to five decimal places
        Iterator iter = tfidfLongMatrix.iterator();
        while (iter.hasNext()){
            fw.write(fiveDForm.format(iter.next()) + " ");
        }
        fw.write("\n");
        fw.close();
    }
    private ArrayList makeVocabList() throws IOException{
        //as well as the vocab/frequency hashmap, we need an arraylist, which ensures the tf-idf scores land in the same order in every vector.
        String vocabFilePath = "C://datasets//twitter_data//sep11//forCossim//docFreqs_790-839.data";
        ArrayList vocab = new ArrayList();
        String line;
        BufferedReader br = new BufferedReader(new FileReader(new File(vocabFilePath)));
        String[] thisWordAndFreq;
        String word;
        while ((line = br.readLine()) != null){
            thisWordAndFreq = line.split(" ");
            word = thisWordAndFreq[0];
            if (thisWordAndFreq[0].length() > 2){ //i.e. if a word is actually there and not whitespace etc.
                vocab.add(word);
            }
        }
        return vocab;
    }
    private static ArrayList<Double> makeTfidfMatrix(String userDoc, HashMap vocabAndFreq, ArrayList<Double> docMatrix, ArrayList vocabList) {
        String[] docSplit = userDoc.split(" ");
        //find the unique set of words in this document
        Set<String> wordSet = new HashSet(Arrays.asList(docSplit));
        Iterator setIter = wordSet.iterator();
        int docLen = docSplit.length;
        int errs = 0;
        while (setIter.hasNext()){
            String word = (String) setIter.next();
            try{
                Double wordTfidfScore = getWordTfidf(word, docSplit, vocabAndFreq, docLen);
                //find the place of that word in the vocab
                int place = vocabList.indexOf(word);
                docMatrix.set(place, wordTfidfScore);
            }catch(Exception e){
                errs++; //i.e. word isn't in the vocab, e.g. it was a stop word
            }
        }
        return docMatrix;
    }
    private static Double getWordTfidf(String word, String[] docSplit, HashMap vocabAndFreq, int docLen) {
        double tf = getTf(word, docSplit, docLen);
        double idf = getIdf(word, (Integer) vocabAndFreq.get(word));
        return tf * idf;
    }

    private static double getIdf(String word, int numDocsContainingWord) {
        //idf = ln(total documents / documents containing the word)
        return Math.log((numDocuments * 1.0) / numDocsContainingWord);
    }

    private static double getTf(String word, String[] docSplit, int docLen) {
        //number of occurrences of this word in the document, normalised by document length
        int termFreq = 0;
        for (int k = 0; k < docSplit.length; k++){
            if (word.equals(docSplit[k])){ //'==' compares references and would undercount repeated words
                termFreq++;
            }
        }
        return termFreq / (float) docLen;
    }
    private ArrayList initialiseDocMatrix(ArrayList vocabList) {
        //set up an initial vector of the correct size (the size of the corpus vocab), comprised of zeros
        ArrayList initDocMatrix = new ArrayList();
        for (int i = 0; i < vocabList.size(); i++){
            initDocMatrix.add(0.0);
        }
        return initDocMatrix;
    }
}
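
For reference, a rough sketch of the three files this class touches, pieced together from the comments in the code above (the paths, usernames, words, and scores below are all made up):

/path/to/documentsFile -- one document per line, username and text separated by **MARK**:
alice**MARK**the quick brown fox jumps over the lazy dog
bob**MARK**the lazy dog naps all day

/path/to/docFreqs.data -- one word per line, followed by the number of documents containing it (only words of 3+ characters are kept):
lazy 2
quick 1
dog 2

/destinationFolder/tfidfVectors.data (output) -- username followed by one tf-idf score per vocab word, in vocab-list order, so mostly zeros:
alice 0 0.07702 0 ...
bob 0 0 0 ...

After editing the three hard-coded paths, the class can be compiled and run directly (javac, then java tfidf.makeTfidfLongvectors).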
@VamshiKrishnaGaddam

How do I execute this? Where would I place the file paths?

@roshanrathod

Function : getIdf(String word, int numDocsContainingWord)

Values passed : getIdf(word, (Integer)vocabAndFreq.get(word));

vocabAndFreq is the vocab and its frequency, e.g. test,11

For idf you need the document frequency. How will this code [ (Integer)vocabAndFreq.get(word) ] work? It gives you the number of occurrences/frequency. What if a word is repeated 5 times in one document? In that case your df would be incorrect if you use the code above.

Correct me if I'm wrong.
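
A minimal sketch of that distinction, with made-up numbers (per the comment in loadVocabMap(), the docFreqs file is supposed to hold the number of documents each word is found in, i.e. the document frequency, so the lookup is only wrong if the file actually holds raw occurrence counts):

//Hypothetical numbers only, to illustrate the df-vs-tf point above.
public class IdfCheck {
    public static void main(String[] args) {
        int numDocuments = 100;  //size of the corpus
        int collectionFreq = 11; //total occurrences of "test" across the corpus
        int docFreq = 3;         //documents containing "test" -- what idf needs

        System.out.println(Math.log(numDocuments * 1.0 / collectionFreq)); //~2.21: too low
        System.out.println(Math.log(numDocuments * 1.0 / docFreq));        //~3.51: the correct idf
    }
}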

@SubirSam

String vocabFilePath="C://datasets//twitter_data//sep11//forCossim//docFreqs_790-839.data"; //(within makeVocabList())
String vocabFilePath="/path/to/docFreqs.data"; //(within loadVocabMap())

What are these two paths for? Can you explain?

@SJ110

SJ110 commented Jul 10, 2017

Hello, I learned a lot from your code, but could you tell me what
String vocabFilePath="C://datasets//twitter_data//sep11//forCossim//docFreqs_790-839.data"; means? and
String matrixFilePath="/destinationFolder/tfidfVectors.data"; and
String vocabFilePath="/path/to/docFreqs.data";? I need your answer.

@karandhurve28

I'm getting an error on this line:
make_tfidf_longvectors mtl= new make_tfidf_longvectors();
Where do I find this class 'make_tfidf_longvectors', or how do I use it?

@mohammedriyasudheen-a

I can't run this code. Can you suggest a few tips?

@mohammedriyasudheen-a

(My code is the gist above with only the following lines changed:)

String docPath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\dataset2.txt"; //in loadUserDocs()

//if (doc.length()>3){
	//userDocs.add(userAndDoc);
//}

String vocabFilePath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\document.txt"; //in loadVocabMap()

makeTfidfLongvectors mtl= new makeTfidfLongvectors(); //in main()

String matrixFilePath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\tfidfVectors.txt"; //in writeLine()

String vocabFilePath="C:\\\\Users\\\\Kaz Innovations\\\\eclipse-workspace\\\\TFIDF Calculator\\\\documents\\\\document.txt"; //in makeVocabList()
See this: I have given the input like this. Is there any problem with how I'm giving the input? If so, please let me know.
