Skip to content

Instantly share code, notes, and snippets.

@celleychen
Forked from guenodz/TFIDFCalculator.java
Created August 10, 2020 06:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save celleychen/89bf83e91ca68268789d9e02acde3dbb to your computer and use it in GitHub Desktop.
Save celleychen/89bf83e91ca68268789d9e02acde3dbb to your computer and use it in GitHub Desktop.
A simple implementation of the TF-IDF algorithm in Java.
package com.guendouz.textclustering.preprocessing;
import java.util.Arrays;
import java.util.List;
/**
* @author Mohamed Guendouz
*/
public class TFIDFCalculator {

    /**
     * Computes the term frequency of {@code term} in a single document: the number of
     * words equal to {@code term} (ignoring case) divided by the document length.
     *
     * @param doc  the document as a list of words; an empty document yields {@code 0.0}
     * @param term the term to count; must not be {@code null}
     * @return the relative frequency of {@code term} in {@code doc}, in the range [0, 1]
     */
    public double tf(List<String> doc, String term) {
        // Guard against 0.0 / 0, which would otherwise produce NaN.
        if (doc.isEmpty()) {
            return 0.0;
        }
        double occurrences = 0;
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                occurrences++;
            }
        }
        return occurrences / doc.size();
    }

    /**
     * Computes the inverse document frequency of {@code term} over a corpus:
     * {@code ln(N / n)}, where {@code N} is the number of documents and {@code n} is the
     * number of documents containing {@code term} (ignoring case).
     *
     * @param docs the corpus, each document being a list of words
     * @param term the term to look up; must not be {@code null}
     * @return the natural-log inverse document frequency, or {@code 0.0} when no document
     *         contains the term (including when the corpus is empty)
     */
    public double idf(List<List<String>> docs, String term) {
        double containingDocs = 0;
        for (List<String> doc : docs) {
            if (containsIgnoreCase(doc, term)) {
                containingDocs++;
            }
        }
        // Without this guard the division yields Infinity (or NaN for an empty corpus),
        // and tf * idf for an unseen term becomes 0 * Infinity = NaN.
        if (containingDocs == 0) {
            return 0.0;
        }
        return Math.log(docs.size() / containingDocs);
    }

    /**
     * Computes the TF-IDF weight of {@code term} for one document relative to a corpus,
     * as {@code tf(doc, term) * idf(docs, term)}.
     *
     * @param doc  the document being scored
     * @param docs all documents of the corpus
     * @param term the term to score; must not be {@code null}
     * @return the TF-IDF weight; {@code 0.0} when the term is absent from {@code doc} or
     *         from every document in {@code docs}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /** Returns whether {@code doc} holds at least one word equal to {@code term}, ignoring case. */
    private static boolean containsIgnoreCase(List<String> doc, String term) {
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Small demonstration: prints the TF-IDF weight of "ipsum" in the first of three
     * sample documents.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("Lorem", "ipsum", "dolor", "ipsum", "sit", "ipsum");
        List<String> doc2 = Arrays.asList("Vituperata", "incorrupte", "at", "ipsum", "pro", "quo");
        List<String> doc3 = Arrays.asList("Has", "persius", "disputationi", "id", "simul");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3);

        TFIDFCalculator calculator = new TFIDFCalculator();
        double tfidf = calculator.tfIdf(doc1, documents, "ipsum");
        System.out.println("TF-IDF (ipsum) = " + tfidf);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment