Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Last active October 20, 2018 11:40
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mocobeta/5525864 to your computer and use it in GitHub Desktop.
Save mocobeta/5525864 to your computer and use it in GitHub Desktop.
Lucene API を使って文書類似度を計算するテスト
package termvector;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
/**
 * Computes the similarity of two indexed documents as the angle (in radians)
 * between their TF-IDF document vectors, using a Lucene index in the
 * "tumblrdata" directory. Documents are looked up by their "id" field and
 * vectorized from the stored term vector of the "content" field.
 *
 * <p>NOTE: requires that term vectors were stored for "content" at index time.
 */
public class CalcCosineSimilarityTest {

    private final IndexReader reader;

    /**
     * Opens the Lucene index stored in the "tumblrdata" directory.
     *
     * @throws IOException if the index cannot be opened
     */
    public CalcCosineSimilarityTest() throws IOException {
        Directory directory = FSDirectory.open(new File("tumblrdata"));
        reader = DirectoryReader.open(directory);
    }

    /** Releases the underlying index reader. */
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Computes the document similarity (angle between the two documents'
     * TF-IDF vectors) of the posts with the given ids.
     *
     * @param postid1 value of the "id" field of the first post
     * @param postid2 value of the "id" field of the second post
     * @return the angle in radians (0 = identical direction, PI/2 = orthogonal);
     *         returns 0.0 when either id is not found (a message is printed)
     * @throws IOException if the index cannot be read
     */
    public double computeSimilarity(String postid1, String postid2) throws IOException {
        IndexSearcher searcher = new IndexSearcher(reader);
        int docId1 = findDocId(searcher, postid1);
        if (docId1 < 0) {
            System.out.println("No Such Doc: " + postid1);
            return 0.0;
        }
        int docId2 = findDocId(searcher, postid2);
        if (docId2 < 0) {
            System.out.println("No Such Doc: " + postid2);
            return 0.0;
        }
        Map<String, Double> vec1 = buildDocumentVector(docId1);
        Map<String, Double> vec2 = buildDocumentVector(docId2);
        return computeAngle(vec1, vec2);
    }

    /** Resolves a post id to an internal Lucene doc id, or -1 when not found. */
    private static int findDocId(IndexSearcher searcher, String postid) throws IOException {
        TopDocs hits = searcher.search(new TermQuery(new Term("id", postid)), 1);
        return hits.scoreDocs.length > 0 ? hits.scoreDocs[0].doc : -1;
    }

    /**
     * Builds the TF-IDF document vector for the given doc id from the stored
     * term vector of the "content" field. Terms that do not occur in the
     * document are omitted from the map.
     *
     * @param docId internal Lucene document id
     * @return map from term text to its TF-IDF weight in this document
     * @throws IOException if the index cannot be read
     * @throws IllegalStateException if no term vector was stored for "content"
     */
    private Map<String, Double> buildDocumentVector(int docId) throws IOException {
        int maxDoc = reader.maxDoc();
        Terms vector = reader.getTermVector(docId, "content");
        // getTermVector returns null when term vectors were not indexed for the
        // field; fail with a clear message instead of an NPE on iterator(null).
        if (vector == null) {
            throw new IllegalStateException(
                    "No term vector stored for field \"content\" of doc " + docId);
        }
        TermsEnum itr = vector.iterator(null);
        List<TermFreq> list = new ArrayList<TermFreq>();
        long tcSum = 0; // total number of term occurrences in this document
        BytesRef ref;
        while ((ref = itr.next()) != null) {
            String term = ref.utf8ToString();
            TermFreq freq = new TermFreq(term, maxDoc);
            freq.setTc(itr.totalTermFreq());
            freq.setDf(reader.docFreq(new Term("content", term)));
            list.add(freq);
            tcSum += itr.totalTermFreq();
        }
        // tcSum is only known after the full pass, so TF-IDF is computed in a
        // second loop once every TermFreq can be given the document total.
        Map<String, Double> docVector = new HashMap<String, Double>();
        for (TermFreq freq : list) {
            freq.setTcSum(tcSum);
            docVector.put(freq.getTerm(), freq.calcTFIDF());
        }
        return docVector;
    }

    /**
     * Computes the angle (radians) between two sparse vectors represented as
     * term-to-weight maps.
     */
    private double computeAngle(Map<String, Double> vec1, Map<String, Double> vec2) {
        // Dot product over the (sparse) intersection of the two term sets.
        double dotProduct = 0;
        for (Map.Entry<String, Double> entry : vec1.entrySet()) {
            Double other = vec2.get(entry.getKey());
            if (other != null) {
                dotProduct += entry.getValue() * other;
            }
        }
        double denominator = getNorm(vec1) * getNorm(vec2);
        if (denominator == 0.0) {
            // At least one vector is empty / all-zero: define the angle as
            // orthogonal instead of propagating NaN from 0/0.
            return Math.PI / 2;
        }
        // Floating-point rounding can push the cosine slightly outside
        // [-1, 1], which would make Math.acos return NaN; clamp first.
        double ratio = Math.max(-1.0, Math.min(1.0, dotProduct / denominator));
        return Math.acos(ratio);
    }

    /** Euclidean (L2) norm of a sparse vector. */
    private double getNorm(Map<String, Double> vec) {
        double sumOfSquares = 0;
        for (Double val : vec.values()) {
            sumOfSquares += val * val;
        }
        return Math.sqrt(sumOfSquares);
    }

    /** Holds per-term occurrence statistics used to compute TF-IDF. */
    private static class TermFreq {
        private final String term;
        private final int maxDoc; // total number of documents in the index
        private long tc;          // occurrences of the term in this document
        private long tcSum;       // total term occurrences in this document
        private int df;           // number of documents containing the term

        TermFreq(String term, int maxDoc) {
            this.term = term;
            this.maxDoc = maxDoc;
        }

        String getTerm() { return term; }
        void setTc(long tc) { this.tc = tc; }
        void setTcSum(long tcSum) { this.tcSum = tcSum; }
        void setDf(int df) { this.df = df; }

        /** TF-IDF weight: (tc / tcSum) * ln(maxDoc / df). */
        double calcTFIDF() {
            double tf = (double) tc / (double) tcSum;
            double idf = Math.log((double) maxDoc / (double) df);
            return tf * idf;
        }

        @Override
        public String toString() {
            return term + "\tTF " + tc + "\tTF-IDF: " + calcTFIDF();
        }
    }

    public static void main(String[] args) throws IOException {
        CalcCosineSimilarityTest test = new CalcCosineSimilarityTest();
        // Ensure the reader is closed even if a lookup throws.
        try {
            double sim1 = test.computeSimilarity("33643175608", "33709242760");
            System.out.println(sim1);
            double sim2 = test.computeSimilarity("33643175608", "34171929890");
            System.out.println(sim2);
            double sim3 = test.computeSimilarity("33643175608", "32047551295");
            System.out.println(sim3);
        } finally {
            test.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment