Last active: October 20, 2018 11:40
Save mocobeta/5525864 to your computer and use it in GitHub Desktop.
Lucene API を使って文書類似度を計算するテスト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package termvector; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.Term; | |
import org.apache.lucene.index.Terms; | |
import org.apache.lucene.index.TermsEnum; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.TermQuery; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.FSDirectory; | |
import org.apache.lucene.util.BytesRef; | |
public class CalcCosineSimilarityTest { | |
private IndexReader reader = null; | |
public CalcCosineSimilarityTest() throws IOException { | |
Directory directory = FSDirectory.open(new File("tumblrdata")); | |
reader = DirectoryReader.open(directory); | |
} | |
public void close() throws IOException { | |
reader.close(); | |
} | |
/** | |
* 2つのエントリの文書類似度(文書ベクトルのなす角度)を計算する | |
*/ | |
public double computeSimilarity(String postid1, String postid2) throws IOException { | |
IndexSearcher searcher = new IndexSearcher(reader); | |
int docId1 = -1; | |
int docId2 = -1; | |
TopDocs hits = searcher.search(new TermQuery(new Term("id", postid1)), 1); | |
if (hits.scoreDocs.length > 0) { | |
docId1 = hits.scoreDocs[0].doc; | |
} | |
if (docId1 < 0) { | |
System.out.println("No Such Doc: " + postid1); | |
return 0.0; | |
} | |
hits = searcher.search(new TermQuery(new Term("id", postid2)), 1); | |
if (hits.scoreDocs.length > 0) { | |
docId2 = hits.scoreDocs[0].doc; | |
} | |
if (docId2 < 0) { | |
System.out.println("No Such Doc: " + postid2); | |
return 0.0; | |
} | |
Map<String, Double> vec1 = buildDocumentVector(docId1); | |
Map<String, Double> vec2 = buildDocumentVector(docId2); | |
return computeAngle(vec1, vec2); | |
} | |
/** | |
* 指定されたドキュメントの文書ベクトル(単語のTF-IDF値を並べたもの)を計算する | |
* 出現しない単語のTF-IDF値は省略 | |
**/ | |
private Map<String, Double> buildDocumentVector(int docId) throws IOException { | |
int maxDoc = reader.maxDoc(); | |
Terms vector = reader.getTermVector(docId, "content"); | |
// Term Vector から TF-IDF値を算出 | |
TermsEnum itr = vector.iterator(null); | |
BytesRef ref = null; | |
List<TermFreq> list = new ArrayList<TermFreq>(); | |
long tcSum = 0; | |
while ((ref = itr.next()) != null) { | |
String term = ref.utf8ToString(); | |
TermFreq freq = new TermFreq(term, maxDoc); | |
freq.setTc(itr.totalTermFreq()); | |
freq.setDf(reader.docFreq(new Term("content", term))); | |
list.add(freq); | |
tcSum += itr.totalTermFreq(); | |
} | |
// TF-IDF値を並べたベクトル | |
Map<String, Double> docVector = new HashMap<String, Double>(); | |
for (TermFreq freq : list) { | |
freq.setTcSum(tcSum); | |
docVector.put(freq.getTerm(), freq.calcTFIDF()); | |
} | |
return docVector; | |
} | |
/** 2つのベクトルのなす角度を算出 */ | |
private double computeAngle(Map<String, Double> vec1, Map<String, Double> vec2) throws IOException { | |
double dotProduct = 0; // 内積 | |
for (String term : vec1.keySet()) { | |
if (vec2.containsKey(term)) { | |
dotProduct += vec1.get(term) * vec2.get(term); | |
} | |
} | |
double denominator = getNorm(vec1) * getNorm(vec2); // ベクトルの大きさの積 | |
double ratio = dotProduct / denominator; // コサイン値 | |
return Math.acos(ratio); | |
} | |
private double getNorm(Map<String, Double> vec) { | |
double sumOfSquares = 0; | |
for (Double val : vec.values()){ | |
sumOfSquares += val * val; | |
} | |
return Math.sqrt(sumOfSquares); | |
} | |
/** 出現頻度の統計値を保持するクラス */ | |
private static class TermFreq { | |
private final String term; /** term */ | |
private long tc; /** 単語出現回数 */ | |
private long tcSum; /** ドキュメント内の、全単語出現回数 */ | |
private int df; /** 単語が出現するドキュメント数 */ | |
private int maxDoc; /** インデックスに含まれる全ドキュメント数 */ | |
TermFreq(String term, int maxDoc) { | |
this.term = term; | |
this.maxDoc = maxDoc; | |
} | |
String getTerm() { return term; } | |
void setTc(long tc) { this.tc = tc; } | |
long getTc() { return tc; } | |
void setTcSum(long tcSum) { this.tcSum = tcSum; } | |
void setDf(int df) { this.df = df; } | |
int getDf() { return df; } | |
/** TF-IDF値の計算 */ | |
double calcTFIDF() { | |
double tf = (double)tc / (double)tcSum; | |
double idf = Math.log((double)maxDoc / (double)df); | |
return tf * idf; | |
} | |
@Override | |
public String toString() { | |
return term + "\tTF "+ tc + "\tTF-IDF: " + calcTFIDF(); | |
} | |
} | |
public static void main(String[] args) throws IOException { | |
CalcCosineSimilarityTest test = new CalcCosineSimilarityTest(); | |
double sim1 = test.computeSimilarity("33643175608", "33709242760"); | |
System.out.println(sim1); | |
double sim2 = test.computeSimilarity("33643175608", "34171929890"); | |
System.out.println(sim2); | |
double sim3 = test.computeSimilarity("33643175608", "32047551295"); | |
System.out.println(sim3); | |
test.close(); | |
} | |
} | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.