Skip to content

Instantly share code, notes, and snippets.

@otknoy
Created July 20, 2013 22:25
Show Gist options
  • Save otknoy/6046655 to your computer and use it in GitHub Desktop.
Save otknoy/6046655 to your computer and use it in GitHub Desktop.
#!/opt/local/bin/python
import MeCab
import math
def mecab(text):
    """Tokenize *text* with MeCab and return a list of morpheme dicts.

    Each dict has three keys:
      'surface' -- the node's surface string as reported by MeCab
      'term'    -- field 6 of the node's comma-separated feature string
                   (presumably the base form under an IPAdic-style
                   dictionary -- confirm for the dictionary in use)
      'posid'   -- the node's numeric part-of-speech id

    Every node in the linked list returned by ``parseToNode`` is included;
    no filtering happens here.
    """
    tagger = MeCab.Tagger('-Ochasen')
    node = tagger.parseToNode(text)
    morphemes = []
    while node:
        features = node.feature.split(',')
        # The number of feature fields depends on the dictionary; when
        # field 6 is absent, fall back to the surface form instead of
        # raising IndexError.
        if len(features) > 6:
            term = features[6]
        else:
            term = node.surface
        morphemes.append({'surface': node.surface,
                          'term': term,
                          'posid': node.posid})
        node = node.next
    return morphemes
def extractNoun(morphemes):
    """Return the morphemes whose 'posid' is 38 or in the range 41..46.

    The input order is preserved and the input list is not modified.
    NOTE(review): these ids presumably denote noun classes in the MeCab
    dictionary's pos-id table -- confirm against the dictionary in use.
    """
    # A fixed set replaces range(41, 47).append(38), which only works
    # where range() returns a list (it does not on Python 3).
    noun_ids = frozenset([38, 41, 42, 43, 44, 45, 46])
    return [m for m in morphemes if m['posid'] in noun_ids]
def calcTF(terms):
    """Count how many times each term occurs in *terms*.

    Returns a dict mapping term -> raw occurrence count. An empty input
    yields an empty dict.
    """
    tf = {}
    for t in terms:
        # dict.get works on both Python 2 and 3; dict.has_key does not
        # exist on Python 3.
        tf[t] = tf.get(t, 0) + 1
    return tf
def calcDF(tfList):
    """Compute document frequencies from per-document term tables.

    *tfList* is an iterable of dicts keyed by term (one dict per document).
    Returns a dict mapping term -> number of those dicts that contain the
    term. Only the keys are consulted; the stored counts are ignored.
    """
    df = {}
    for tf in tfList:
        for t in tf:
            # dict.get works on both Python 2 and 3; dict.has_key does
            # not exist on Python 3.
            df[t] = df.get(t, 0) + 1
    return df
def calcTFIDF(tf, df, n):
    """Return TF-IDF weights for one document.

    tf -- dict mapping term -> frequency in this document
    df -- dict mapping term -> number of documents containing the term
    n  -- total number of documents

    The weight of each term is ``tf[t] * log(n / df[t])`` using the natural
    logarithm. Raises KeyError if a term of *tf* is missing from *df*.
    """
    tfidf = {}
    for t in tf:
        # Force true division: with two ints, Python 2 floors n / df[t]
        # (e.g. 3 / 2 == 1), which would wrongly give a weight of zero.
        tfidf[t] = tf[t] * math.log(float(n) / df[t])
    return tfidf
if __name__ == "__main__":
    # Usage: pass one file path per document on the command line. For each
    # document, the TF-IDF weight of every extracted term is printed in
    # ascending order of weight.
    import sys

    filenames = sys.argv[1:]

    # One term-frequency table per input file, in command-line order.
    tfList = []
    for filename in filenames:
        # 'with' guarantees the file is closed even if reading raises.
        with open(filename, 'r') as f:
            text = f.read()
        morphemes = mecab(text)
        nouns = extractNoun(morphemes)
        terms = [m['term'] for m in nouns]
        tfList.append(calcTF(terms))

    df = calcDF(tfList)
    n_docs = len(tfList)
    for tf in tfList:
        tfidf = calcTFIDF(tf, df, n_docs)
        # Single-argument print(...) and %-formatting produce the same
        # output under the Python 2 print statement and the Python 3
        # print function.
        print('***** Document *****')
        for k, v in sorted(tfidf.items(), key=lambda x: x[1]):
            print('%s %s' % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment