Skip to content

Instantly share code, notes, and snippets.

@d2207197
Created December 17, 2012 13:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save d2207197/4318298 to your computer and use it in GitHub Desktop.
Save d2207197/4318298 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
from mrjob.job import MRJob
from collections import Counter
from math import log
from nltk import word_tokenize
import re
import string
class Tf_Idf(MRJob):
    """Two-step mrjob job producing normalized tf * N / df weights per mail.

    Step 0 counts vocabulary terms in each mail and scales every count by
    ``TOTAL_DOCS / df``. Step 1 regroups the weights by mail and divides
    each one by the mail's total so that a mail's weights sum to 1.
    """

    # N in the tf * N / df weighting.
    # NOTE(review): presumably the number of mails in the corpus -- confirm.
    TOTAL_DOCS = 54000

    # Shape of one input line: <mail filename="DIGITS">TEXT
    MAIL_LINE_RE = re.compile(
        r'<mail filename="(?P<fname>\d+)">(?P<text>.*$)')

    # mapper_init, step 0
    def gen_tf_init(self, args=None):
        """Load the vocabulary from ``vocab.mail.txt`` into ``self.vocabs``.

        Opening the file inside gen_tf() would reopen it for every input
        line, which is slow, so it is read here once instead.

        Args:
            args: Unused.
        """
        # A set gives O(1) membership tests in gen_tf(); the with-block
        # guarantees the file handle is closed.
        with open("vocab.mail.txt", "r") as vocab_file:
            self.vocabs = {entry.strip() for entry in vocab_file}

    # mapper, step 0
    def gen_tf(self, _, line):
        """Count the vocabulary terms of one mail.

        Args:
            _: Ignored input key.
            line: One input line of the form
                ``<mail filename="DIGITS">TEXT``.

        Yields:
            ``(term, (fname, count))`` for every vocabulary term that occurs
            in the mail text.
        """
        m = self.MAIL_LINE_RE.match(line)
        if m is None:
            # Lines that are not mail records (e.g. blank lines) carry no
            # terms; skip them instead of failing on m.group().
            return
        fname = m.group('fname')
        terms = word_tokenize(m.group('text').replace('\\\\n', ' '))
        term_cnt = Counter(term for term in terms if term in self.vocabs)
        for term, count in term_cnt.items():
            yield term, (fname, count)

    # reducer, step 0
    def gen_tf_idf(self, term, fname_cnt):
        """Scale each mail's count of ``term`` by ``TOTAL_DOCS / df``.

        Args:
            term: The term shared by all incoming values.
            fname_cnt: Iterable of ``(fname, tf)`` pairs, one per mail that
                contains the term.

        Yields:
            ``(fname, (term, tf * TOTAL_DOCS / df))`` for every pair.
        """
        # Materialize the values: they are counted first and iterated after.
        fname_cnt = list(fname_cnt)
        df = len(fname_cnt)
        # True division; the file enables it via `from __future__ import
        # division`.
        idf = self.TOTAL_DOCS / df
        for fname, tf in fname_cnt:
            yield fname, (term, tf * idf)

    # mapper, step 1
    def just_cat(self, fname, terms_tfidf):
        """Pass each ``(fname, (term, weight))`` record through unchanged."""
        yield fname, terms_tfidf

    # reducer, step 1
    def reduce_by_fname(self, fname, tf_idfs):
        """Normalize one mail's weights so that they sum to 1.

        Args:
            fname: The mail identifier.
            tf_idfs: Iterable of ``(term, weight)`` pairs for this mail.

        Yields:
            A single ``(fname, pairs)`` record where ``pairs`` is a list of
            ``[term, weight / total]`` lists.
        """
        tf_idfs = list(tf_idfs)
        total = sum(weight for _, weight in tf_idfs)
        # Build fresh pairs rather than assigning into each incoming pair,
        # which would fail if the pairs were tuples.
        yield fname, [[term, weight / total] for term, weight in tf_idfs]

    # Define the map-reduce steps.
    def steps(self):
        """Return the two map-reduce steps of the job, in execution order."""
        # NOTE(review): newer mrjob releases replace self.mr() with MRStep --
        # confirm against the mrjob version this job is deployed with.
        return [
            self.mr(  # map-reduce step 0
                mapper_init=self.gen_tf_init,
                mapper=self.gen_tf,
                reducer=self.gen_tf_idf,
            ),
            self.mr(  # map-reduce step 1
                mapper=self.just_cat,
                reducer=self.reduce_by_fname,
            ),
        ]
# Script entry point: hand control to the job's command-line runner.
if __name__ == '__main__':
    Tf_Idf.run()
# Fields emitted by the step 0 mapper:  docid, term, count
# Fields emitted by the step 0 reducer: docid, term, tf * N / df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment