Skip to content

Instantly share code, notes, and snippets.

@devmacrile
Created February 5, 2015 16:29
Show Gist options
  • Save devmacrile/20f42808078490ebb773 to your computer and use it in GitHub Desktop.
Save devmacrile/20f42808078490ebb773 to your computer and use it in GitHub Desktop.
Modifiable map-reduce code for running TF-IDF via Hadoop Streaming jobs.
#!/usr/bin/python
import sys
import re
import nltk
from nltk.corpus import stopwords
#matches every run of characters that are not letters or digits
#(underscore is listed explicitly because \W does not cover it)
_NON_WORD = re.compile(r'[\W_]+')


def clean_token(raw, stop_words):
    """Normalise one whitespace-delimited token.

    The token is lower-cased and every non-alphanumeric character is removed.
    Returns the cleaned word, or None when the token must be dropped because
    it is empty after cleaning, is a stop word, or starts with a digit.

    NOTE(review): the stop-word lookup happens after punctuation is stripped,
    so a stop word that itself contains an apostrophe can never match --
    confirm that this is intended.
    """
    word = _NON_WORD.sub('', raw.lower())
    if not word or word in stop_words or word[0].isdigit():
        return None
    return word


def map_incident(line, stop_words):
    """Turn one 'id<TAB>text' input line into term records.

    Returns a list of 'word<TAB>id<TAB>1' strings, one per surviving token,
    in the order the tokens appear. A line without a tab separator has no
    text part and yields an empty list instead of raising IndexError.
    """
    fields = line.split('\t', 1)
    if len(fields) < 2:
        return []
    incident_id, incident = fields
    records = []
    for raw in incident.split():
        word = clean_token(raw, stop_words)
        if word is not None:
            records.append('%s\t%s\t%s' % (word, incident_id, 1))
    return records


def run_term_mapper(stdin=None, stdout=None):
    """Stream stdin through map_incident and write the records to stdout."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    #a frozenset gives constant-time membership tests; the list returned by
    #nltk would be scanned linearly for every single token
    stop_words = frozenset(stopwords.words('english'))
    for line in stdin:
        for record in map_incident(line, stop_words):
            stdout.write(record + '\n')


if __name__ == '__main__':
    run_term_mapper()
#!/usr/bin/python
import sys
import re
def remap_line(line):
    """Validate one 'term<TAB>incident<TAB>tf' record and re-emit it.

    Returns the record as 'term<TAB>incident<TAB>tf' without surrounding
    whitespace, or None when the line does not hold exactly three
    tab-separated fields. The original unpacking raised ValueError on such
    lines, which would abort the whole task.
    """
    fields = line.strip().split('\t')
    if len(fields) != 3:
        return None
    word, incident, tf = fields
    return '%s\t%s\t%s' % (word, incident, tf)


def run_passthrough_mapper(stdin=None, stdout=None):
    """Copy every well-formed record from stdin to stdout, one per line."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        record = remap_line(line)
        #malformed lines are skipped rather than crashing the task
        if record is not None:
            stdout.write(record + '\n')


if __name__ == '__main__':
    run_passthrough_mapper()
#!/usr/bin/python
import sys
import math
N = 668245.0 #default corpus size, used when no size is given on the command line


def tfidf_record(line, n_docs=N):
    """Convert one 'term<TAB>incident<TAB>tf<TAB>df' line to a tf-idf record.

    The score is tf * log10(n_docs / df). Returns the string
    'term<TAB>incident<TAB>tfidf', or None when the line cannot be scored:
    wrong number of fields, tf or df not an integer, or df not positive
    (which would otherwise raise ZeroDivisionError or a math domain error).
    """
    #the strip is necessary to remove eol characters
    fields = line.strip().split('\t')
    if len(fields) != 4:
        return None
    word, incident, tf, df = fields
    try:
        tf = int(tf)
        df = int(df)
    except ValueError:
        return None
    if df <= 0:
        return None
    tfidf = tf * math.log10(float(n_docs) / df)
    return '%s\t%s\t%s' % (word, incident, tfidf)


def run_tfidf_mapper(argv=None, stdin=None, stdout=None):
    """Score every record read from stdin and write the results to stdout.

    The corpus size is taken from the first command-line argument when one is
    present and falls back to N otherwise. Raises ValueError when the given
    size is not a positive number.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    n_docs = float(argv[1]) if len(argv) > 1 else N
    if n_docs <= 0:
        raise ValueError('corpus size must be positive, got %r' % n_docs)
    for line in stdin:
        record = tfidf_record(line, n_docs)
        #lines that cannot be scored are skipped rather than crashing the task
        if record is not None:
            stdout.write(record + '\n')


if __name__ == '__main__':
    run_tfidf_mapper()
#!/usr/bin/python
import sys
def reduce_counts(lines):
    """Sum the counts of consecutive records sharing (word, incident).

    `lines` is an iterable of 'word<TAB>incident<TAB>count' strings in which
    equal keys are adjacent. Yields one 'word<TAB>incident<TAB>total' string
    per run of equal keys.

    Lines with fewer than three fields or a non-integer count are discarded.
    Empty input yields nothing, and a discarded final line no longer prevents
    the last accumulated group from being emitted.
    """
    current_key = None
    current_count = 0
    for line in lines:
        #remove leading and trailing whitespace, then parse the mapper output
        fields = line.strip().split('\t', 2)
        if len(fields) != 3:
            continue
        word, incident, count = fields
        try:
            count = int(count)
        except ValueError:
            #count not a number, discard line
            continue
        key = (word, incident)
        if key == current_key:
            current_count += count
            continue
        #key changed: emit the finished group before starting the next one
        if current_key is not None:
            yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)
        current_key = key
        current_count = count
    #the last group is still pending once the input is exhausted
    if current_key is not None:
        yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)


def run_count_reducer(stdin=None, stdout=None):
    """Reduce the records read from stdin and write the totals to stdout."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for record in reduce_counts(stdin):
        stdout.write(record + '\n')


if __name__ == '__main__':
    run_count_reducer()
#!/usr/bin/python
import sys
def _with_df(records):
    """Append the group size to every record of one word's group."""
    df = len(records)
    return ['%s\t%s' % (record, df) for record in records]


def append_df(lines):
    """Append the document frequency to every record of each word.

    `lines` is an iterable of 'word<TAB>incident<TAB>tf' strings in which all
    records of a word are adjacent. The document frequency of a word is the
    number of records in its run. Yields
    'word<TAB>incident<TAB>tf<TAB>df' strings in input order.

    A word's records are held in memory until the word changes, because the
    frequency is only known once the whole run has been seen. Empty input
    yields nothing, and lines with fewer than three fields are skipped
    instead of raising ValueError.
    """
    current_word = None
    pending = []
    for line in lines:
        #parse input from map2.py
        fields = line.strip().split('\t', 2)
        if len(fields) != 3:
            continue
        word, incident, tf = fields
        if word != current_word:
            #word changed: the previous run is complete, so flush it
            for record in _with_df(pending):
                yield record
            pending = []
            current_word = word
        pending.append('%s\t%s\t%s' % (word, incident, tf))
    #flush the final run once the input is exhausted
    for record in _with_df(pending):
        yield record


def run_df_reducer(stdin=None, stdout=None):
    """Annotate the records read from stdin and write them to stdout."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for record in append_df(stdin):
        stdout.write(record + '\n')


if __name__ == '__main__':
    run_df_reducer()
#!/usr/bin/python
import sys
def run_identity_reducer(stdin=None, stdout=None):
    """Copy every input line to the output with surrounding whitespace removed.

    Each line is written followed by a single newline. The output is written
    through a file object rather than the Python 2 `print line` statement,
    which is a SyntaxError on Python 3, so the script runs on both.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for line in stdin:
        stdout.write(line.strip() + '\n')


if __name__ == '__main__':
    run_identity_reducer()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment