devmacrile/map1.py

## map1.py
#!/usr/bin/python
import sys
import re
import nltk
from nltk.corpus import stopwords

#every run of non-alphanumeric characters (underscore included) is deleted
#from a word; compiled once here instead of once per word
NON_WORD_CHARS = re.compile(r'[\W_]+')


def map_line(line, stop_words):
	"""Yield a 'term<TAB>incident_id<TAB>1' record for each kept word of a line.

	line is expected to look like 'incident_id<TAB>incident text'. Every
	whitespace-separated word of the text is lower-cased and stripped of
	non-alphanumeric characters; it is then dropped if nothing is left, if
	it is contained in stop_words, or if it starts with a digit.

	A line without a tab carries no incident text and yields nothing.
	"""
	incident_id, tab, incident = line.partition('\t')
	if not tab:
		return
	for word in incident.split():
		word = NON_WORD_CHARS.sub('', word.lower())
		if not word or word in stop_words or word[0].isdigit():
			continue
		yield '%s\t%s\t%s' % (word, incident_id, 1)


def main():
	"""Tokenize every line of standard input and print the resulting records."""
	#a set turns each stop-word test into a hash lookup instead of a list scan
	stop_words = set(stopwords.words('english'))
	for line in sys.stdin:
		for record in map_line(line, stop_words):
			print(record)


if __name__ == '__main__':
	main()

## map2.py
#!/usr/bin/python
import sys
import re


def map_line(line):
	"""Return a 'term<TAB>incident<TAB>tf' record unchanged, or None.

	The line is stripped of surrounding whitespace and must hold exactly
	three tab-separated fields; anything else is reported as None so the
	caller can skip it instead of aborting on an unpacking error.
	"""
	fields = line.strip().split('\t')
	if len(fields) != 3:
		return None
	return '\t'.join(fields)


def main():
	"""Copy every well-formed record from standard input to standard output."""
	for line in sys.stdin:
		record = map_line(line)
		if record is not None:
			print(record)


if __name__ == '__main__':
	main()

## map3.py
#!/usr/bin/python

import sys
import math

#numerator of the idf ratio N/df (presumably the number of incidents in the
#corpus - confirm); used unless another value is given as first argument
N = 668245.0


def tfidf_record(line, n_docs=N):
	"""Return 'term<TAB>incident<TAB>tfidf' for one 'term, incident, tf, df' line.

	tfidf is computed as tf * log10(n_docs / df). None is returned when the
	line does not hold exactly four tab-separated fields, when tf or df is
	not an integer, or when df is not positive (the ratio would divide by
	zero or leave the domain of log10).
	"""
	#the strip is necessary to remove eol characters
	fields = line.strip().split('\t')
	if len(fields) != 4:
		return None
	word, incident, tf, df = fields
	try:
		tf = int(tf)
		df = int(df)
	except ValueError:
		return None
	if df <= 0:
		return None
	#float() keeps the division a true division even for an integer n_docs
	tfidf = tf * math.log10(float(n_docs) / df)
	return '%s\t%s\t%s' % (word, incident, tfidf)


def main():
	"""Print the tf-idf record of every usable line read from standard input."""
	n_docs = float(sys.argv[1]) if len(sys.argv) > 1 else N
	for line in sys.stdin:
		record = tfidf_record(line, n_docs)
		if record is not None:
			print(record)


if __name__ == '__main__':
	main()

## red1.py
#!/usr/bin/python

import sys

def sum_counts(lines):
	"""Yield 'term<TAB>incident<TAB>total' for each run of equal (term, incident).

	Every line is expected to be 'term<TAB>incident<TAB>count'. Counts of
	consecutive lines sharing the same term and incident are added up, so
	equal keys must arrive next to each other. Lines with fewer than three
	fields or with a count that is not an integer are discarded.

	Nothing is yielded for empty input, and a discarded trailing line no
	longer prevents the last accumulated group from being emitted.
	"""
	current_key = None
	current_count = 0
	for line in lines:
		fields = line.strip().split('\t', 2)
		if len(fields) != 3:
			continue
		word, incident, count = fields
		try:
			count = int(count)
		except ValueError:
			#count not a number, discard line
			continue

		key = (word, incident)
		if key == current_key:
			current_count += count
			continue
		if current_key is not None:
			yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)
		current_key = key
		current_count = count

	#the final group has no following key change to trigger its output
	if current_key is not None:
		yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)


def main():
	"""Reduce the records on standard input and print the summed counts."""
	for record in sum_counts(sys.stdin):
		print(record)


if __name__ == '__main__':
	main()


## red2.py
#!/usr/bin/python

import sys

def _with_df(group):
	"""Return the output records of one term's postings, df appended to each."""
	df = len(group)
	#postings after the first are written first and the first one last; this
	#keeps the order in which this reducer has always emitted a term's rows
	ordered = group[1:] + group[:1]
	return ['%s\t%s\t%s\t%s' % (word, incident, tf, df)
			for word, incident, tf in ordered]


def attach_df(lines):
	"""Yield 'term<TAB>incident<TAB>tf<TAB>df' for every input record.

	Every line is expected to be 'term<TAB>incident<TAB>tf'. df is the
	number of consecutive lines that share the term, so all lines of a term
	must arrive next to each other; they are held in memory until the term
	changes because df is only known then. Lines with fewer than three
	fields are skipped and empty input yields nothing.
	"""
	group = []
	for line in lines:
		fields = line.strip().split('\t', 2)
		if len(fields) != 3:
			continue
		if group and fields[0] != group[0][0]:
			for record in _with_df(group):
				yield record
			group = []
		group.append(fields)
	#the last term has no following term to trigger its output
	for record in _with_df(group):
		yield record


def main():
	"""Annotate the records on standard input with df and print them."""
	for record in attach_df(sys.stdin):
		print(record)


if __name__ == '__main__':
	main()


## red3.py
#!/usr/bin/python

import sys

def strip_lines(lines):
	"""Yield every line with its leading and trailing whitespace removed."""
	for line in lines:
		yield line.strip()


def main():
	"""Copy standard input to standard output, one stripped line at a time."""
	for line in strip_lines(sys.stdin):
		#print(x) with one argument works on Python 2 and 3 alike, unlike
		#the print statement
		print(line)


if __name__ == '__main__':
	main()
	#!/usr/bin/python
#term extraction mapper; block structure restored from a listing that had
#lost its indentation
import sys
import re

#every run of non-alphanumeric characters (underscore included) is deleted
#from a word; compiled once here instead of once per word
NON_WORD_CHARS = re.compile(r'[\W_]+')


def map_line(line, stop_words):
	"""Yield a 'term<TAB>incident_id<TAB>1' record for each kept word of a line.

	line is expected to look like 'incident_id<TAB>incident text'. Every
	whitespace-separated word of the text is lower-cased and stripped of
	non-alphanumeric characters; it is then dropped if nothing is left, if
	it is contained in stop_words, or if it starts with a digit.

	A line without a tab carries no incident text and yields nothing.
	"""
	incident_id, tab, incident = line.partition('\t')
	if not tab:
		return
	for word in incident.split():
		word = NON_WORD_CHARS.sub('', word.lower())
		if not word or word in stop_words or word[0].isdigit():
			continue
		yield '%s\t%s\t%s' % (word, incident_id, 1)


def main():
	"""Tokenize every line of standard input and print the resulting records."""
	#imported here so map_line can be used without nltk being installed
	import nltk
	from nltk.corpus import stopwords

	#a set turns each stop-word test into a hash lookup instead of a list scan
	stop_words = set(stopwords.words('english'))
	for line in sys.stdin:
		for record in map_line(line, stop_words):
			print(record)


if __name__ == '__main__':
	main()
	#!/usr/bin/python
#tf-idf mapper; block structure restored from a listing that had lost its
#indentation

import sys
import math

#numerator of the idf ratio N/df (presumably the number of incidents in the
#corpus - confirm); used unless another value is given as first argument
N = 668245.0


def tfidf_record(line, n_docs=N):
	"""Return 'term<TAB>incident<TAB>tfidf' for one 'term, incident, tf, df' line.

	tfidf is computed as tf * log10(n_docs / df). None is returned when the
	line does not hold exactly four tab-separated fields, when tf or df is
	not an integer, or when df is not positive (the ratio would divide by
	zero or leave the domain of log10).
	"""
	#the strip is necessary to remove eol characters
	fields = line.strip().split('\t')
	if len(fields) != 4:
		return None
	word, incident, tf, df = fields
	try:
		tf = int(tf)
		df = int(df)
	except ValueError:
		return None
	if df <= 0:
		return None
	#float() keeps the division a true division even for an integer n_docs
	tfidf = tf * math.log10(float(n_docs) / df)
	return '%s\t%s\t%s' % (word, incident, tfidf)


def main():
	"""Print the tf-idf record of every usable line read from standard input."""
	n_docs = float(sys.argv[1]) if len(sys.argv) > 1 else N
	for line in sys.stdin:
		record = tfidf_record(line, n_docs)
		if record is not None:
			print(record)


if __name__ == '__main__':
	main()
	#!/usr/bin/python
#term-count reducer; block structure restored from a listing that had lost
#its indentation

import sys


def sum_counts(lines):
	"""Yield 'term<TAB>incident<TAB>total' for each run of equal (term, incident).

	Every line is expected to be 'term<TAB>incident<TAB>count'. Counts of
	consecutive lines sharing the same term and incident are added up, so
	equal keys must arrive next to each other. Lines with fewer than three
	fields or with a count that is not an integer are discarded.

	Nothing is yielded for empty input, and a discarded trailing line does
	not prevent the last accumulated group from being emitted.
	"""
	current_key = None
	current_count = 0
	for line in lines:
		fields = line.strip().split('\t', 2)
		if len(fields) != 3:
			continue
		word, incident, count = fields
		try:
			count = int(count)
		except ValueError:
			#count not a number, discard line
			continue

		key = (word, incident)
		if key == current_key:
			current_count += count
			continue
		if current_key is not None:
			yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)
		current_key = key
		current_count = count

	#the final group has no following key change to trigger its output
	if current_key is not None:
		yield '%s\t%s\t%s' % (current_key[0], current_key[1], current_count)


def main():
	"""Reduce the records on standard input and print the summed counts."""
	for record in sum_counts(sys.stdin):
		print(record)


if __name__ == '__main__':
	main()
	#!/usr/bin/python
#document-frequency reducer; block structure restored from a listing that
#had lost its indentation

import sys


def _with_df(group):
	"""Return the output records of one term's postings, df appended to each."""
	df = len(group)
	#postings after the first are written first and the first one last; this
	#keeps the order in which this reducer has always emitted a term's rows
	ordered = group[1:] + group[:1]
	return ['%s\t%s\t%s\t%s' % (word, incident, tf, df)
			for word, incident, tf in ordered]


def attach_df(lines):
	"""Yield 'term<TAB>incident<TAB>tf<TAB>df' for every input record.

	Every line is expected to be 'term<TAB>incident<TAB>tf'. df is the
	number of consecutive lines that share the term, so all lines of a term
	must arrive next to each other; they are held in memory until the term
	changes because df is only known then. Lines with fewer than three
	fields are skipped and empty input yields nothing.
	"""
	group = []
	for line in lines:
		fields = line.strip().split('\t', 2)
		if len(fields) != 3:
			continue
		if group and fields[0] != group[0][0]:
			for record in _with_df(group):
				yield record
			group = []
		group.append(fields)
	#the last term has no following term to trigger its output
	for record in _with_df(group):
		yield record


def main():
	"""Annotate the records on standard input with df and print them."""
	for record in attach_df(sys.stdin):
		print(record)


if __name__ == '__main__':
	main()