anjesh/news_score.py

## news_score.py
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer, PunktWordTokenizer
import string
from os import listdir
from os.path import isfile, join
import logging

# Module-wide logging setup: records of level DEBUG and above emitted through
# `logger` are formatted with a timestamp, logger name and level, and sent to
# the file 'news-scrapper-debug.log' (a relative path).
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.FileHandler('news-scrapper-debug.log')
handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)

def extract_words(text):
    '''
    Tokenize ``text`` and return its stemmed, lower-cased content words.

    The text is split with PunktWordTokenizer. A token is dropped when it is
    a single character long or when its lower-cased form is in the English
    stop-word list; every remaining token is lower-cased and Porter-stemmed.

    Duplicates are kept and token order is preserved, so the returned list
    can be used to count occurrences. Only single words are produced; no
    bigrams are extracted.
    '''
    stemmer = PorterStemmer()
    tokenizer = PunktWordTokenizer()
    # Fetch the stop-word list once (it used to be re-fetched for every
    # token) and keep it in a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))

    result = []
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        if len(token) > 1 and lowered not in stop_words:
            result.append(stemmer.stem(lowered))
    return result

class FeatureCollection:
    '''
    Registry of FeatureClass objects keyed by feature id.

    Every collection owns its own ``features`` dict, so entries added to one
    collection are not visible in another.
    '''

    def __init__(self):
        # Created per instance: a class-level ``features = {}`` is a single
        # dict shared by all FeatureCollection objects.
        self.features = {}

    def add(self, id, name, text):
        '''
        Build a FeatureClass from ``text`` and register it under ``id``.

        Returns the newly created FeatureClass.
        Raises Exception when ``id`` is already present in this collection.
        '''
        logger.info("Adding feature id:%s name:%s to collection", id, name)
        if id in self.features:
            logger.critical("Duplication id:%s already exists in FeatureCollection", id)
            # %-formatting instead of ``+`` so a non-string id still yields
            # the intended duplication error rather than a TypeError.
            raise Exception('Duplication: %s already exists in FeatureCollection' % (id,))
        featureObj = FeatureClass(id, name, text)
        self.features[id] = featureObj
        return featureObj

    def getFeatureObj(self, id):
        '''Return the feature registered under ``id`` (KeyError if absent).'''
        return self.features[id]

class FeatureClass:
    '''
    A single named feature: its source text plus the distinct stemmed words
    extracted from that text.
    '''
    # Class-level defaults; __init__ rebinds each of them on the instance.
    id = 0
    name = ""
    text = ""
    feature = {}

    def __init__(self, id, name, text):
        '''
        Store the identifying fields and derive the word list.

        ``feature`` becomes a list of the unique words returned by
        extract_words(text); ``featureCount`` is the length of that list.
        '''
        self.id, self.name, self.text = id, name, text
        distinctWords = set(extract_words(text))
        self.feature = list(distinctWords)
        self.featureCount = len(self.feature)

def countWord(word, tokens):
    '''
    Return the number of times ``word`` occurs in the list ``tokens``.

    On Python 2 a ``unicode`` word is first encoded to a UTF-8 byte string
    (presumably because the tokens come from byte-string input; confirm
    against the callers). Any other word is counted as given. Returns 0 when
    the word does not occur.
    '''
    # True only for a Python 2 ``unicode`` value: a text string that is not
    # the native ``str`` type. isinstance replaces the old comparison against
    # the printed form of the type, which did not match unicode subclasses.
    if isinstance(word, type(u'')) and not isinstance(word, str):
        word = word.encode('utf-8')
    # count() already yields 0 for a missing word, so no separate ``in`` scan.
    return tokens.count(word)

class NewsScorer:
    '''
    Scores one piece of news text against feature word lists.

    ``tokens`` holds the words extract_words() returns for ``content``.
    ``scores`` maps feature id -> score; it is filled in the constructor
    when a feature collection is supplied, otherwise it stays empty and
    calculateFeatureScore() can be called for individual features.
    '''

    def __init__(self, content, featureCollection = ""):
        '''
        content           -- the news text to score.
        featureCollection -- optional object exposing a ``features`` dict;
                             when truthy, all its features are scored now.
        '''
        self.content = content
        self.tokens = extract_words(content)
        self.featureCollection = featureCollection
        self.scores = {}
        if featureCollection:
            self.calculateAll()

    def calculateAll(self):
        '''Score the news against every feature and store results in ``scores``.'''
        for featureId, featureObj in self.featureCollection.features.items():
            logger.info("Calculating score for news for feature id:%s name:%s", featureId, featureObj.name)
            self.scores[featureId] = self.calculateFeatureScore(featureObj)['score']

    def calculateFeatureScore(self, featureObj):
        '''
        Return {'score': float, 'words': {word: count}} for one feature.

        ``words`` lists the feature words that occur in the news tokens with
        their occurrence counts; ``score`` is the sum of those counts divided
        by the number of words in the feature. A feature with no words
        scores 0.0.
        '''
        # A feature built from empty or stop-word-only text has no words.
        # Dividing by its zero count used to raise ZeroDivisionError and
        # abort calculateAll() for the whole collection.
        if not featureObj.featureCount:
            return {'score': 0.0, 'words': {}}

        found = {}
        total = 0
        for word in featureObj.feature:
            count = countWord(word, self.tokens)
            if count >= 1:
                found[word] = count
                total += count
        logger.debug("Found words: %s", found)
        return {'score': float(total) / featureObj.featureCount, 'words': found}


if __name__ == '__main__':
    # Demo driver: load two feature definitions from disk, then score every
    # regular file in ../data/news against them and print the scores.
    featureCollection = FeatureCollection()
    featureFiles = (
        ('agriculture', '../data/features/agriculture-1.txt'),
        ('tourism', '../data/features/tourism-1.txt'),
    )
    for featureId, featurePath in featureFiles:
        # ``with`` closes the file; the handles used to be left open.
        with open(featurePath, 'r') as featureFile:
            featureCollection.add(featureId, featureId, featureFile.read())

    mypath = "../data/news"
    onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
    for myfile in onlyfiles:
        with open(join(mypath,myfile)) as newsFile:
            content = newsFile.read()
        newsScore = NewsScorer(content,featureCollection)
        # Single pre-formatted argument: same output as ``print a, b`` on
        # Python 2, and the line also parses on Python 3.
        print("%s %s" % (myfile, newsScore.scores))


## test.py
# This Python file uses the following encoding: utf-8
# place this file in the same folder containing news_score.py

from news_score import *

# Case 1: an ASCII feature scored against an English sentence.
featureCollection = FeatureCollection()
feature = featureCollection.add('agriculture-1', 'agriculture', "Vegetable farmers agriculture")

englishNews = "Vegetable is good. Farmers should produce more vegetables."
newsScore = NewsScorer(englishNews)
result = newsScore.calculateFeatureScore(feature)
print(result)

# Case 2: a feature built from a unicode (Devanagari) string, scored against
# a UTF-8 byte-string sentence.
unicodeFeatureText = "किसान तरकारी".decode("utf8")
featureunicode = featureCollection.add('agriculture-unicode', 'agriculture', unicodeFeatureText)

# Show the words extracted for the unicode feature, one per line.
for word in featureunicode.feature:
	print(word.encode('utf-8'))

newsScore = NewsScorer("किसान मेहेनत गर्छ")
result = newsScore.calculateFeatureScore(featureunicode)
print(result)

# List each matched word with its occurrence count as "word : count".
foundWords = result['words']
for word in foundWords:
	print(" ".join([word.encode('utf-8'), ":", str(foundWords[word])]))
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer
	from nltk.tokenize import WordPunctTokenizer, PunktWordTokenizer
	import string
	from os import listdir
	from os.path import isfile, join
	import logging

	# Module-wide logging setup: records of level DEBUG and above emitted
	# through `logger` are formatted with a timestamp, logger name and level,
	# and sent to the file 'news-scrapper-debug.log' (a relative path).
	formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
	handler = logging.FileHandler('news-scrapper-debug.log')
	handler.setFormatter(formatter)

	logger = logging.getLogger(__name__)
	logger.setLevel(logging.DEBUG)
	logger.addHandler(handler)

	def extract_words(text):
		'''
		Tokenize ``text`` and return its stemmed, lower-cased content words.

		The text is split with PunktWordTokenizer. A token is dropped when it
		is a single character long or when its lower-cased form is in the
		English stop-word list; every remaining token is lower-cased and
		Porter-stemmed.

		Duplicates are kept and token order is preserved, so the returned
		list can be used to count occurrences. Only single words are
		produced; no bigrams are extracted.
		'''
		stemmer = PorterStemmer()
		tokenizer = PunktWordTokenizer()
		# Fetch the stop-word list once (it used to be re-fetched for every
		# token) and keep it in a set so each membership test is a hash lookup.
		stop_words = set(stopwords.words('english'))

		result = []
		for token in tokenizer.tokenize(text):
			lowered = token.lower()
			if len(token) > 1 and lowered not in stop_words:
				result.append(stemmer.stem(lowered))
		return result

	class FeatureCollection:
		'''
		Registry of FeatureClass objects keyed by feature id.

		Every collection owns its own ``features`` dict, so entries added to
		one collection are not visible in another.
		'''

		def __init__(self):
			# Created per instance: a class-level ``features = {}`` is a
			# single dict shared by all FeatureCollection objects.
			self.features = {}

		def add(self, id, name, text):
			'''
			Build a FeatureClass from ``text`` and register it under ``id``.

			Returns the newly created FeatureClass.
			Raises Exception when ``id`` is already present in this collection.
			'''
			logger.info("Adding feature id:%s name:%s to collection", id, name)
			if id in self.features:
				logger.critical("Duplication id:%s already exists in FeatureCollection", id)
				# %-formatting instead of ``+`` so a non-string id still yields
				# the intended duplication error rather than a TypeError.
				raise Exception('Duplication: %s already exists in FeatureCollection' % (id,))
			featureObj = FeatureClass(id, name, text)
			self.features[id] = featureObj
			return featureObj

		def getFeatureObj(self, id):
			'''Return the feature registered under ``id`` (KeyError if absent).'''
			return self.features[id]

	class FeatureClass:
		'''
		A single named feature: its source text plus the distinct stemmed
		words extracted from that text.
		'''
		# Class-level defaults; __init__ rebinds each of them on the instance.
		id = 0
		name = ""
		text = ""
		feature = {}

		def __init__(self, id, name, text):
			'''
			Store the identifying fields and derive the word list.

			``feature`` becomes a list of the unique words returned by
			extract_words(text); ``featureCount`` is the length of that list.
			'''
			self.id, self.name, self.text = id, name, text
			distinctWords = set(extract_words(text))
			self.feature = list(distinctWords)
			self.featureCount = len(self.feature)

	def countWord(word, tokens):
		'''
		Return the number of times ``word`` occurs in the list ``tokens``.

		On Python 2 a ``unicode`` word is first encoded to a UTF-8 byte string
		(presumably because the tokens come from byte-string input; confirm
		against the callers). Any other word is counted as given. Returns 0
		when the word does not occur.
		'''
		# True only for a Python 2 ``unicode`` value: a text string that is
		# not the native ``str`` type. isinstance replaces the old comparison
		# against the printed form of the type, which did not match unicode
		# subclasses.
		if isinstance(word, type(u'')) and not isinstance(word, str):
			word = word.encode('utf-8')
		# count() already yields 0 for a missing word, so no separate ``in`` scan.
		return tokens.count(word)

	class NewsScorer:
		'''
		Scores one piece of news text against feature word lists.

		``tokens`` holds the words extract_words() returns for ``content``.
		``scores`` maps feature id -> score; it is filled in the constructor
		when a feature collection is supplied, otherwise it stays empty and
		calculateFeatureScore() can be called for individual features.
		'''

		def __init__(self, content, featureCollection = ""):
			'''
			content           -- the news text to score.
			featureCollection -- optional object exposing a ``features`` dict;
			                     when truthy, all its features are scored now.
			'''
			self.content = content
			self.tokens = extract_words(content)
			self.featureCollection = featureCollection
			self.scores = {}
			if featureCollection:
				self.calculateAll()

		def calculateAll(self):
			'''Score the news against every feature and store results in ``scores``.'''
			for featureId, featureObj in self.featureCollection.features.items():
				logger.info("Calculating score for news for feature id:%s name:%s", featureId, featureObj.name)
				self.scores[featureId] = self.calculateFeatureScore(featureObj)['score']

		def calculateFeatureScore(self, featureObj):
			'''
			Return {'score': float, 'words': {word: count}} for one feature.

			``words`` lists the feature words that occur in the news tokens
			with their occurrence counts; ``score`` is the sum of those counts
			divided by the number of words in the feature. A feature with no
			words scores 0.0.
			'''
			# A feature built from empty or stop-word-only text has no words.
			# Dividing by its zero count used to raise ZeroDivisionError and
			# abort calculateAll() for the whole collection.
			if not featureObj.featureCount:
				return {'score': 0.0, 'words': {}}

			found = {}
			total = 0
			for word in featureObj.feature:
				count = countWord(word, self.tokens)
				if count >= 1:
					found[word] = count
					total += count
			logger.debug("Found words: %s", found)
			return {'score': float(total) / featureObj.featureCount, 'words': found}


	if __name__ == '__main__':
		# Demo driver: load two feature definitions from disk, then score
		# every regular file in ../data/news against them and print the scores.
		featureCollection = FeatureCollection()
		featureFiles = (
			('agriculture', '../data/features/agriculture-1.txt'),
			('tourism', '../data/features/tourism-1.txt'),
		)
		for featureId, featurePath in featureFiles:
			# ``with`` closes the file; the handles used to be left open.
			with open(featurePath, 'r') as featureFile:
				featureCollection.add(featureId, featureId, featureFile.read())

		mypath = "../data/news"
		onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
		for myfile in onlyfiles:
			with open(join(mypath,myfile)) as newsFile:
				content = newsFile.read()
			newsScore = NewsScorer(content,featureCollection)
			# Single pre-formatted argument: same output as ``print a, b`` on
			# Python 2, and the line also parses on Python 3.
			print("%s %s" % (myfile, newsScore.scores))
	# This Python file uses the following encoding: utf-8
	# place this file in the same folder containing news_score.py

	from news_score import *

	# Case 1: an ASCII feature scored against an English sentence.
	featureCollection = FeatureCollection()
	feature = featureCollection.add('agriculture-1', 'agriculture', "Vegetable farmers agriculture")

	newsScore = NewsScorer("Vegetable is good. Farmers should produce more vegetables.")
	result = newsScore.calculateFeatureScore(feature)
	print(result)

	# Case 2: a feature built from a unicode (Devanagari) string, scored
	# against a UTF-8 byte-string sentence.
	featureunicode = featureCollection.add('agriculture-unicode', 'agriculture', "किसान तरकारी".decode("utf8"))

	# Show the words extracted for the unicode feature, one per line.
	# (The loop bodies below had lost their indentation and did not parse.)
	for word in featureunicode.feature:
		print(word.encode('utf-8'))

	newsScore = NewsScorer("किसान मेहेनत गर्छ")
	result = newsScore.calculateFeatureScore(featureunicode)
	print(result)

	# List each matched word with its occurrence count as "word : count".
	for word in result['words']:
		print(" ".join([word.encode('utf-8'), ":", str(result['words'][word])]))