Skip to content

Instantly share code, notes, and snippets.

@anjesh
Created August 12, 2014 10:30
Show Gist options
  • Save anjesh/7a39b583e3bf111b6d80 to your computer and use it in GitHub Desktop.
Save anjesh/7a39b583e3bf111b6d80 to your computer and use it in GitHub Desktop.
This scores the given news based on the words defined in the feature.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer, PunktWordTokenizer
import string
from os import listdir
from os.path import isfile, join
import logging
# Module-wide logger: everything from DEBUG upwards is written to a log file
# in the current working directory, one timestamped line per record.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Line layout: timestamp - logger name - level - message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

handler = logging.FileHandler('news-scrapper-debug.log')
handler.setFormatter(formatter)
logger.addHandler(handler)
def extract_words(text):
    '''
    Tokenize ``text`` and return the stemmed, lower-cased tokens used for scoring.

    The text is split with ``PunktWordTokenizer``. A token is dropped when its
    lower-cased form is an English stopword or when it is a single character.
    Every remaining token is lower-cased and reduced to its Porter stem.
    Token order and duplicates are preserved, so callers can count occurrences
    in the returned list.

    Returns a list of stems.
    '''
    # text = text.translate(None, string.punctuation)
    stemmer = PorterStemmer()
    tokenizer = PunktWordTokenizer()
    # Fetch the stopword list once instead of once per token, and keep it in a
    # set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))
    result = []
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        if len(token) > 1 and lowered not in english_stopwords:
            result.append(stemmer.stem(lowered))
    return result
class FeatureCollection:
    '''
    Registry of FeatureClass objects keyed by feature id.

    ``features`` maps each registered id to the FeatureClass built for it.
    '''

    def __init__(self):
        # Per-instance dict. A class-level ``features = {}`` is a single
        # object shared by every FeatureCollection, so independent
        # collections would see (and collide on) each other's ids.
        self.features = {}

    def add(self, id, name, text):
        '''
        Build a FeatureClass from ``text`` and register it under ``id``.

        Returns the new FeatureClass object.
        Raises Exception when ``id`` is already registered.
        '''
        logger.info("Adding feature id:%s name:%s to collection" % (id, name))
        if id in self.features:
            logger.critical("Duplication id:%s already exists in FeatureCollection" % (id,))
            # %-formatting instead of string concatenation so that a
            # non-string id reports the duplication rather than a TypeError.
            raise Exception('Duplication: %s already exists in FeatureCollection' % (id,))
        featureObj = FeatureClass(id, name, text)
        self.features[id] = featureObj
        return featureObj

    def getFeatureObj(self, id):
        '''Return the feature registered under ``id``; KeyError if absent.'''
        return self.features[id]
class FeatureClass:
    '''
    A single named feature: its id, display name, source text and the
    distinct word stems extracted from that text.
    '''
    # Class-level defaults; __init__ replaces every one of them per instance.
    id = 0
    name = ""
    text = ""
    feature = {}

    def __init__(self, id, name, text):
        '''
        Store the identifying fields and derive the feature vocabulary.

        ``feature`` becomes a list of the distinct stems that extract_words()
        yields for ``text`` (order unspecified, since it passes through a set);
        ``featureCount`` is the length of that list.
        '''
        self.id, self.name, self.text = id, name, text
        distinct_stems = set(extract_words(text))
        self.feature = list(distinct_stems)
        self.featureCount = len(self.feature)
def countWord(word, tokens):
    '''
    Return how many times ``word`` occurs in the list ``tokens`` (0 if absent).

    Under Python 2 a ``unicode`` word is encoded to UTF-8 bytes before it is
    counted. NOTE(review): this presumably assumes ``tokens`` holds byte
    strings - confirm against the callers.
    '''
    # ``str is bytes`` holds only on Python 2; the short-circuit keeps the
    # Python-2-only name ``unicode`` from being evaluated anywhere else.
    if str is bytes and isinstance(word, unicode):  # noqa: F821
        word = word.encode('utf-8')
    # list.count() already returns 0 for a missing item, so a separate
    # membership test would only scan the list a second time.
    return tokens.count(word)
class NewsScorer:
    '''
    Scores one news text against features.

    The text is tokenized once with extract_words(). When a feature
    collection is supplied, every feature in it is scored immediately and the
    results are stored in ``scores`` (feature id -> score).
    '''

    def __init__(self, content, featureCollection = ""):
        '''
        ``content`` is the raw news text. ``featureCollection`` is optional;
        any falsy value (the default is an empty string) skips the automatic
        scoring and leaves ``scores`` empty.
        '''
        self.content = content
        self.tokens = extract_words(content)
        self.featureCollection = featureCollection
        self.scores = {}
        if featureCollection:
            self.calculateAll()

    def calculateAll(self):
        '''Score the news against every feature in the collection and fill ``scores``.'''
        for featureId, featureObj in self.featureCollection.features.items():
            logger.info("Calculating score for news for feature id:%s name:%s", featureId, featureObj.name)
            self.scores[featureId] = self.calculateFeatureScore(featureObj)['score']

    def calculateFeatureScore(self, featureObj):
        '''
        Score the news tokens against a single feature.

        Returns ``{'score': s, 'words': found}`` where ``found`` maps each
        feature word present in the news to its occurrence count, and ``s`` is
        the sum of those counts divided by the feature's word count.
        A feature with no words scores 0.0.
        '''
        score = 0
        found = {}
        for word in featureObj.feature:
            count = countWord(word, self.tokens)
            if count >= 1:
                found[word] = count
                score = score + count
        logger.debug("Found words: %s", found)
        if featureObj.featureCount:
            score = 1.0 * score / featureObj.featureCount
        else:
            # An empty feature (blank or all-stopword text) has nothing to
            # match; report 0.0 rather than raising ZeroDivisionError.
            score = 0.0
        return {'score': score, 'words': found}
if __name__ == '__main__':
    # Demo driver: build two features from text files, then print the score
    # of every file found directly under ../data/news.
    featureCollection = FeatureCollection()
    # ``with`` closes each file as soon as it has been read; a bare
    # open(...).read() leaves the handle open until it is garbage collected.
    with open('../data/features/agriculture-1.txt', 'r') as featureFile:
        featureCollection.add('agriculture', 'agriculture', featureFile.read())
    with open('../data/features/tourism-1.txt', 'r') as featureFile:
        featureCollection.add('tourism', 'tourism', featureFile.read())
    # newsScore = NewsScorer(open('../data/news/agri2.txt').read(),featureCollection)
    # print newsScore.scores
    mypath = "../data/news"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for myfile in onlyfiles:
        with open(join(mypath, myfile)) as newsFile:
            content = newsFile.read()
        newsScore = NewsScorer(content, featureCollection)
        # One formatted string prints the same "<file> <scores>" line as
        # ``print myfile, newsScore.scores`` and also parses on Python 3.
        print("%s %s" % (myfile, newsScore.scores))
# This Python file uses the following encoding: utf-8
# place this file in the same folder containing news_score.py
from news_score import *
featureCollection = FeatureCollection()
feature = featureCollection.add('agriculture-1', 'agriculture', "Vegetable farmers agriculture")
newsScore = NewsScorer("Vegetable is good. Farmers should produce more vegetables.")
result = newsScore.calculateFeatureScore(feature)
print result
featureunicode = featureCollection.add('agriculture-unicode', 'agriculture', "किसान तरकारी".decode("utf8"))
for word in featureunicode.feature:
print word.encode('utf-8')
newsScore = NewsScorer("किसान मेहेनत गर्छ")
result = newsScore.calculateFeatureScore(featureunicode)
print result
for word in result['words']:
print word.encode('utf-8'), ":", result['words'][word]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment