Created
February 1, 2013 03:49
-
-
Save tedwards/4689044 to your computer and use it in GitHub Desktop.
Bayesian assessment of Trac tickets using NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import time | |
import smtplib | |
import datetime | |
import pickle | |
import random | |
import nltk | |
from pysqlite2 import dbapi2 as sqlite | |
from nltk.tokenize import word_tokenize | |
# Module-level settings: a test-mode flag and the timestamp taken when the
# module is loaded.
TEST = True
today = datetime.datetime.now()
class Collector:
    '''Load closed tickets from a SQLite database and expose them as
    labelled feature sets for an NLTK classifier.

    Attributes:
        sql: the SELECT statement used to fetch the tickets.
        db: the open SQLite connection.
        cursor: the cursor the query was executed on.
        tickets: every row returned by the query, as a list of tuples.
    '''

    def __init__(self, dbfile):
        '''Connect to dbfile, run the ticket query and cache all rows.

        dbfile is the path handed to sqlite.connect().
        '''
        # Joins each closed ticket with its 'service' custom field. The two
        # timestamp columns are divided by 1,000,000 before being read as
        # Unix-epoch seconds and rendered as dates.
        self.sql = '''SELECT id , summary, t.type AS type,
            t.status as status, c.value as service,
            date(time/1000000, 'unixepoch') AS created,
            date(t.changetime/1000000, 'unixepoch') as last_updated ,
            reporter
            FROM ticket t, ticket_custom c
            WHERE status='closed'
            AND c.ticket = t.id AND c.name = 'service'
            '''
        # Column positions in each row of self.tickets:
        #   0 id  1 summary  2 type  3 status  4 service
        #   5 created  6 last_updated  7 reporter
        self.db = sqlite.connect(dbfile)
        self.cursor = self.db.cursor()
        self.cursor.execute(self.sql)
        self.tickets = self.cursor.fetchall()

    def service_type_features(self):
        '''Return a feature set mapping ticket summary words to service types.

        The result is a list of (features, label) pairs. Every token of a
        ticket's summary produces its own pair: features is the one-entry
        dict {'contains-word(<token>)': True} and label is the ticket's
        service value. Tickets whose summary is NULL or empty contribute
        nothing.
        '''
        featureset = []
        for ticket in self.tickets:
            summary = ticket[1]
            service = ticket[4]
            # A NULL summary arrives as None, which the tokenizer cannot
            # process; skip it instead of aborting the whole extraction.
            if not summary:
                continue
            for word in word_tokenize(summary):
                featureset.append(({'contains-word(%s)' % word: True}, service))
        return featureset
if __name__ == '__main__':
    # Train a Naive Bayes classifier on the closed tickets of the database
    # named on the command line, then report its accuracy and its most
    # informative features.
    #
    # The database path is read from argv; the script previously referenced
    # an undefined name and could not start.
    if len(sys.argv) != 2:
        sys.exit('usage: %s <path-to-sqlite-db>' % sys.argv[0])

    collector = Collector(sys.argv[1])

    # Build the feature set once and reuse it for training and scoring.
    features = collector.service_type_features()
    if not features:
        sys.exit('no usable closed tickets found in %s' % sys.argv[1])

    classifier = nltk.NaiveBayesClassifier.train(features)

    # NOTE: accuracy is measured on the same data the classifier was trained
    # on, so the figure is optimistic; hold out a test split for a real
    # estimate.
    print(nltk.classify.accuracy(classifier, features))

    # show_most_informative_features() prints its own table and returns
    # None, so it is called directly rather than printed.
    classifier.show_most_informative_features(5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment