Skip to content

Instantly share code, notes, and snippets.

Created February 1, 2013 03:49
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save tedwards/4689044 to your computer and use it in GitHub Desktop.
Bayesian assessment of Trac tickets using NLTK
#!/usr/bin/env python
import sys
import time
import smtplib
import datetime
import pickle
import random
import nltk
from pysqlite2 import dbapi2 as sqlite
from nltk.tokenize import word_tokenize
class Collector:
    """Load closed Trac tickets from a SQLite database and turn them into
    NLTK feature sets.

    Each fetched row is a tuple laid out as:

        0 id, 1 summary, 2 type, 3 status, 4 service,
        5 created, 6 last_updated, 7 reporter

    NOTE(review): the join condition (``c.ticket = t.id AND
    c.name = 'service'``), the ``reporter`` column and the ``tickets``
    attribute name were reconstructed from the column map above and the
    standard Trac ``ticket`` / ``ticket_custom`` tables -- confirm against
    the real database before relying on them.
    """

    def __init__(self, dbfile):
        """Open *dbfile*, run the ticket query and cache every row.

        dbfile -- path to the Trac SQLite database file.
        """
        # time / changetime are divided by 1,000,000 before being handed to
        # date(..., 'unixepoch'), i.e. they are treated as microseconds.
        self.sql = '''SELECT t.id, t.summary, t.type AS type,
                             t.status AS status, c.value AS service,
                             date(t.time/1000000, 'unixepoch') AS created,
                             date(t.changetime/1000000, 'unixepoch') AS last_updated,
                             t.reporter AS reporter
                      FROM ticket t, ticket_custom c
                      WHERE t.status = 'closed'
                        AND c.ticket = t.id AND c.name = 'service'
                   '''
        self.db = sqlite.connect(dbfile)
        self.cursor = self.db.cursor()
        self.cursor.execute(self.sql)
        self.tickets = self.cursor.fetchall()

    def service_type_features(self):
        """Return a feature set mapping ticket summaries to service types.

        The result is a list of ``(features, label)`` pairs, one per token
        of every ticket summary, where ``features`` is
        ``{'contains-word(<token>)': True}`` and ``label`` is the ticket's
        service value.
        """
        featureset = []
        for ticket in self.tickets:
            summary, service = ticket[1], ticket[4]
            if not summary:
                # A NULL/empty summary has no tokens to contribute.
                continue
            for word in word_tokenize(summary):
                featureset.append(
                    ({'contains-word(%s)' % word: True}, service)
                )
        return featureset
if __name__ == '__main__':
    # Usage: script.py <path-to-trac-sqlite-db>
    #
    # NOTE(review): the visible script used `collector` and `classifier`
    # without defining them. Building the collector from the first
    # command-line argument and training a Naive Bayes classifier on its
    # feature set is a reconstruction -- confirm the intended database path
    # and training procedure.
    if len(sys.argv) != 2:
        sys.exit('usage: %s <trac-sqlite-db>' % sys.argv[0])

    collector = Collector(sys.argv[1])
    features = collector.service_type_features()
    classifier = nltk.NaiveBayesClassifier.train(features)

    # Accuracy is measured on the same feature set used for training, so it
    # is an optimistic estimate rather than a held-out score.
    print(nltk.classify.accuracy(classifier, features))

    # show_most_informative_features() prints its own table and returns
    # None, so it is called directly instead of being passed to print.
    classifier.show_most_informative_features(5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment