Skip to content

Instantly share code, notes, and snippets.

@MattJermyWright
Last active August 29, 2015 14:11
Show Gist options
  • Save MattJermyWright/5f32221b2e53ffec5cfc to your computer and use it in GitHub Desktop.
Save MattJermyWright/5f32221b2e53ffec5cfc to your computer and use it in GitHub Desktop.
Bayesian Classifier - classify mobile scripts
#!/usr/bin/env python
import fileinput
import re
import math
import pprint
import sqlite3
def getWords(str):
    """Split text into lowercase word tokens.

    The text is split on runs of non-word characters, and a token is kept
    only when its length is strictly between 2 and 40 characters.

    Args:
        str: The text to tokenize. The name shadows the builtin, but it is
            kept so that existing keyword callers continue to work.

    Returns:
        A list of lowercased tokens in their order of appearance.
    """
    # Use \W+ rather than \W*: a pattern that can match the empty string makes
    # re.split() break between every single character on Python 3.7+, which
    # would leave only one-character pieces that the length filter discards.
    splitter = re.compile(r"\W+")
    return [s.lower() for s in splitter.split(str) if 2 < len(s) < 40]
class classifier:
    """SQLite-backed feature/category counters for a Bayesian classifier.

    Two tables are kept per classifier, both prefixed with the classifier
    name:

    * ``<name>_features``: one row per feature, plus one integer column per
      category holding how often the feature was counted for that category.
    * ``<name>_categories``: one row per category with a running total
      (``freq``) of everything counted for that category.
    """

    def __init__(self, name, getfeatures, filename=None, dbName="bayesianFilter.db"):
        """Open the database and set up the counter tables.

        Args:
            name: The name of the classifier; used as the table-name prefix.
            getfeatures: Callable stored on the instance as ``getfeatures``.
            filename: Currently unused; kept for interface compatibility.
            dbName: Path of the SQLite database (``":memory:"`` also works).
        """
        # The name of the classifier
        self.name = name
        # In-memory counters; not used by the SQLite-backed methods below but
        # kept as attributes for backward compatibility.
        self.fc = {}
        self.cc = {}
        self.getfeatures = getfeatures
        # Initialize the SQL database
        self.conn = sqlite3.connect(dbName)
        # NOTE(review): this drops and recreates the tables, so previously
        # stored counts are reset on every construction -- confirm intended.
        self.checkTables(True)

    @staticmethod
    def _quoteIdentifier(identifier):
        """Return ``identifier`` quoted for safe use as an SQL identifier."""
        return '"' + identifier.replace('"', '""') + '"'

    def _tableName(self, suffix):
        """Return the quoted name of this classifier's ``suffix`` table."""
        return self._quoteIdentifier(self.name + "_" + suffix)

    def _fetchScalar(self, sql, parameters=()):
        """Run a query and return the first column of its first row.

        Returns ``None`` when the query yields no rows. The cursor is always
        closed so the statement cannot keep a table locked.
        """
        cursor = self.conn.execute(sql, parameters)
        try:
            row = cursor.fetchone()
        finally:
            cursor.close()
        return None if row is None else row[0]

    def _categoryExists(self, categoryName):
        """Return True when ``categoryName`` has a row in the categories table."""
        try:
            found = self._fetchScalar(
                "select count(*) from " + self._tableName("categories")
                + " where category = ?",
                (categoryName,),
            )
        except sqlite3.Error:
            return False
        return bool(found)

    def sqlExecuteAndIgnoreErrors(self, sql):
        """Execute and commit ``sql``, deliberately swallowing SQLite errors."""
        try:
            self.conn.execute(sql)
            self.conn.commit()
        except sqlite3.Error:
            # Best-effort by design (see the method name).
            pass

    def checkTables(self, createTablesFlag):
        """Check whether the tables exist and optionally (re)create them.

        Args:
            createTablesFlag: When true, any existing tables are dropped and
                fresh, empty ones are created.

        Returns:
            True if the features table existed when the method was called.
        """
        try:
            self._fetchScalar("select count(*) from " + self._tableName("features"))
            tablesExistFlag = True
        except sqlite3.Error:
            tablesExistFlag = False
        if createTablesFlag:
            # Drop both tables unconditionally so a half-created pair cannot
            # leave a stale table behind.
            for suffix in ("features", "categories"):
                self.sqlExecuteAndIgnoreErrors(
                    "drop table if exists " + self._tableName(suffix))
            self.sqlExecuteAndIgnoreErrors(
                "create table " + self._tableName("features")
                + " (feature varchar(50))")
            self.sqlExecuteAndIgnoreErrors(
                "create table " + self._tableName("categories")
                + " (category varchar(50), freq bigint default 0)")
        return tablesExistFlag

    def addCategoryIfNotExists(self, categoryName):
        """Register a category, adding its counter column to the features table.

        Args:
            categoryName: The category to register.

        Returns:
            True if the category exists after the call, False if it could not
            be created (an error message is printed in that case, e.g. when
            the name collides with an existing column).
        """
        if self._categoryExists(categoryName):
            return True
        try:
            self.conn.execute(
                "alter table " + self._tableName("features") + " add column "
                + self._quoteIdentifier(categoryName) + " bigint DEFAULT 0")
            self.conn.execute(
                "insert into " + self._tableName("categories")
                + " (category) values (?)",
                (categoryName,))
            self.conn.commit()
        except sqlite3.Error:
            print("ERROR: Issue with inserting category")
            return False
        return True

    def incrementFeature(self, feature, category, amount):
        """Add ``amount`` to the count of ``feature`` within ``category``.

        The category is registered on first use, the feature row is created
        on first use, and the category's running total is increased by the
        same amount.

        Args:
            feature: The feature whose count is increased.
            category: The category the feature was observed in.
            amount: How much to add to the counters.
        """
        if not self.addCategoryIfNotExists(category):
            return
        features = self._tableName("features")
        column = self._quoteIdentifier(category)
        try:
            cursor = self.conn.execute(
                "update " + features + " set " + column + " = " + column
                + " + ? where feature = ?",
                (amount, feature))
            if cursor.rowcount == 0:
                # First time this feature is seen: create its row.
                self.conn.execute(
                    "insert into " + features + " (feature, " + column
                    + ") values (?, ?)",
                    (feature, amount))
            # Increment (rather than overwrite) the per-category total.
            self.conn.execute(
                "update " + self._tableName("categories")
                + " set freq = freq + ? where category = ?",
                (amount, category))
            self.conn.commit()
        except sqlite3.Error:
            self.conn.rollback()
            print("ERROR: Issue with inserting feature")

    def fcount(self, feature, category):
        """Return the count of ``feature`` in ``category`` as a float.

        Unknown features or categories yield ``0.0``.
        """
        count = 0
        # Only query columns of registered categories: SQLite may treat an
        # unknown double-quoted name as a string literal instead of failing.
        if self._categoryExists(category):
            try:
                count = self._fetchScalar(
                    "select sum(" + self._quoteIdentifier(category) + ") from "
                    + self._tableName("features") + " where feature = ?",
                    (feature,)) or 0
            except sqlite3.Error:
                print("ERROR: fcount")
        return float(count)

    def catcount(self, category):
        """Return the running total for ``category`` as a float (0.0 if unknown)."""
        count = 0
        try:
            # SUM() is NULL when no row matches, hence the "or 0".
            count = self._fetchScalar(
                "select sum(freq) from " + self._tableName("categories")
                + " where category = ?",
                (category,)) or 0
        except sqlite3.Error:
            print("ERROR: catcount")
        return float(count)

    def totalcount(self):
        """Return the sum of all category totals as a float (0.0 when empty)."""
        count = 0
        try:
            count = self._fetchScalar(
                "select sum(freq) from " + self._tableName("categories")) or 0
        except sqlite3.Error:
            print("ERROR: totalcount")
        return float(count)

    def categories(self):
        """Return the list of registered category names."""
        try:
            cursor = self.conn.execute(
                "select category from " + self._tableName("categories"))
            return [row[0] for row in cursor]
        except sqlite3.Error:
            print("ERROR: categories")
            return []
def main():
    """Tokenize every input line and pretty-print the resulting word list.

    Lines are read through ``fileinput.input()``, i.e. from the files named
    on the command line, or from standard input when none are given.
    """
    pp = pprint.PrettyPrinter(indent=4)
    for line in fileinput.input():
        pp.pprint(getWords(line))


# Guard the entry point so importing this module does not block on input.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment