Last active
August 29, 2015 14:11
-
-
Save MattJermyWright/5f32221b2e53ffec5cfc to your computer and use it in GitHub Desktop.
Bayesian Classifier - classify mobile scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import fileinput | |
import re | |
import math | |
import pprint | |
import sqlite3 | |
# Compiled once at import time.  The pattern must be ``\W+`` rather than
# ``\W*``: a pattern that can match the empty string makes ``re.split`` cut
# between every character on Python 3.7+, which leaves no token longer than
# one character.
_WORD_SPLITTER = re.compile(r"\W+")


def getWords(str):
    """Return the lower-cased words of *str* that are 3 to 39 characters long.

    The text is split on runs of non-word characters (anything outside
    ``[A-Za-z0-9_]`` and, on Python 3, other Unicode word characters).
    Tokens of length <= 2 or >= 40 are discarded.

    The parameter name shadows the ``str`` builtin; it is kept unchanged so
    that existing keyword callers continue to work.
    """
    return [
        token.lower()
        for token in _WORD_SPLITTER.split(str)
        if 2 < len(token) < 40
    ]
class classifier:
    """SQLite-backed store of feature and category counts.

    Two tables are kept per classifier, both prefixed with ``name``:

    * ``<name>_features``   -- one row per feature, plus one integer column
      per category holding how often that feature was counted for it.
    * ``<name>_categories`` -- one row per category with a running total in
      ``freq``.

    Database errors in the counting methods are reported with ``print`` and
    turned into neutral results (``0.0`` / ``[]``) instead of being raised.
    """

    def __init__(self, name, getfeatures, filename=None, dbName="bayesianFilter.db"):
        """Open (or create) the database and make sure the tables exist.

        name        -- prefix for this classifier's table names.
        getfeatures -- callable stored on the instance as ``getfeatures``.
        filename    -- accepted for backward compatibility; not used here.
        dbName      -- path handed to ``sqlite3.connect`` (``":memory:"`` works).
        """
        # The name of the classifier
        self.name = name
        # In-memory counters kept for backward compatibility; the SQL-backed
        # methods of this class do not read or write them.
        self.fc = {}
        self.cc = {}
        self.getfeatures = getfeatures
        # Initialize the SQL database
        self.conn = sqlite3.connect(dbName)
        # Create tables if they don't already exist
        self.checkTables(True)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _quoteIdentifier(identifier):
        """Return *identifier* as a double-quoted SQL identifier.

        Table and column names cannot be bound as ``?`` parameters, so they
        are quoted (embedded quotes doubled) before being placed in SQL text.
        """
        return '"' + identifier.replace('"', '""') + '"'

    def _tableName(self, suffix):
        """Return the quoted name of this classifier's ``<name>_<suffix>`` table."""
        return self._quoteIdentifier(self.name + "_" + suffix)

    def _tableExists(self, quotedTable):
        """Return True when a ``select`` against *quotedTable* succeeds."""
        try:
            self.conn.execute("select count(*) from {0}".format(quotedTable))
        except sqlite3.Error:
            return False
        return True

    def _categoryExists(self, categoryName):
        """Return True when *categoryName* has a row in the categories table."""
        sql = "select count(*) from {0} where category = ?".format(
            self._tableName("categories"))
        try:
            row = self.conn.execute(sql, (categoryName,)).fetchone()
        except sqlite3.Error:
            return False
        return int(row[0]) > 0

    def _fetchTotal(self, sql, parameters, errorMessage):
        """Run a single-value ``sum`` query and return the result as a float.

        Prints *errorMessage* and returns ``0.0`` on a database error.
        """
        try:
            row = self.conn.execute(sql, parameters).fetchone()
        except sqlite3.Error:
            print(errorMessage)
            return 0.0
        # sum() yields NULL (None) when no row matches.
        return float(row[0] or 0)

    # ------------------------------------------------------------------
    # Schema management
    # ------------------------------------------------------------------
    def sqlExecuteAndIgnoreErrors(self, sql):
        """Execute and commit *sql*, silently ignoring any ``sqlite3.Error``."""
        try:
            self.conn.execute(sql)
            self.conn.commit()
        except sqlite3.Error:
            pass

    def checkTables(self, createTablesFlag):
        """Report whether both tables exist, creating them when asked to.

        createTablesFlag -- when true, any missing table is created.

        Existing tables (and the counts in them) are left untouched.
        Returns True when both tables already existed before the call.
        """
        features = self._tableName("features")
        categories = self._tableName("categories")
        tablesExistFlag = all(
            self._tableExists(table) for table in (features, categories))
        if createTablesFlag and not tablesExistFlag:
            self.sqlExecuteAndIgnoreErrors(
                "create table if not exists {0} (feature varchar(50))".format(features))
            self.sqlExecuteAndIgnoreErrors(
                "create table if not exists {0} "
                "(category varchar(50), freq bigint default 0)".format(categories))
        return tablesExistFlag

    def addCategoryIfNotExists(self, categoryName):
        """Register *categoryName* and add its count column if it is new.

        Prints an error (and leaves the database unchanged) when the
        category cannot be added, e.g. because its name clashes with an
        existing column.
        """
        if self._categoryExists(categoryName):
            return
        try:
            # ``with`` commits on success and rolls back on error.  The
            # insert comes first so that it opens the transaction the
            # ``alter table`` then takes part in.
            with self.conn:
                self.conn.execute(
                    "insert into {0} (category) values (?)".format(
                        self._tableName("categories")),
                    (categoryName,))
                self.conn.execute(
                    "alter table {0} add column {1} bigint DEFAULT 0".format(
                        self._tableName("features"),
                        self._quoteIdentifier(categoryName)))
        except sqlite3.Error:
            print("ERROR: Issue with inserting category")

    # ------------------------------------------------------------------
    # Counting
    # ------------------------------------------------------------------
    def incrementFeature(self, feature, category, amount):
        """Add *amount* to the count of *feature* in *category*.

        The category and the feature row are created on first use.  The
        category's ``freq`` total is increased by the same *amount*.
        """
        self.addCategoryIfNotExists(category)
        if not self._categoryExists(category):
            # The category could not be registered (already reported above);
            # without its column there is nothing to update.
            return
        features = self._tableName("features")
        column = self._quoteIdentifier(category)
        try:
            with self.conn:
                row = self.conn.execute(
                    "select count(*) from {0} where feature = ?".format(features),
                    (feature,)).fetchone()
                if not row[0]:
                    self.conn.execute(
                        "insert into {0} (feature) values (?)".format(features),
                        (feature,))
                self.conn.execute(
                    "update {0} set {1} = coalesce({1}, 0) + ? "
                    "where feature = ?".format(features, column),
                    (amount, feature))
                self.conn.execute(
                    "update {0} set freq = coalesce(freq, 0) + ? "
                    "where category = ?".format(self._tableName("categories")),
                    (amount, category))
        except sqlite3.Error:
            print("ERROR: Issue with inserting feature")

    def fcount(self, feature, category):
        """Return the count of *feature* in *category* as a float.

        Unknown features and unknown categories both yield ``0.0``.
        """
        # Checked explicitly because a category that was never added has no
        # column in the features table to sum over.
        if not self._categoryExists(category):
            return 0.0
        sql = "select sum({0}) from {1} where feature = ?".format(
            self._quoteIdentifier(category), self._tableName("features"))
        return self._fetchTotal(sql, (feature,), "ERROR: fcount")

    def catcount(self, category):
        """Return the ``freq`` total of *category* as a float (0.0 if unknown)."""
        sql = "select sum(freq) from {0} where category = ?".format(
            self._tableName("categories"))
        return self._fetchTotal(sql, (category,), "ERROR: catcount")

    def totalcount(self):
        """Return the sum of ``freq`` over all categories as a float."""
        sql = "select sum(freq) from {0}".format(self._tableName("categories"))
        return self._fetchTotal(sql, (), "ERROR: totalcount")

    def categories(self):
        """Return the list of known category names (empty on a database error)."""
        sql = "select category from {0}".format(self._tableName("categories"))
        try:
            return [row[0] for row in self.conn.execute(sql)]
        except sqlite3.Error:
            print("ERROR: categories")
            return []
# def fcount(self,f,cat): | |
# if f in self.fc and cat in self.fc[f]: | |
# return float(self.fc[f][cat]) | |
# return 0.0 | |
#def catcount(self,cat): | |
#def totalcount(self): | |
#def categories(self): | |
def main():
    """Pretty-print the word list extracted from every input line.

    Lines are read with ``fileinput.input()``: from the files named on the
    command line, or from standard input when no file is given.
    """
    printer = pprint.PrettyPrinter(indent=4)
    for line in fileinput.input():
        printer.pprint(getWords(line))


# Run only when executed as a script, so that importing this module (for
# example to reuse the classifier) does not start reading input.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment