Skip to content

Instantly share code, notes, and snippets.

@duarteocarmo
Created December 4, 2017 19:59
Show Gist options
  • Save duarteocarmo/c9cea611577ea2d33b5f2cb0614b70c6 to your computer and use it in GitHub Desktop.
Save duarteocarmo/c9cea611577ea2d33b5f2cb0614b70c6 to your computer and use it in GitHub Desktop.
import numpy as np
from subprocess import run
import re
import os
import time
from sys import platform
import matplotlib.pyplot as plt
import seaborn as sns
from Project_Clean_data import raw, header, standardize_this
from writeapriorifile import *
# Prepare the input for the apriori executable.
# NOTE(review): `raw`, `binarize` and `WriteAprioriFile` come from project
# modules not shown here; presumably `binarize` turns the attributes into 0/1
# indicators and `WriteAprioriFile` writes the transaction file that is later
# passed to apriori -- confirm against those modules.
X = binarize(raw)
np.savetxt("test_file.txt", X)  # plain-text dump of the matrix handed to apriori
WriteAprioriFile(X)
# Choose the apriori executable suffix (`ext`) and the path separator
# (`dir_sep`) for the operating system the script is running on.
if platform.startswith('linux'):
    ext = ''  # Linux
    dir_sep = '/'
elif platform.startswith('darwin'):
    ext = 'MAC'  # OS X
    dir_sep = '/'
elif platform.startswith('win'):
    ext = '.exe'  # Windows
    dir_sep = '\\'
else:
    # Name the offending platform so the failure is actionable.
    raise NotImplementedError(
        'No apriori executable is configured for platform {0!r}'.format(platform))
# Settings forwarded to the external apriori executable.
filename = "AprioriFile.txt"  # transaction file given to apriori
minSup = 30   # passed as -s (minimum support; presumably a percentage -- confirm)
minConf = 60  # passed as -c (minimum confidence); 0 skips rule mining
maxRule = 4   # passed as -n (presumably max items per rule -- confirm)

# Run Apriori Algorithm
# First pass: frequent itemsets, one per line in apriori_temp1.txt, each
# followed by "[Sup. <value>]".
print('Mining for frequent itemsets by the Apriori algorithm')
status1 = run('..{0}Tools{0}apriori{1} -f"," -s{2} -v"[Sup. %S]" {3} apriori_temp1.txt'
              .format(dir_sep, ext, minSup, filename), shell=True)
if status1.returncode != 0:
    print('An error occurred while calling apriori, a likely cause is that minSup was set to high such that no '
          'frequent itemsets were generated or spaces are included in the path to the apriori files.')
    # Exit with a failure status; the `exit()` helper is meant for interactive
    # sessions only and would report success (status 0) to the caller.
    raise SystemExit(1)

# Second pass: association rules, one per line in apriori_temp2.txt, each
# followed by "[Conf. <value>,Sup. <value>]".
if minConf > 0:
    print('Mining for associations by the Apriori algorithm')
    status2 = run('..{0}Tools{0}apriori{1} -tr -f"," -n{2} -c{3} -s{4} -v"[Conf. %C,Sup. %S]" {5} apriori_temp2.txt'
                  .format(dir_sep, ext, maxRule, minConf, minSup, filename), shell=True)
    if status2.returncode != 0:
        print('An error occurred while calling apriori')
        raise SystemExit(1)
print('Apriori analysis done, extracting results')
# Extract information from stored files apriori_temp1.txt and apriori_temp2.txt
# Each non-blank line of apriori_temp1.txt is one frequent itemset followed by
# the "[Sup. <value>]" annotation requested with apriori's -v option.
with open('apriori_temp1.txt', 'r') as f:
    lines = [entry for entry in f if entry.strip()]

# Extract Frequent Itemsets
FrequentItemsets = [''] * len(lines)
sup = np.zeros((len(lines), 1))  # column vector: support of each itemset
for i, line in enumerate(lines):
    FrequentItemsets[i] = line.rstrip('\n')
    # Capture the complete number after "Sup. "; works for integer and decimal
    # values alike (the old pattern + slice dropped a digit in both cases).
    sup_match = re.search(r'Sup\. ([-+]?\d*\.?\d+)', line)
    if sup_match is None:
        raise ValueError(
            'No support value found in apriori output line {0!r}'.format(line))
    sup[i] = float(sup_match.group(1))
os.remove('apriori_temp1.txt')
# Read the file
# Each non-blank line of apriori_temp2.txt is one association rule followed by
# the "[Conf. <value>,Sup. <value>]" annotation requested with apriori's -v option.
# NOTE(review): the file is only produced when minConf > 0; with minConf == 0
# this open() fails -- confirm whether that case needs to be supported.
with open('apriori_temp2.txt', 'r') as f:
    lines = [entry for entry in f if entry.strip()]

# Extract Association rules
AssocRules = [''] * len(lines)
conf = np.zeros((len(lines), 1))  # column vector: confidence of each rule
for i, line in enumerate(lines):
    AssocRules[i] = line.rstrip('\n')
    # Capture the complete number after "Conf. "; works for integer and decimal
    # values alike (the old pattern + slice dropped a digit in both cases).
    conf_match = re.search(r'Conf\. ([-+]?\d*\.?\d+)', line)
    if conf_match is None:
        raise ValueError(
            'No confidence value found in apriori output line {0!r}'.format(line))
    conf[i] = float(conf_match.group(1))
os.remove('apriori_temp2.txt')
# Order both result lists from highest to lowest score: rules by confidence,
# itemsets by support. argsort is ascending, so its index vector is walked
# backwards.
AssocRulesSorted = [AssocRules[k] for k in np.argsort(conf, axis=0).ravel()[::-1]]
FrequentItemsetsSorted = [FrequentItemsets[k] for k in np.argsort(sup, axis=0).ravel()[::-1]]

# Report the results on stdout.
# NOTE(review): the short pause presumably lets the apriori subprocess output
# settle before the summary is printed -- confirm it is still needed.
time.sleep(.5)
print('\n')
print('RESULTS:\n')
print('Frequent itemsets:')
for item in FrequentItemsetsSorted:
    print('Item: {0}'.format(item))
print('\n')
print('Association rules:')
for item in AssocRulesSorted:
    print('Rule: {0}'.format(item))
# Bar chart of the support of every frequent itemset, saved as Bar_items.png.
sns.set()
pos = list(range(len(FrequentItemsetsSorted)))
width = 0.3
ind_list = []  # first item index of each itemset (collected, not plotted)
sup_list = []  # support of each itemset, same order as FrequentItemsetsSorted
# NOTE(review): the tick labels are hard-coded; they must match the number and
# order of the entries in FrequentItemsetsSorted -- confirm against the output.
ticks_list = ['First intercourse','Hormonal Contraceptives','Age','Num pregnacies','Num of partners']
for i in FrequentItemsetsSorted:
    # Parse "<items>[Sup. <value>]" by pattern instead of fixed columns so that
    # multi-digit item indices and supports such as 5, 100 or 12.5 are read
    # correctly (fixed two-character slicing turned 100 into 0).
    itemset_match = re.match(r'\s*(\d+).*\[Sup\. ([-+]?\d*\.?\d+)\]', i)
    if itemset_match is None:
        raise ValueError('Unexpected frequent itemset line {0!r}'.format(i))
    ind = int(itemset_match.group(1))
    ind_list.append(ind)
    sup_list.append(float(itemset_match.group(2)))

fig, ax = plt.subplots(figsize=(10,5))
ax.bar(pos, sup_list, width, alpha=0.5, color='g')
ax.set_ylabel('Support %')
ax.set_title('Frequent itemsets')
ax.set_xticks([p + width/2 for p in pos])
ax.set_xticklabels(ticks_list)
ax.set_xlim(min(pos)-width, max(pos)+width*2)
ax.set_ylim([0, 100])
ax.legend(['Percentage'], loc='upper left')
filename = "Bar_items.png"
fig.savefig(filename, dpi=600)
plt.close(fig)
# Grouped bar chart of support and confidence for every association rule,
# saved as Assoc_rules.png and then shown on screen.
pos = list(range(len(AssocRulesSorted)))
width = 0.2
ind_list_cause = []   # first antecedent item of each rule (collected, not plotted)
ind_list_effect = []  # consequent item of each rule (collected, not plotted)
sup_list2 = []        # support of each rule
conf_list = []        # confidence of each rule
ticks_list = ['First intercourse','Hormonal Contraceptives','Age','Num pregnacies','Num of partners']
# Rules look like "<effect> <- <cause>...[Conf. <value>,Sup. <value>]". Parse
# them by pattern instead of fixed columns so that multi-digit item indices,
# several antecedents and values such as 5, 100 or 12.5 are read correctly.
rule_pattern = re.compile(
    r'\s*(\d+) <- (\d+).*\[Conf\. ([-+]?\d*\.?\d+),Sup\. ([-+]?\d*\.?\d+)\]')
for rule in AssocRulesSorted:
    rule_match = rule_pattern.match(rule)
    if rule_match is None:
        raise ValueError('Unexpected association rule line {0!r}'.format(rule))
    ind1 = int(rule_match.group(1))
    ind2 = int(rule_match.group(2))
    ind_list_cause.append(ind2)
    ind_list_effect.append(ind1)
    sup_list2.append(float(rule_match.group(4)))
    conf_list.append(float(rule_match.group(3)))
# NOTE(review): the rule labels are hard-coded; they must match the number and
# order of the entries in AssocRulesSorted -- confirm against the mined rules.
ticks_list2 = ['First intercourse->Age','Num pregnacies->Age']

fig, ax = plt.subplots(figsize=(8,5))
ax.bar(pos, sup_list2, width, alpha=0.5, color='g')
# Confidence bars sit directly to the right of the support bars.
ax.bar([p + width for p in pos], conf_list, width, alpha=0.5, color='b')
ax.set_ylabel('Percentage')
ax.set_title('Association rules')
ax.set_xticks([p + 1.5 * width for p in pos])
ax.set_xticklabels(ticks_list2)
ax.set_xlim(min(pos)-width, max(pos)+width*4)
ax.set_ylim([0, 100])
ax.legend(['Support', 'Confidence'], loc='upper left')
filename = "Assoc_rules.png"
fig.savefig(filename, dpi=600)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment