Last active
September 23, 2022 18:55
-
-
Save sailist/103dc751f35d1a581757a750719c2e57 to your computer and use it in GitHub Desktop.
BUC algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import random | |
# Value domains of the four dimensions used to generate random rows.
A = [f"a{n}" for n in range(1, 5)]
B = [f"b{n}" for n in range(1, 5)]
C = [f"c{n}" for n in range(1, 5)]
D = [f"d{n}" for n in range(1, 3)]
# Dimension domains in column order.
ALL = [A, B, C, D]
def sample(size=6):
    """Return a random fact table with ``size`` rows as a DataFrame.

    Each row holds one randomly chosen value per dimension in ``ALL``
    followed by a trailing ``1`` (the count column). Rows are sorted
    before the frame is built.
    """
    rows = sorted(
        [random.choice(domain) for domain in ALL] + [1]
        for _ in range(size)
    )
    return pd.DataFrame(rows)
def sample_fixed():
    """Return the fixed eight-row example table as a DataFrame.

    Columns 0-3 hold the dimension values; column 4 is a count column
    that is ``1`` for every row.
    """
    cells = (
        ("a1", "b1", "c1", "d1"),
        ("a1", "b1", "c1", "d2"),
        ("a1", "b1", "c2", "d1"),
        ("a1", "b1", "c2", "d2"),
        ("a2", "b1", "c1", "d2"),
        ("a2", "b2", "c2", "d2"),
        ("a3", "b3", "c1", "d1"),
        ("a4", "b4", "c1", "d2"),
    )
    # Append the per-row count of 1 to every dimension tuple.
    return pd.DataFrame([[*cell, 1] for cell in cells])
# Output of BUC: maps a comma-joined cell key (e.g. "a1,b1") to its row count.
candidate = {}
# Minimum number of rows a partition needs to be kept and expanded further.
min_sup = 2


def BUC(data, prefix=None):
    """Recursively record every group-by cell with at least ``min_sup`` rows.

    The last column of ``data`` is treated as the count (measure) column and
    is never partitioned on; every other column is a dimension. For each
    dimension value whose partition reaches ``min_sup``, the partition is
    first expanded recursively on the dimensions to its right, then the
    cell itself is stored in the module-level ``candidate`` dict.

    Args:
        data: DataFrame whose trailing column is the count column.
        prefix: list of already-fixed dimension values (as strings) leading
            to this partition; ``None`` for the top-level call.

    Returns:
        None. Results are written to ``candidate``.
    """
    # Nothing to partition on when only the count column (or nothing) is left.
    if len(data.columns) <= 1:
        return
    if prefix is None:
        prefix = []

    # Derived locally so the function does not depend on a global column label.
    measure = data.columns[-1]
    for dim in list(data.columns[:-1]):
        # Rows per value of this dimension (non-null entries of the measure).
        support = data.groupby(dim)[measure].count()
        # Prune partitions below the threshold before recursing.
        frequent = support[support >= min_sup]
        # Keyword form: positional ``axis`` was removed from drop() in pandas 2.0.
        remaining = data.drop(columns=dim)
        for value, rows in frequent.items():
            cell = prefix + [str(value)]
            # Expand the partition on the remaining dimensions first ...
            BUC(remaining.loc[data[dim] == value], cell)
            # ... then emit the cell itself.
            candidate[",".join(cell)] = int(rows)
        # Later dimensions must not pair with this one again, so drop it.
        data = remaining
# Run BUC over the fixed example table and print each recorded cell.
data = sample_fixed()
fco = data.columns[-1]  # label of the trailing count column
result = BUC(data)  # BUC stores its output in `candidate`
for entry in candidate.items():
    print(*entry)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment