Skip to content

Instantly share code, notes, and snippets.

@sailist
Last active September 23, 2022 18:55
Show Gist options
  • Save sailist/103dc751f35d1a581757a750719c2e57 to your computer and use it in GitHub Desktop.
Save sailist/103dc751f35d1a581757a750719c2e57 to your computer and use it in GitHub Desktop.
BUC algorithm
import pandas as pd
import random
# Value domains of the four dimensions; D has only two values.
A = [f"a{n}" for n in range(1, 5)]
B = [f"b{n}" for n in range(1, 5)]
C = [f"c{n}" for n in range(1, 5)]
D = [f"d{n}" for n in range(1, 3)]
ALL = [A, B, C, D]


def sample(size=6):
    """Build a random demo table.

    Each of the ``size`` rows holds one randomly chosen value per dimension
    in ``ALL`` (in that order) followed by a constant ``1`` in the last
    position. Rows are sorted before being wrapped in a DataFrame, so the
    frame has default integer column labels.
    """
    rows = sorted(
        [random.choice(domain) for domain in ALL] + [1]
        for _ in range(size)
    )
    return pd.DataFrame(rows)
def sample_fixed():
    """Return the fixed eight-row demo table as a DataFrame.

    Every row consists of four dimension values followed by a constant ``1``
    in the last position; the frame has default integer column labels.
    """
    dimension_rows = (
        ("a1", "b1", "c1", "d1"),
        ("a1", "b1", "c1", "d2"),
        ("a1", "b1", "c2", "d1"),
        ("a1", "b1", "c2", "d2"),
        ("a2", "b1", "c1", "d2"),
        ("a2", "b2", "c2", "d2"),
        ("a3", "b3", "c1", "d1"),
        ("a4", "b4", "c1", "d2"),
    )
    # Append the trailing 1 to each row while building the table.
    return pd.DataFrame([[*dims, 1] for dims in dimension_rows])
# Cells found by BUC: maps the comma-joined dimension values of a cell (in the
# order they were fixed) to the row count recorded for that cell.
candidate = {}
# Minimum support: a partition is kept only if its count is >= this value.
min_sup = 2


def BUC(data, prefix=None):
    """Bottom-up cube computation over ``data``, pruned by ``min_sup``.

    ``data`` is a DataFrame whose last column is the count column and whose
    other columns are dimensions. For each dimension, the rows are grouped by
    value; groups whose counts reach ``min_sup`` are recorded in the
    module-level ``candidate`` dict and recursed into with that dimension
    removed. After a dimension has been processed it is dropped, so later
    dimensions are never combined with it a second time.

    Args:
        data: DataFrame of dimension columns followed by one count column.
        prefix: List of stringified dimension values already fixed by the
            enclosing calls; ``None`` (the default) means an empty prefix.

    Returns:
        None. Results are written into ``candidate``.
    """
    # Only the count column (or nothing at all) is left: nothing to partition.
    if len(data.columns) <= 1:
        return
    if prefix is None:
        prefix = []

    # The count column is always the last one; only dimension columns are ever
    # dropped, so this holds on every level of the recursion.
    count_col = data.columns[-1]

    for dim in data.columns[:-1]:
        # Per-value counts of every remaining column for this dimension.
        counts = data.groupby([dim]).count()
        # Cells below min_sup become NaN and dropna() discards those values.
        counts = counts[counts >= min_sup].dropna()
        for value in counts.index:
            cell = prefix + [str(value)]
            # Rows of this partition, without the dimension that is now fixed.
            subdata = data[data[dim].isin([value])].drop(columns=dim)
            # Recurse into the lower-dimensional sub-cube first.
            BUC(subdata, cell)
            # Record the cell; int() because the masking above can turn the
            # counts into floats.
            candidate[",".join(cell)] = int(counts.loc[value, count_col])
        # This dimension is done; exclude it from the remaining iterations.
        data = data.drop(columns=dim)
# Build the fixed demo table.
data = sample_fixed()
# Label of the trailing count column.
fco = data.columns[-1]
# BUC has no return value; the cells it finds are collected in `candidate`.
result = BUC(data)
# Print every recorded cell together with its count.
for cell, support in candidate.items():
    print(cell, support)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment