Last active
February 22, 2017 08:29
-
-
Save you-n-g/5425748f7715859e701cdb6ece5648fd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding:utf8 | |
import numpy as np | |
def calc_complexity(leaf, f, N, n_trees, dim_size, block, search_k=None, vector_complexity=None, benchmark_process=1):
    '''Estimate build/query time and memory complexity of a blocked
    Annoy-style ANN index, and print a comparison against the original
    (un-blocked) layout.

    Parameters
    ----------
    leaf : int
        Number of data points (vectors) in the dataset.
    f : int
        Vector dimensionality.
    N : int
        Number of nearest neighbours to retrieve.
    n_trees : int
        Trees built per index shard; typically 100 ~ 400.
    dim_size : int
        Bytes used to store one dimension.
    block : int
        Number of index shards the data is split into.
    search_k : int, optional
        Total number of candidate points; defaults to ``n_trees * N``.
    vector_complexity : int, optional
        Basic-operation cost of one f-dimensional vector computation;
        defaults to ``f``.
    benchmark_process : int, optional
        Number of processes ann-benchmark runs with.

    Prints a formatted report to stdout; returns None.

    Integer divisions below intentionally use ``//`` to reproduce the
    original Python-2 int/int flooring behaviour under Python 3.
    '''
    if search_k is None:
        search_k = n_trees * N  # total candidate points
    if vector_complexity is None:
        vector_complexity = f  # cost of one f-dimensional vector operation
    # Per-node size: vector payload + (apparently) two 4-byte child links
    # and two 4-byte metadata fields -- TODO confirm against the actual
    # node layout.
    node_size = f * dim_size + 4 * 2 + 4 + 4
    total_size = leaf * node_size  # total raw data size
    _K = f // 2  # average number of samples in a last-level leaf region
    total_size_p_block = total_size // block  # size of each data shard
    leaf_p_block = leaf // block  # vectors per shard after splitting
    non_leaf_p_block = leaf_p_block // _K * 2 - 1  # non-leaf node count per shard's tree
    # TODO: this part is too large; n_trees needs to be tuned down a bit.
    non_leaf_size_p_block = node_size * non_leaf_p_block * n_trees  # total index-tree size per shard
    search_k_p_block = search_k // block  # candidate points each shard must supply
    q_len_p_block = int(n_trees * np.log2(non_leaf_p_block))  # order of magnitude of each shard's priority-queue length
    construct_time_p_block = int(leaf_p_block * np.log2(leaf_p_block) * vector_complexity * n_trees)  # basic ops to build one shard's index
    # TODO: the *vector_complexity factor is surely overestimated here,
    # since vector products would normally be optimised by library code.
    candi_query_time_p_block = int(1. * search_k_p_block / _K * np.log2(q_len_p_block) * vector_complexity + search_k_p_block + search_k_p_block * vector_complexity)  # basic ops for one shard to find search_k points
    candi_p_block = min(search_k_p_block, N)  # candidates contributed by each shard
    total_candi = candi_p_block * block  # total candidate count
    combine_time = int(total_candi * np.log2(total_candi))  # basic ops to merge all shards' candidates
    total_query_time = candi_query_time_p_block + combine_time  # total op count per query
    mem_p_block = total_size_p_block + non_leaf_size_p_block  # memory footprint per shard
    # --- complexity of the original (un-blocked) layout ---
    orig_non_leaf = leaf // _K * 2 - 1  # index-module size in the original version
    orig_q_len = int(n_trees * np.log2(orig_non_leaf))  # original priority-queue length, roughly
    orig_construct_time = int(leaf * np.log2(leaf) * vector_complexity * n_trees)  # original index-build time, roughly
    # NOTE(review): unlike the per-block formula above, this one lacks the
    # bare "+ search_k" term -- possibly an inconsistency; left as-is.
    orig_candi_query_time = int(1. * search_k / _K * np.log2(orig_q_len) * vector_complexity + search_k * vector_complexity)  # original candidate-selection time
    orig_select_nearest = int(search_k * np.log2(search_k))  # original time to pick nearest points from candidates
    orig_total_query_time = orig_candi_query_time + orig_select_nearest  # original total query time
    # NOTE(review): this is op-count divided by process count (work per
    # process), not true queries-per-second -- verify the intended metric.
    orig_qps = orig_total_query_time // benchmark_process
    orig_mem = node_size * (leaf + orig_non_leaf * n_trees)  # original overall memory footprint
    # Report template; the printed Chinese annotations are runtime output
    # and are preserved verbatim (only the "= =" typo is fixed).
    template = '''
leaf = {:,} # 数据大小
f = {:,} # 维度
dim_size = {:,}B # 每个维度大小
total_size = {:,}B # 总数据量大小
block = {:,} # 所有的索引分块数量
N = {:,} # 需要找到的最邻近点
n_trees = {:,} # 对于每一份索引,要构建多少树;100 ~ 400;
search_k = {:,} #最终的候选点的数量
# 分块情况下
total_size_p_block = {:,}B # 每个数据块的大小
leaf_p_block = {:,} # 分块之后每个块的向量数量
non_leaf_p_block = {:,} # 每一份索引的索引树的非叶子节点数量
non_leaf_size_p_block = {:,}B # 每个block的所有索引树的大小
search_k = {:,} # 最终的候选点
search_k_p_block = {:,} # 每个块需要提供的候选点的数量
q_len_p_block = {:,} # 每个索引块的优先队列长度大概所在数量级
construct_time_p_block = {:,} # 每个块构建索引的所需基本操作数量
candi_query_time_p_block = {:,} # 每个块找到 search_k个点所需基本操作数量
combine_time = {:,} # 汇总时总的基本操作数量
total_query_time = {:,} # 每个查询总的操作数消耗
mem_p_block = {:,}B # 每个block的内存占用
# 原来计算复杂度
orig_q_len = {:,} # 原版的优先队列长度大概多少
orig_construct_time = {:,} # 原版建立索引的时间大概是多少
orig_candi_query_time = {:,} # 原版选出候选点的时间
orig_select_nearest = {:,} # 原版从候选点中选出最邻近点时间
orig_total_query_time = {:,} # 原版查询总时间
orig_qps = {:,} # 原版用线程时,每秒能执行的查询数量
orig_mem = {:,}B # 原来整体的内存占用
'''
    print(template.format(
        leaf, f, dim_size, total_size, block, N, n_trees,
        search_k,
        total_size_p_block, leaf_p_block, non_leaf_p_block,
        non_leaf_size_p_block,
        search_k, search_k_p_block, q_len_p_block,
        construct_time_p_block, candi_query_time_p_block, combine_time,
        total_query_time, mem_p_block,
        orig_q_len, orig_construct_time, orig_candi_query_time,
        orig_select_nearest, orig_total_query_time, orig_qps, orig_mem))
# Simulate the Fengchao (production) dataset.
fc_kwargs = dict(
    leaf=int(5e8),       # data size
    f=200,               # dimensionality
    N=512,               # nearest neighbours wanted
    n_trees=100,         # trees per index shard
    search_k=400000,     # total candidate count
    dim_size=4,          # bytes per dimension
    block=60,            # number of index shards
)
calc_complexity(**fc_kwargs)
# What parameter scale would I need to run the experiment well?
exp_kwargs = dict(
    leaf=int(3e6),   # data size
    f=256,           # dimensionality
    N=512,           # nearest neighbours wanted
    n_trees=400,     # trees per index shard
    dim_size=4,      # bytes per dimension
    block=1,         # single shard
)
# calc_complexity(**exp_kwargs)
# Test with Fengchao-like data.
exp_kwargs = dict(
    leaf=int(1e6),        # data size
    f=256,                # dimensionality
    N=512,                # nearest neighbours wanted
    # n_trees=400,        # (earlier, larger setting kept for reference)
    # search_k=400000,
    n_trees=100,          # trees per index shard
    search_k=2000,        # total candidate count
    dim_size=4,           # bytes per dimension
    block=1,              # single shard
    benchmark_process=8,  # ann-benchmark process count
)
# calc_complexity(**exp_kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment