@you-n-g
Last active February 22, 2017 08:29
# coding:utf8
import numpy as np
def calc_complexity(leaf, f, N, n_trees, dim_size, block, search_k=None, vector_complexity=None, benchmark_process=1):
    '''
    Parameters:
    leaf: dataset size (number of vectors)
    f: dimensionality
    N: number of nearest neighbours to find
    n_trees: how many trees to build per index shard; typically 100 ~ 400
    dim_size: size of one dimension in bytes
    block: number of shards the whole index is split into
    search_k: number of final candidate points
    vector_complexity: time complexity of one f-dimensional vector operation
    benchmark_process: ann-benchmark can run multi-process; number of processes used for the run
    '''
    if search_k is None:
        search_k = n_trees * N  # final candidate count
    if vector_complexity is None:
        vector_complexity = f  # cost of one f-dimensional vector operation
    node_size = f * dim_size + 4 * 2 + 4 + 4  # bytes per tree node: vector + two child pointers + metadata
    total_size = leaf * node_size  # total data size
    _K = f // 2  # average number of samples in a bottom-level region
    total_size_p_block = total_size // block  # data size per block
    leaf_p_block = leaf // block  # vectors per block after sharding
    non_leaf_p_block = leaf_p_block // _K * 2 - 1  # non-leaf nodes per index tree in each shard
    # TODO: this term is too large; n_trees needs to be tuned down
    non_leaf_size_p_block = node_size * non_leaf_p_block * n_trees  # size of all index trees per block
    search_k_p_block = search_k // block  # candidates each block must supply
    q_len_p_block = int(n_trees * np.log2(non_leaf_p_block))  # order of magnitude of the per-block priority-queue length
    construct_time_p_block = int(leaf_p_block * np.log2(leaf_p_block) * vector_complexity * n_trees)  # basic operations to build the index per block
    # TODO: the *f factor is surely an overestimate here, since vector multiplication should be optimised by library routines.
    candi_query_time_p_block = int(1. * search_k_p_block / _K * np.log2(q_len_p_block) * vector_complexity + search_k_p_block + search_k_p_block * vector_complexity)  # basic operations for one block to find search_k points
    candi_p_block = min(search_k_p_block, N)  # candidates contributed by each block
    total_candi = candi_p_block * block  # total candidate count
    combine_time = int(total_candi * np.log2(total_candi))  # basic operations for the merge step
    total_query_time = candi_query_time_p_block + combine_time  # total operations per query
    mem_p_block = total_size_p_block + non_leaf_size_p_block  # memory footprint per block
    # Complexity of the original (unsharded) setup
    orig_non_leaf = leaf // _K * 2 - 1  # original: non-leaf nodes per index tree
    orig_q_len = int(n_trees * np.log2(orig_non_leaf))  # original: approximate priority-queue length
    orig_construct_time = int(leaf * np.log2(leaf) * vector_complexity * n_trees)  # original: approximate index build time
    orig_candi_query_time = int(1. * search_k / _K * np.log2(orig_q_len) * vector_complexity + search_k * vector_complexity)  # original: time to select the candidates
    orig_select_nearest = int(search_k * np.log2(search_k))  # original: time to pick the nearest neighbours from the candidates
    orig_total_query_time = orig_candi_query_time + orig_select_nearest  # original: total query time
    orig_qps = orig_total_query_time / benchmark_process  # original: queries per second when running multi-process
    orig_mem = node_size * (leaf + orig_non_leaf * n_trees)  # original: total memory footprint
    template = '''
leaf = {:,}  # dataset size
f = {:,}  # dimensionality
dim_size = {:,}B  # size of one dimension
total_size = {:,}B  # total data size
block = {:,}  # number of index shards
N = {:,}  # nearest neighbours to find
n_trees = {:,}  # trees per index shard; typically 100 ~ 400
search_k = {:,}  # number of final candidate points
# Sharded setup
total_size_p_block = {:,}B  # data size per block
leaf_p_block = {:,}  # vectors per block after sharding
non_leaf_p_block = {:,}  # non-leaf nodes per index tree in each shard
non_leaf_size_p_block = {:,}B  # size of all index trees per block
search_k = {:,}  # final candidates
search_k_p_block = {:,}  # candidates each block must supply
q_len_p_block = {:,}  # order of magnitude of the per-block priority-queue length
construct_time_p_block = {:,}  # basic operations to build the index per block
candi_query_time_p_block = {:,}  # basic operations for one block to find search_k points
combine_time = {:,}  # basic operations for the merge step
total_query_time = {:,}  # total operations per query
mem_p_block = {:,}B  # memory footprint per block
# Original (unsharded) complexity
orig_q_len = {:,}  # original: approximate priority-queue length
orig_construct_time = {:,}  # original: approximate index build time
orig_candi_query_time = {:,}  # original: time to select the candidates
orig_select_nearest = {:,}  # original: time to pick the nearest neighbours from the candidates
orig_total_query_time = {:,}  # original: total query time
orig_qps = {:,}  # original: queries per second when running multi-process
orig_mem = {:,}B  # original: total memory footprint
'''
    print(template.format(
        leaf, f, dim_size, total_size, block, N, n_trees,
        search_k,
        total_size_p_block, leaf_p_block, non_leaf_p_block,
        non_leaf_size_p_block,
        search_k, search_k_p_block, q_len_p_block,
        construct_time_p_block, candi_query_time_p_block, combine_time,
        total_query_time, mem_p_block,
        orig_q_len, orig_construct_time, orig_candi_query_time,
        orig_select_nearest, orig_total_query_time, orig_qps, orig_mem))
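
# Worked example of the node-size model above (added illustration, not from
# the original gist): with f=200 and dim_size=4 the model gives
# node_size = 200*4 + 4*2 + 4 + 4 = 816 bytes per tree node, so leaf=5e8
# vectors alone occupy about 5e8 * 816 B ~= 408 GB before counting the
# non-leaf index nodes.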
# Simulate data at Fengchao (Baidu's ad system) scale
fc_kwargs = {
    'leaf': int(5e8),
    'f': 200,
    'N': 512,
    'n_trees': 100,
    'search_k': 400000,
    'dim_size': 4,
    'block': 60
}
calc_complexity(**fc_kwargs)
# What parameter scale do I need to run a proper experiment?
exp_kwargs = {
    'leaf': int(3e6),
    'f': 256,
    'N': 512,
    'n_trees': 400,
    'dim_size': 4,
    'block': 1
}
# calc_complexity(**exp_kwargs)
# Test with Fengchao data
exp_kwargs = {
    'leaf': int(1e6),
    'f': 256,
    'N': 512,
    # 'n_trees': 400,
    # 'search_k': 400000,
    'n_trees': 100,
    'search_k': 2000,
    'dim_size': 4,
    'block': 1,
    'benchmark_process': 8,
}
# calc_complexity(**exp_kwargs)
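
# --- Optional sanity check against the real Annoy library (added sketch) ---
# Not part of the original gist: a minimal, hedged sketch that builds a small
# real Annoy index and times one query, so measured build/query times can be
# compared against the operation counts printed by calc_complexity(). Assumes
# the `annoy` package is installed (pip install annoy); the parameter names
# mirror the kwargs dicts above, and the small defaults are arbitrary.
def sanity_check(leaf=10000, f=32, N=10, n_trees=10, search_k=2000):
    import time
    from annoy import AnnoyIndex
    index = AnnoyIndex(f, 'angular')  # angular distance, as in ann-benchmarks
    for i in range(leaf):
        index.add_item(i, np.random.randn(f).tolist())
    t0 = time.time()
    index.build(n_trees)
    build_s = time.time() - t0
    query = np.random.randn(f).tolist()
    t0 = time.time()
    index.get_nns_by_vector(query, N, search_k=search_k)
    query_s = time.time() - t0
    print('measured build: {:.3f}s, query: {:.6f}s'.format(build_s, query_s))
    # Predicted operation counts for the same parameters, for comparison.
    calc_complexity(leaf=leaf, f=f, N=N, n_trees=n_trees,
                    search_k=search_k, dim_size=4, block=1)

# sanity_check()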