Skip to content

Instantly share code, notes, and snippets.

Created April 14, 2017 03:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/0c8589a67fcaaa32e4ccdbde1bfd6d3d to your computer and use it in GitHub Desktop.
Save anonymous/0c8589a67fcaaa32e4ccdbde1bfd6d3d to your computer and use it in GitHub Desktop.
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
'''
If my math is right, this function should output the ratio of documents
returned given dictionary size, avg. size of document and # of
documents.
'''
'''
N: size of dict.
n: # of documents
k: average size of document
q: average size of query
n is basically irrelevant because
http://www.wolframalpha.com/input/?i=((n-1)%2Fn)%5En is basically
constant
All that really influences the result is size of dict and k
In practice, we know that k ~= 1000 using SIFT
'''
def docs_seen(N, n, k, q = None):
if not q:
q = k
# Probability a word is in a document
p1 = 1 - ((N - 1)/N) ** k
# Expected number of documents that contain a given word
p2 = p1 * n
# Expected number of times we sample (visit) documents in a query
p3 = p2 * q
# Probability that a given document will be visited
p4 = 1 - ((n - 1)/n) ** p3
return n * p4
# size of dict.
N = 200000
# number of documents
n = 25000
# avg. size of document / query
k = 1000
seen = docs_seen(N, n, k)
print('Docs seen: %d' % seen)
print('Percent: %.2f%% ' % (seen / n * 100))
# exit()
'''
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = np.arange(1, 1e9, 1e8)
y = np.arange(1e6, 1e10, 1e8)
X, Y = np.meshgrid(x, y)
zs = np.array([(docs_seen(x,y,k) / y * 100) for x,y in zip(np.ravel(X), np.ravel(Y))])
Z = zs.reshape(X.shape)
ax.plot_surface(X, Y, Z)
ax.set_xlabel('size of dict')
ax.set_ylabel('# of docs')
ax.set_zlabel('% of docs seen')
plt.show()
'''
X = np.arange(1, 1e7, 1e4)
Y = np.array([(docs_seen(x,n,k) / n * 100) for x in np.ravel(X)])
plt.plot(X, Y)
plt.xlabel('Size of dictionary')
plt.ylabel('% of documents retrieved')
plt.grid(True)
plt.show()
# ideal dict size: 5M
# Size of dict vs % of docs seen
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment