Created
April 14, 2017 03:23
-
-
Save anonymous/0c8589a67fcaaa32e4ccdbde1bfd6d3d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import product | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from mpl_toolkits.mplot3d import Axes3D | |
from matplotlib import cm | |
from matplotlib.ticker import LinearLocator, FormatStrFormatter | |
''' | |
If my math is right, this function should output the ratio of documents | |
returned given dictionary size, avg. size of document and # of | |
documents. | |
''' | |
''' | |
N: size of dict. | |
n: # of documents | |
k: average size of document | |
q: average size of query | |
n is basically irrelevant because | |
http://www.wolframalpha.com/input/?i=((n-1)%2Fn)%5En is basically | |
constant | |
All that really influences the result is size of dict and k | |
In practice, we know that k ~= 1000 using SIFT | |
''' | |
def docs_seen(N, n, k, q = None): | |
if not q: | |
q = k | |
# Probability a word is in a document | |
p1 = 1 - ((N - 1)/N) ** k | |
# Expected number of documents that contain a given word | |
p2 = p1 * n | |
# Expected number of times we sample (visit) documents in a query | |
p3 = p2 * q | |
# Probability that a given document will be visited | |
p4 = 1 - ((n - 1)/n) ** p3 | |
return n * p4 | |
# size of dict. | |
N = 200000 | |
# number of documents | |
n = 25000 | |
# avg. size of document / query | |
k = 1000 | |
seen = docs_seen(N, n, k) | |
print('Docs seen: %d' % seen) | |
print('Percent: %.2f%% ' % (seen / n * 100)) | |
# exit() | |
''' | |
fig = plt.figure() | |
ax = fig.add_subplot(111, projection='3d') | |
x = np.arange(1, 1e9, 1e8) | |
y = np.arange(1e6, 1e10, 1e8) | |
X, Y = np.meshgrid(x, y) | |
zs = np.array([(docs_seen(x,y,k) / y * 100) for x,y in zip(np.ravel(X), np.ravel(Y))]) | |
Z = zs.reshape(X.shape) | |
ax.plot_surface(X, Y, Z) | |
ax.set_xlabel('size of dict') | |
ax.set_ylabel('# of docs') | |
ax.set_zlabel('% of docs seen') | |
plt.show() | |
''' | |
X = np.arange(1, 1e7, 1e4) | |
Y = np.array([(docs_seen(x,n,k) / n * 100) for x in np.ravel(X)]) | |
plt.plot(X, Y) | |
plt.xlabel('Size of dictionary') | |
plt.ylabel('% of documents retrieved') | |
plt.grid(True) | |
plt.show() | |
# ideal dict size: 5M | |
# Size of dict vs % of docs seen | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment