/inverted_index_math.py

## inverted_index_math.py
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter

'''
If my math is right, this function should output the ratio of documents
returned given dictionary size, avg. size of document and # of
documents.
'''


'''
N: size of dict.
n: # of documents
k: average size of document
q: average size of query

n is basically irrelevant because
http://www.wolframalpha.com/input/?i=((n-1)%2Fn)%5En is basically
constant

All that really influences the result is size of dict and k
In practice, we know that k ~= 1000 using SIFT
'''
def docs_seen(N, n, k, q = None):
    if not q:
        q = k
    # Probability a word is in a document
    p1 = 1 - ((N - 1)/N) ** k
    # Expected number of documents that contain a given word
    p2 = p1 * n
    # Expected number of times we sample (visit) documents in a query
    p3 = p2 * q
    # Probability that a given document will be visited
    p4 = 1 - ((n - 1)/n) ** p3
    return n * p4

# size of dict.
N = 200000
# number of documents
n = 25000
# avg. size of document / query
k = 1000
seen = docs_seen(N, n, k)
print('Docs seen: %d' % seen)
print('Percent: %.2f%% ' % (seen / n * 100))

# exit()

'''
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = np.arange(1, 1e9, 1e8)
y = np.arange(1e6, 1e10, 1e8)
X, Y = np.meshgrid(x, y)
zs = np.array([(docs_seen(x,y,k) / y * 100) for x,y in zip(np.ravel(X), np.ravel(Y))])
Z = zs.reshape(X.shape)

ax.plot_surface(X, Y, Z)

ax.set_xlabel('size of dict')
ax.set_ylabel('# of docs')
ax.set_zlabel('% of docs seen')

plt.show()

'''

X = np.arange(1, 1e7, 1e4)
Y = np.array([(docs_seen(x,n,k) / n * 100) for x in np.ravel(X)])
plt.plot(X, Y)
plt.xlabel('Size of dictionary')
plt.ylabel('% of documents retrieved')
plt.grid(True)
plt.show()

# ideal dict size: 5M

# Size of dict vs % of docs seen
	from itertools import product
	import numpy as np
	import matplotlib.pyplot as plt
	from mpl_toolkits.mplot3d import Axes3D
	from matplotlib import cm
	from matplotlib.ticker import LinearLocator, FormatStrFormatter

	'''
	If my math is right, this function should output the ratio of documents
	returned given dictionary size, avg. size of document and # of
	documents.
	'''


	'''
	N: size of dict.
	n: # of documents
	k: average size of document
	q: average size of query

	n is basically irrelevant because
	http://www.wolframalpha.com/input/?i=((n-1)%2Fn)%5En is basically
	constant

	All that really influences the result is size of dict and k
	In practice, we know that k ~= 1000 using SIFT
	'''
	def docs_seen(N, n, k, q = None):
	if not q:
	q = k
	# Probability a word is in a document
	p1 = 1 - ((N - 1)/N) ** k
	# Expected number of documents that contain a given word
	p2 = p1 * n
	# Expected number of times we sample (visit) documents in a query
	p3 = p2 * q
	# Probability that a given document will be visited
	p4 = 1 - ((n - 1)/n) ** p3
	return n * p4

	# size of dict.
	N = 200000
	# number of documents
	n = 25000
	# avg. size of document / query
	k = 1000
	seen = docs_seen(N, n, k)
	print('Docs seen: %d' % seen)
	print('Percent: %.2f%% ' % (seen / n * 100))

	# exit()

	'''
	fig = plt.figure()
	ax = fig.add_subplot(111, projection='3d')
	x = np.arange(1, 1e9, 1e8)
	y = np.arange(1e6, 1e10, 1e8)
	X, Y = np.meshgrid(x, y)
	zs = np.array([(docs_seen(x,y,k) / y * 100) for x,y in zip(np.ravel(X), np.ravel(Y))])
	Z = zs.reshape(X.shape)

	ax.plot_surface(X, Y, Z)

	ax.set_xlabel('size of dict')
	ax.set_ylabel('# of docs')
	ax.set_zlabel('% of docs seen')

	plt.show()

	'''

	X = np.arange(1, 1e7, 1e4)
	Y = np.array([(docs_seen(x,n,k) / n * 100) for x in np.ravel(X)])
	plt.plot(X, Y)
	plt.xlabel('Size of dictionary')
	plt.ylabel('% of documents retrieved')
	plt.grid(True)
	plt.show()

	# ideal dict size: 5M

	# Size of dict vs % of docs seen