jrconlin/noiseDork.py

## noiseDork.py
#!/bin/python

#Math is weird.
# I was having a discussion with a coworker who wanted to do site visit metrics, but not expose user information. His idea
# was to have a very large user sample, but apply a large amount of noise to the individual signals. When averaged out, the
# noise cancels leaving a reasonably approximate metric.
# I figured that if the visit count is low, it would be literally lost in the noise.
# Being suspiciously minded, I decided to test this theory.
#
# Math is weird.

import random


try:
    _lazy_range = xrange  # Python 2: avoid building a million-element list
except NameError:
    _lazy_range = range  # Python 3: range is already lazy


def simulate(users=1000000, frequent_pct=10, noise_range=500, rng=random):
    """Simulate noisy per-user visit reports and total them.

    Each user draws a true visit count uniformly from 0..10, or from 0..40
    for the ``frequent_pct`` percent of users who potentially visit
    "frequently".  Every report then gets an independent integer noise term,
    uniform on [-noise_range, noise_range], added to it.

    Args:
        users: number of simulated users.
        frequent_pct: percentage (0-100) of users given the higher visit cap.
        noise_range: largest absolute noise value; 0 disables the noise.
        rng: object providing ``randint(a, b)``; defaults to the ``random``
            module.  Pass a seeded ``random.Random`` for repeatable runs.

    Returns:
        ``(real_total, reported_total)`` as floats: the sum of the true visit
        counts and the sum of the noisy reports.
    """
    real_total = 0.0
    reported_total = 0.0
    for _ in _lazy_range(users):
        max_visits = 10
        # frequent_pct% of users potentially visit "frequently".  The roll is
        # uniform on 1..100, so "<=" makes exactly frequent_pct of the 100
        # outcomes qualify (a strict "<" would select one percent too few).
        if rng.randint(1, 100) <= frequent_pct:
            max_visits = 40
        # Make it noisy as hell.  randint is inclusive at both ends, which
        # keeps the noise zero-mean; a half-open range would never produce
        # +noise_range and would bias every report by -0.5 on average.
        noise = rng.randint(-noise_range, noise_range)
        visits = rng.randint(0, max_visits)
        real_total += visits
        reported_total += visits + noise
    return real_total, reported_total


if __name__ == "__main__":
    users = 1000000
    realsample, sample = simulate(users)

    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Real Visits:" + repr(realsample) +
          " reported Visits: " + repr(sample) + "\n")
    print("Real Visit per user: " + repr(realsample / users) +
          "Reported visits per user: " + repr(sample / users))

#sample output
# Real Visits:6351072.0 reported Visits: 6077616.0
# Real Visit per user: 6.351072Reported visits per user: 6.077616
# :boggles:
	#!/bin/python

	#Math is weird.
	# I was having a discussion with a coworker who wanted to do site visit metrics, but not expose user information. His idea
	# was to have a very large user sample, but apply a large amount of noise to the individual signals. When averaged out, the
	# noise cancels leaving a reasonably approximate metric.
	# I figured that if the visit count is low, it would be literally lost in the noise.
	# Being suspiciously minded, I decided to test this theory.
	#
	# Math is weird.

	import random


	try:
	    _lazy_range = xrange  # Python 2: avoid building a million-element list
	except NameError:
	    _lazy_range = range  # Python 3: range is already lazy


	def simulate(users=1000000, frequent_pct=10, noise_range=500, rng=random):
	    """Simulate noisy per-user visit reports and total them.

	    Each user draws a true visit count uniformly from 0..10, or from 0..40
	    for the ``frequent_pct`` percent of users who potentially visit
	    "frequently".  Every report then gets an independent integer noise term,
	    uniform on [-noise_range, noise_range], added to it.

	    Args:
	        users: number of simulated users.
	        frequent_pct: percentage (0-100) of users given the higher visit cap.
	        noise_range: largest absolute noise value; 0 disables the noise.
	        rng: object providing ``randint(a, b)``; defaults to the ``random``
	            module.  Pass a seeded ``random.Random`` for repeatable runs.

	    Returns:
	        ``(real_total, reported_total)`` as floats: the sum of the true visit
	        counts and the sum of the noisy reports.
	    """
	    real_total = 0.0
	    reported_total = 0.0
	    for _ in _lazy_range(users):
	        max_visits = 10
	        # frequent_pct% of users potentially visit "frequently".  The roll is
	        # uniform on 1..100, so "<=" makes exactly frequent_pct of the 100
	        # outcomes qualify (a strict "<" would select one percent too few).
	        if rng.randint(1, 100) <= frequent_pct:
	            max_visits = 40
	        # Make it noisy as hell.  randint is inclusive at both ends, which
	        # keeps the noise zero-mean; a half-open range would never produce
	        # +noise_range and would bias every report by -0.5 on average.
	        noise = rng.randint(-noise_range, noise_range)
	        visits = rng.randint(0, max_visits)
	        real_total += visits
	        reported_total += visits + noise
	    return real_total, reported_total


	if __name__ == "__main__":
	    users = 1000000
	    realsample, sample = simulate(users)

	    # A single parenthesised argument prints identically on Python 2 and 3.
	    print("Real Visits:" + repr(realsample) +
	          " reported Visits: " + repr(sample) + "\n")
	    print("Real Visit per user: " + repr(realsample / users) +
	          "Reported visits per user: " + repr(sample / users))

	#sample output
	# Real Visits:6351072.0 reported Visits: 6077616.0
	# Real Visit per user: 6.351072Reported visits per user: 6.077616
	# :boggles: