heuristicfencepost/generate_random_data.rb

## generate_random_data.rb
require 'yaml'

# Build a random string of +len+ upper-case ASCII letters (A-Z).
#
# Each letter is chosen independently with Kernel#rand, so seeding the
# generator with srand makes the result reproducible.  A +len+ of zero
# (or less) produces an empty string.
def generate_random_name(len)
    first_letter = 'A'.ord
    letters = (1..len).map { (first_letter + rand(26)).chr }
    letters.join
end

# Tunable parameters for the generated data set.
NUM_AUTHORS=10000
AUTHOR_NAME_LENGTH=24
POSTS_PER_AUTHOR=100
MAX_PAGE_VIEWS_PER_POST=2000
# Number of authors written to each random_data_partNN.yaml file.
AUTHORS_PER_PART=500

# Metadata about our blog posts: for each author, the max number of
# page views seen for any of their articles.
max_views = {}

# Work through the authors one part file at a time.  Slicing the range
# (rather than flushing whenever the counter is a multiple of 500) also
# writes a final, shorter part if NUM_AUTHORS is ever changed to a value
# that is not a multiple of AUTHORS_PER_PART.
(1..NUM_AUTHORS).each_slice(AUTHORS_PER_PART).with_index(1) do |ids, part|

  # Generate random data for each author in this part and each of their
  # blog posts.  We don't care about the content of a post here; only
  # the page view counts matter, and generic content can be added later.
  authors = ids.map do
    author_name = generate_random_name(AUTHOR_NAME_LENGTH).capitalize

    # Exactly POSTS_PER_AUTHOR counts, each in 0...MAX_PAGE_VIEWS_PER_POST.
    # (0.upto(POSTS_PER_AUTHOR) is inclusive and produced one post too many.)
    page_views = Array.new(POSTS_PER_AUTHOR) { rand(MAX_PAGE_VIEWS_PER_POST) }

    max_views[author_name] = page_views.max

    {
      "author" => author_name,
      "page_views" => page_views
    }
  end

  puts "Iteration #{ids.last}, flushing part #{part}"

  # The block form flushes and closes the file even if the dump raises.
  File.open("random_data_part#{"%02d" % part}.yaml", 'w') do |out|
    out.write YAML.dump(authors)
  end
end

# Now that we've written out all data parts let's write out the
# metadata as well.
File.open("max_views.yaml", 'w') do |out|
  out.write YAML.dump(max_views)
end

## load_data.rb
require 'rubygems'
require 'mongo'
require 'yaml'

# Read the YAML files generated by generate_random_data.rb and store
# them in a MongoDB instance: one document per blog post, written to the
# 'indexing' collection of the 'indexing' database.
# NOTE(review): Mongo::Connection looks like the legacy (1.x) Ruby driver
# API -- confirm against the installed driver version.
db=Mongo::Connection.new.db('indexing')
coll=db.collection('indexing')

# Collect the part files in name order.  Dir.entries returns a plain
# array, so no directory handle is left open.
thelist = Dir.entries(".").select { |f| f.start_with? "random_data_part" }.sort

thelist.each do |filename|

  puts "Loading data from file #{filename}"

  # YAML.load_file opens, parses and closes the file in one step; the
  # previous File.new(filename) handle was never closed.
  YAML.load_file(filename).each do |data|
    # Each author entry carries a list of page view counts; every count
    # becomes its own document with the same placeholder content.
    data["page_views"].each do |page_view_count|
      coll.insert({
        "author" => data["author"],
        "page_views" => page_view_count,
        "content" => "The quick brown fox jumped over the lazy dog"
      })
    end
  end
end

## mapreduce.js
// JavaScript map and reduce functions for use within the Mongo shell
map = function () {
    var x = { page_views : this.page_views , _id : this._id };
    emit(this.author, { min : x , max : x } )
}

reduce = function (key, values) {
    var res = values[0];
    for ( var i=1; i<values.length; i++ ) {
        if ( values[i].min.page_views < res.min.page_views )
           res.min = values[i].min;
        if ( values[i].max.page_views > res.max.page_views )
           res.max = values[i].max;
    }
    return res;
}
	require 'yaml'

	# Build a random string of +len+ upper-case ASCII letters (A-Z).
	#
	# Each letter is chosen independently with Kernel#rand, so seeding the
	# generator with srand makes the result reproducible.  A +len+ of zero
	# (or less) produces an empty string.
	def generate_random_name(len)
	  first_letter = 'A'.ord
	  letters = (1..len).map { (first_letter + rand(26)).chr }
	  letters.join
	end

	# Tunable parameters for the generated data set.
	NUM_AUTHORS=10000
	AUTHOR_NAME_LENGTH=24
	POSTS_PER_AUTHOR=100
	MAX_PAGE_VIEWS_PER_POST=2000
	# Number of authors written to each random_data_partNN.yaml file.
	AUTHORS_PER_PART=500

	# Metadata about our blog posts: for each author, the max number of
	# page views seen for any of their articles.
	max_views = {}

	# Work through the authors one part file at a time.  Slicing the range
	# (rather than flushing whenever the counter is a multiple of 500) also
	# writes a final, shorter part if NUM_AUTHORS is ever changed to a value
	# that is not a multiple of AUTHORS_PER_PART.
	(1..NUM_AUTHORS).each_slice(AUTHORS_PER_PART).with_index(1) do |ids, part|

	  # Generate random data for each author in this part and each of their
	  # blog posts.  We don't care about the content of a post here; only
	  # the page view counts matter, and generic content can be added later.
	  authors = ids.map do
	    author_name = generate_random_name(AUTHOR_NAME_LENGTH).capitalize

	    # Exactly POSTS_PER_AUTHOR counts, each in 0...MAX_PAGE_VIEWS_PER_POST.
	    # (0.upto(POSTS_PER_AUTHOR) is inclusive and produced one post too many.)
	    page_views = Array.new(POSTS_PER_AUTHOR) { rand(MAX_PAGE_VIEWS_PER_POST) }

	    max_views[author_name] = page_views.max

	    {
	      "author" => author_name,
	      "page_views" => page_views
	    }
	  end

	  puts "Iteration #{ids.last}, flushing part #{part}"

	  # The block form flushes and closes the file even if the dump raises.
	  File.open("random_data_part#{"%02d" % part}.yaml", 'w') do |out|
	    out.write YAML.dump(authors)
	  end
	end

	# Now that we've written out all data parts let's write out the
	# metadata as well.
	File.open("max_views.yaml", 'w') do |out|
	  out.write YAML.dump(max_views)
	end
	require 'rubygems'
	require 'mongo'
	require 'yaml'

	# Read the YAML files generated by generate_random_data.rb and store
	# them in a MongoDB instance: one document per blog post, written to the
	# 'indexing' collection of the 'indexing' database.
	# NOTE(review): Mongo::Connection looks like the legacy (1.x) Ruby driver
	# API -- confirm against the installed driver version.
	db=Mongo::Connection.new.db('indexing')
	coll=db.collection('indexing')

	# Collect the part files in name order.  Dir.entries returns a plain
	# array, so no directory handle is left open.
	thelist = Dir.entries(".").select { |f| f.start_with? "random_data_part" }.sort

	thelist.each do |filename|

	  puts "Loading data from file #{filename}"

	  # YAML.load_file opens, parses and closes the file in one step; the
	  # previous File.new(filename) handle was never closed.
	  YAML.load_file(filename).each do |data|
	    # Each author entry carries a list of page view counts; every count
	    # becomes its own document with the same placeholder content.
	    data["page_views"].each do |page_view_count|
	      coll.insert({
	        "author" => data["author"],
	        "page_views" => page_view_count,
	        "content" => "The quick brown fox jumped over the lazy dog"
	      })
	    end
	  end
	end
	// JavaScript map and reduce functions for use within the Mongo shell
	map = function () {
	var x = { page_views : this.page_views , _id : this._id };
	emit(this.author, { min : x , max : x } )
	}

	reduce = function (key, values) {
	var res = values[0];
	for ( var i=1; i<values.length; i++ ) {
	if ( values[i].min.page_views < res.min.page_views )
	res.min = values[i].min;
	if ( values[i].max.page_views > res.max.page_views )
	res.max = values[i].max;
	}
	return res;
	}