
@nevernormal1
Created November 29, 2010 19:35
MapReduce scripts
#!/usr/bin/env ruby
# reduce.rb: receives "path item" lines grouped (sorted) by path and emits
# one "item:related_item<TAB>1" line for every ordered pair of distinct
# items that appeared under the same path.

def dump_items(items)
  return unless items.size > 1
  items.each do |item_to_print|
    items.each do |related_item|
      next if item_to_print == related_item
      puts "#{item_to_print}:#{related_item}\t1"
    end
  end
end

current_path = nil
related_items = []

ARGF.each do |line|
  path, item = line.split
  if path != current_path
    # A new path begins: flush the pairs collected for the previous one.
    dump_items(related_items) unless current_path.nil?
    current_path = path
    related_items = []
  end
  related_items << item
end
dump_items(related_items) unless current_path.nil?
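To see what the first reducer produces, its pair-emitting logic can be exercised on a few hand-made input lines (the paths and product IDs below are made up for illustration; the input is assumed already sorted by path, as Hadoop streaming guarantees for reducer input):

```ruby
# Simulate reduce.rb's pair emission on two hypothetical paths.
lines = [
  "/home  A",
  "/home  B",
  "/home  C",
  "/about A",
]

pairs = []
current_path = nil
items = []

# Same pair-generation rule as dump_items above.
emit = lambda do |group|
  next unless group.size > 1
  group.each do |a|
    group.each do |b|
      next if a == b
      pairs << "#{a}:#{b}\t1"
    end
  end
end

lines.each do |line|
  path, item = line.split
  if path != current_path
    emit.call(items) unless current_path.nil?
    current_path = path
    items = []
  end
  items << item
end
emit.call(items) unless current_path.nil?

# "/home" yields all 6 ordered pairs of A, B, C; "/about" has a single
# item, so it yields nothing.
puts pairs.length  # => 6
```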
#!/usr/bin/env ruby
# reduce2.rb: sums the counts for each "product:related_product" key and
# emits "product<TAB>related_product<TAB>count" rows for pairs that were
# seen at least COUNT_THRESHOLD times.

COUNT_THRESHOLD = 10

def dump_items(key, count)
  product, related_product = key.split(":")
  puts "#{product}\t#{related_product}\t#{count}" if count.to_i >= COUNT_THRESHOLD
end

current_key = nil
current_count = 0

ARGF.each do |line|
  key, count = line.split
  if key != current_key
    # A new key begins: flush the running total for the previous one.
    dump_items(current_key, current_count) unless current_key.nil?
    current_key = key
    current_count = 0
  end
  current_count += count.to_i
end
dump_items(current_key, current_count) unless current_key.nil?
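The second reducer's aggregation can likewise be sketched on sorted sample input (the keys and counts here are hypothetical; the threshold is 10, as above):

```ruby
COUNT_THRESHOLD = 10

# Sorted "key count" lines, as the first job's output looks after the
# shuffle phase. Only A:B accumulates enough views to cross the threshold.
lines = ["A:B 7", "A:B 5", "C:D 3"]

rows = []
current_key = nil
current_count = 0

# Same flush rule as dump_items in reduce2.rb above.
flush = lambda do
  next if current_key.nil?
  product, related = current_key.split(":")
  rows << [product, related, current_count] if current_count >= COUNT_THRESHOLD
end

lines.each do |line|
  key, count = line.split
  if key != current_key
    flush.call
    current_key = key
    current_count = 0
  end
  current_count += count.to_i
end
flush.call

p rows  # A:B sums to 12 and survives; C:D (3) is dropped
```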
#!/usr/bin/env bash
# Driver script: exports recent item views from MySQL, runs two Hadoop
# streaming jobs (reduce.rb, then reduce2.rb), and loads the result back
# into the viewed_products table.

HADOOP_HOME=${HADOOP_HOME:?HADOOP_HOME is not set}
script_path=$(dirname "$0")

# Clean up output from prior runs
for dir in /tmp/input /tmp/output /tmp/final_output; do
  if [[ -d $dir ]]; then
    rm -fr "$dir"
  fi
done
mkdir /tmp/input

user=$1
host=$2
database=$3
pass=$4

# Export (path_id, product_id) pairs as tab-separated values.
mysql -u "$user" --password="$pass" --host="$host" --batch -e \
  "SELECT path_id, product_id FROM recent_items" \
  "$database" > /tmp/input/recent_items.tsv

jarfile="$HADOOP_HOME/mapred/contrib/streaming/hadoop-0.21.0-streaming.jar"

# Job 1: emit one "product:related_product<TAB>1" line per co-viewed pair.
"$HADOOP_HOME/bin/hadoop" jar \
  "$jarfile" \
  -input /tmp/input \
  -output /tmp/output \
  -mapper cat \
  -reducer "$script_path/reduce.rb"
cat /tmp/output/part* > /tmp/output/related.tsv

# Job 2: sum the pair counts and apply the threshold.
"$HADOOP_HOME/bin/hadoop" jar \
  "$jarfile" \
  -input /tmp/output/related.tsv \
  -output /tmp/final_output \
  -mapper cat \
  -reducer "$script_path/reduce2.rb"
cat /tmp/final_output/part* > /tmp/final_output/viewed_products.tsv

# Load the result; mysqlimport infers the target table name
# (viewed_products) from the file's basename.
mysqlimport --local --compress -u "$user" --host="$host" \
  --columns=source_product_id,target_product_id,count \
  --replace "$database" \
  /tmp/final_output/viewed_products.tsv