rxin/ampcamp-ecnu-2013-data.sh

## ampcamp-ecnu-2013-data.sh
################################################################################
# Step 1. Download wiki traffic log.
# from
#  https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00095.gz
# to
#  https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00168.gz
# Note that 095 and 168 are both 0 bytes. The sole purpose of their existence is
# to verify the downloads.

# NOTE THAT THE FOLLOWING SCRIPT STARTS wget AS BACKGROUND PROCESSES.

# Create the download directory and work from inside it.
mkdir wikistats
cd wikistats

# Start one background wget per part, part-00095.gz through part-00168.gz.
# printf zero-pads the index to the five digits used in the file names.
for idx in {95..168}; do
  printf -v part '%05d' "$idx"
  wget -b -nv "https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-${part}.gz"
done

# Poll disk usage every 5 seconds; the total stops growing at 4.0G.
watch -n 5 du -h .

# Once every download has finished, remove the wget logs and decompress.
rm wget-log*
gunzip *gz


################################################################################
# Step 2. Download featurized wiki articles.
# from
#  https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats_featurized/part-00001.gz
# to
#  https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats_featurized/part-00015.gz

# Create the download directory and work from inside it.
mkdir wikistats_featurized
cd wikistats_featurized

# Start one background wget per part, part-00001.gz through part-00015.gz.
# printf zero-pads the index to the five digits used in the file names.
for idx in {1..15}; do
  printf -v part '%05d' "$idx"
  wget -b -nv "https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats_featurized/part-${part}.gz"
done

# Poll disk usage every 5 seconds; the total stops growing at 110M.
watch -n 5 du -h .

# Once every download has finished, remove the wget logs and decompress.
rm wget-log*
gunzip *gz


################################################################################
# Step 3. Download Scala.

# Fetch the Scala 2.9.2 tarball and unpack it in the current directory.
scala_tgz=scala-2.9.2.tgz
wget "http://www.scala-lang.org/downloads/distrib/files/${scala_tgz}"
tar -xvzf "${scala_tgz}"


## ampcamp-ecnu-2013-user.sh
# For each user, set up Spark, Hive, and Shark.

# Per-user Spark setup: download the Spark 0.7.0 source tarball into $USERDIR,
# unpack it, and write spark-0.7.0/conf/spark-env.sh.
export USERDIR=/scratch/rxin2/user
# NOTE(review): /path/to/scala looks like a placeholder -- point it at the
# real Scala installation before running.
export SCALA_HOME=/path/to/scala

cd "$USERDIR"
wget http://www.spark-project.org/download-spark-0.7.0-sources-tgz
# Extract first and delete the tarball only after extraction succeeded.
# Removing it before running tar leaves nothing to unpack.
tar xvfz download-spark-0.7.0-sources-tgz \
  && rm -f download-spark-0.7.0-sources-tgz
# printf writes the same three lines that `echo -e` with embedded \n produced,
# without depending on echo's non-portable escape handling.
printf '%s\n' \
  "export SCALA_HOME=$SCALA_HOME" \
  'export SPARK_MEM=40G' \
  'export MASTER=local[8]' > spark-0.7.0/conf/spark-env.sh

# TODO: set up a local tmp path for Spark; use Kryo for data serialization.

# I will create a new tarball for Hive and Shark later.

## stream-server.py
import socket
import time
import threading
import SocketServer

class ThreadedTCPRequestHandler(SocketServer.BaseRequestHandler):
  """Streams the lines of the local file named "file" to the client.

  One line is sent per second, in file order.
  """

  def handle(self):
    # Read everything up front so the file handle is closed before the slow
    # send loop starts, instead of being left open for the garbage collector.
    with open("file", "r") as source:
      lines = source.readlines()
    for line in lines:
      # sendall keeps writing until the whole line is out; a bare send() may
      # transmit only part of it.
      self.request.sendall(line)
      time.sleep(1)

class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
  """TCP server that, via ThreadingMixIn, handles each request in its own thread."""

if __name__ == "__main__":
  # Listen on a fixed local port. (Passing port 0 instead would let the OS
  # select an arbitrary unused port.)
  HOST, PORT = "localhost", 8083

  server = ThreadedTCPServer((HOST, PORT), ThreadedTCPRequestHandler)

  # Start a thread with the server -- that thread will then start one
  # more thread for each request.
  server_thread = threading.Thread(target=server.serve_forever)
  # Exit the server thread when the main thread terminates.
  server_thread.daemon = True
  server_thread.start()
  print("Server loop running in thread: %s" % server_thread.name)

  # Keep the main thread alive by sleeping rather than spinning, which would
  # pin a CPU core. Ctrl-C falls through to a clean shutdown.
  try:
    while True:
      time.sleep(1)
  except KeyboardInterrupt:
    pass
  finally:
    server.shutdown()
    server.server_close()
	################################################################################
	# Step 1. Download wiki traffic log.
	# from
	# https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00095.gz
	# to
	# https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00168.gz
	# Note that 095 and 168 are both 0 bytes. The sole purpose of their existence is
	# to verify the downloads.

	# NOTE THAT THE FOLLOWING SCRIPT STARTS wget AS BACKGROUND PROCESSES.

	# Create the download directory and work from inside it.
	mkdir wikistats
	cd wikistats

	# Start one background wget per part, part-00095.gz through part-00168.gz.
	# printf zero-pads the index to the five digits used in the file names.
	for idx in {95..168}; do
	  printf -v part '%05d' "$idx"
	  wget -b -nv "https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-${part}.gz"
	done

	# Poll disk usage every 5 seconds; the total stops growing at 4.0G.
	watch -n 5 du -h .

	# Once every download has finished, remove the wget logs and decompress.
	rm wget-log*
	gunzip *gz


	################################################################################
	# Step 2. Download featurized wiki articles.
	# from
	# https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats_featurized/part-00001.gz
	# to
	# https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats_featurized/part-00015.gz

	# Create the download directory and work from inside it.
	mkdir wikistats_featurized
	cd wikistats_featurized

	# Start one background wget per part, part-00001.gz through part-00015.gz.
	# printf zero-pads the index to the five digits used in the file names.
	for idx in {1..15}; do
	  printf -v part '%05d' "$idx"
	  wget -b -nv "https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats_featurized/part-${part}.gz"
	done

	# Poll disk usage every 5 seconds; the total stops growing at 110M.
	watch -n 5 du -h .

	# Once every download has finished, remove the wget logs and decompress.
	rm wget-log*
	gunzip *gz


	################################################################################
	# Step 3. Download Scala.

	# Fetch the Scala 2.9.2 tarball and unpack it in the current directory.
	scala_tgz=scala-2.9.2.tgz
	wget "http://www.scala-lang.org/downloads/distrib/files/${scala_tgz}"
	tar -xvzf "${scala_tgz}"
	# For each user, set up Spark, Hive, and Shark.

	# Per-user Spark setup: download the Spark 0.7.0 source tarball into $USERDIR,
	# unpack it, and write spark-0.7.0/conf/spark-env.sh.
	export USERDIR=/scratch/rxin2/user
	# NOTE(review): /path/to/scala looks like a placeholder -- point it at the
	# real Scala installation before running.
	export SCALA_HOME=/path/to/scala

	cd "$USERDIR"
	wget http://www.spark-project.org/download-spark-0.7.0-sources-tgz
	# Extract first and delete the tarball only after extraction succeeded.
	# Removing it before running tar leaves nothing to unpack.
	tar xvfz download-spark-0.7.0-sources-tgz \
	  && rm -f download-spark-0.7.0-sources-tgz
	# printf writes the same three lines that `echo -e` with embedded \n produced,
	# without depending on echo's non-portable escape handling.
	printf '%s\n' \
	  "export SCALA_HOME=$SCALA_HOME" \
	  'export SPARK_MEM=40G' \
	  'export MASTER=local[8]' > spark-0.7.0/conf/spark-env.sh

	# TODO: set up a local tmp path for Spark; use Kryo for data serialization.

	# I will create a new tarball for Hive and Shark later.
	import socket
	import time
	import threading
	import SocketServer

	class ThreadedTCPRequestHandler(SocketServer.BaseRequestHandler):
	  """Streams the lines of the local file named "file" to the client.

	  One line is sent per second, in file order.
	  """

	  def handle(self):
	    # Read everything up front so the file handle is closed before the slow
	    # send loop starts, instead of being left open for the garbage collector.
	    with open("file", "r") as source:
	      lines = source.readlines()
	    for line in lines:
	      # sendall keeps writing until the whole line is out; a bare send() may
	      # transmit only part of it.
	      self.request.sendall(line)
	      time.sleep(1)

	class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):
	  """TCP server that, via ThreadingMixIn, handles each request in its own thread."""

	if __name__ == "__main__":
	  # Listen on a fixed local port. (Passing port 0 instead would let the OS
	  # select an arbitrary unused port.)
	  HOST, PORT = "localhost", 8083

	  server = ThreadedTCPServer((HOST, PORT), ThreadedTCPRequestHandler)

	  # Start a thread with the server -- that thread will then start one
	  # more thread for each request.
	  server_thread = threading.Thread(target=server.serve_forever)
	  # Exit the server thread when the main thread terminates.
	  server_thread.daemon = True
	  server_thread.start()
	  print("Server loop running in thread: %s" % server_thread.name)

	  # Keep the main thread alive by sleeping rather than spinning, which would
	  # pin a CPU core. Ctrl-C falls through to a clean shutdown.
	  try:
	    while True:
	      time.sleep(1)
	  except KeyboardInterrupt:
	    pass
	  finally:
	    server.shutdown()
	    server.server_close()