Francesco Simoneschi francescosimoneschi

## multiple_input.json
{
  "urls":[
    "s3n://playhaven-segmentation/production2/2013/08/08/15/syslog1-north.device.log.gz",
    "s3n://playhaven-segmentation/production2/2013/08/08/16/syslog1-north.device.log.gz"
  ]
}

## gist:6453960
# Cassandra storage config YAML

# NOTE:
#   See http://wiki.apache.org/cassandra/StorageConfiguration for
#   full explanations of configuration directives
# /NOTE

# The name of the cluster. This is mainly used to prevent machines in
# one logical cluster from joining another.
cluster_name: 'Test Cluster'

## gist:6430096
hadoop/segment-api-jobs/org/playhaven/segmentapi/hadoop/jobs/SegmentExpressionWrapper.py
src/segment_api/model/segment_expression.py

## gist:6395218
# Run the following from the master node, as the hduser user

# Format hdfs

hadoop namenode -format

# Start the HDFS daemons (NameNode and DataNodes)

/usr/sbin/start-dfs.sh

## gist:6392606
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
  <name>dfs.replication</name>
  <value>NUMBER_OF_REPLICA_BLOCKS</value>
  <description>Default block replication.

## gist:6392516
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
  <name>mapred.job.tracker</name>
  <value>master:54311</value>
  <description>The host and port that the MapReduce job tracker runs

## gist:6392491
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
  <name>hadoop.tmp.dir</name>
  <value>/large_volume/hadoop/tmp</value>
  <description>A base for other temporary directories.</description>

## gist:6392411
localhost
slave1
slave2
slaveN

## Node configuration
localhost

## Import sample file
Aug 22 22:00:00 api16-north metrics[29694]: [1377205200] {"game_id": "123456", "device_id": "78910", "preload": "1", "device_token": "dd0fe6d4521239d649773b8935fb4cc62e707d25"}
Aug 22 20:00:00 api16-north metrics[29694]: [1377205200] {"game_id": "123456", "device_id": "78910", "preload": "1", "device_token": "dd0fe6d4521239d649773b8935fb4cc62e707d25"}
Aug 22 19:00:00 api16-north metrics[29694]: [1377205200] {"game_id": "123456", "device_id": "78910", "testfield": "testvalue"}
	{
	"urls":[
	"s3n://playhaven-segmentation/production2/2013/08/08/15/syslog1-north.device.log.gz",
	"s3n://playhaven-segmentation/production2/2013/08/08/16/syslog1-north.device.log.gz"
	]
	}
	# Cassandra storage config YAML

	# NOTE:
	# See http://wiki.apache.org/cassandra/StorageConfiguration for
	# full explanations of configuration directives
	# /NOTE

	# The name of the cluster. This is mainly used to prevent machines in
	# one logical cluster from joining another.
	cluster_name: 'Test Cluster'
	hadoop/segment-api-jobs/org/playhaven/segmentapi/hadoop/jobs/SegmentExpressionWrapper.py
	src/segment_api/model/segment_expression.py
	# Run the following from the master node, as the hduser user

	# Format hdfs

	hadoop namenode -format

	# Start the HDFS daemons (NameNode and DataNodes)

	/usr/sbin/start-dfs.sh
	<?xml version="1.0"?>
	<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

	<!-- Put site-specific property overrides in this file. -->

	<configuration>
	<property>
	<name>dfs.replication</name>
	<value>NUMBER_OF_REPLICA_BLOCKS</value>
	<description>Default block replication.
	Aug 22 22:00:00 api16-north metrics[29694]: [1377205200] {"game_id": "123456", "device_id": "78910", "preload": "1", "device_token": "dd0fe6d4521239d649773b8935fb4cc62e707d25"}
	Aug 22 20:00:00 api16-north metrics[29694]: [1377205200] {"game_id": "123456", "device_id": "78910", "preload": "1", "device_token": "dd0fe6d4521239d649773b8935fb4cc62e707d25"}
	Aug 22 19:00:00 api16-north metrics[29694]: [1377205200] {"game_id": "123456", "device_id": "78910", "testfield": "testvalue"}