jpparis-orange/esPigArray

## esPigArray
#!/bin/bash

# my configuration
# elasticsearch-1.0.0
# elasticsearch-hadoop-yarn.jar from 1.3.0.M2
# hadoop-2.2.0-bin
# hive-0.12.0-bin
# pig-0.12.0/ with recompiled pig-0.12.0-withouthadoop.jar for yarn

ES_CLUSTER="localhost:9200"

# Endpoints used below, both derived from ES_CLUSTER.
index_url="http://${ES_CLUSTER}/hread"
type_url="${index_url}/doc"

# Request bodies. Each one is passed to curl -d exactly as written here,
# including the leading newline where there is one.
index_settings='{
  "settings" : {
    "index" : {
      "number_of_shards"   : 1,
      "number_of_replicas" : 0
    }
  }
}'
type_mapping='
{
  "doc" : {
    "properties" : {
      "my_id" : {
        "type" : "string"
      },
      "my_array" : {
        "type" : "string"
      }
    }
  }
}'
first_doc='
{
  "my_id":"doc1",
  "my_array": ["1.a", "1.b"]
}'
second_doc='
{
  "my_id":"doc2",
  "my_array": ["2.a", "2.b"]
}'

# Drop whatever a previous run left in the hread index.
curl -XDELETE "${index_url}?pretty"
# Recreate the index with one shard and no replicas.
curl -XPOST "${index_url}/?pretty" -d "${index_settings}"
# Declare my_id and my_array as string fields of type doc.
curl -XPOST "${type_url}/_mapping?pretty" -d "${type_mapping}"

# Index the two sample documents; my_array holds two strings in each.
for doc_body in "${first_doc}" "${second_doc}"; do
  curl -XPOST "${type_url}/?pretty" -d "${doc_body}"
done
# Ask ES to refresh the index (the author's note: wait for ES to be synced).
curl -XPOST "${index_url}/_refresh?pretty"

# Leave bash here: what follows the script is Pig Latin, not shell.
exit

######################################
# PIG COMMANDS
######################################

-- Case 1: load hread/doc through EsStorage, declaring my_array as a bag of
-- one-field tuples: { the_tuple: ( the_item: chararray ) }.
es_read = LOAD 'hread/doc' USING org.elasticsearch.hadoop.pig.EsStorage('') AS ( my_id: chararray, my_array:{ the_tuple: ( the_item: chararray ) } );
DUMP es_read;
-- Recorded result of the DUMP above: the job fails, and the innermost cause
-- in the trace is java.lang.OutOfMemoryError: Java heap space.
--
-- Unexpected System Error Occured: java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
--	at org.apache.pig.backend.hadoop23.PigJobControl.submit(PigJobControl.java:130)
--	at org.apache.pig.backend.hadoop23.PigJobControl.run(PigJobControl.java:191)
--	at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher$1.run(MapReduceLauncher.java:270)
-- Caused by: java.lang.reflect.InvocationTargetException
--	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
--	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
-- Caused by: java.lang.OutOfMemoryError: Java heap space
--	at java.util.Arrays.copyOf(Arrays.java:2367)
--	at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)

-- Case 2: same LOAD, but my_array is declared as a tuple with no field
-- schema: ().
es_read = LOAD 'hread/doc' USING org.elasticsearch.hadoop.pig.EsStorage('') AS ( my_id: chararray, my_array:() );
DUMP es_read;
-- Recorded result of the DUMP above: it succeeds, and each document's array
-- is printed as a tuple of its items.
--
-- OK
-- (doc1,(1.a,1.b))
-- (doc2,(2.a,2.b))

-- Case 3: keep the schema-less tuple declaration for my_array and try to
-- count its items per document with COUNT.
es_read = LOAD 'hread/doc' USING org.elasticsearch.hadoop.pig.EsStorage('') AS ( my_id: chararray, my_array:( ) );
the_gen = FOREACH es_read GENERATE my_id AS the_id, COUNT(my_array) AS the_count;
DUMP the_gen;
-- Recorded result: Pig rejects the script because it cannot select a COUNT
-- implementation for this argument.
-- NOTE(review): presumably because COUNT expects a bag while my_array is
-- declared as a tuple here -- confirm against the Pig built-in function docs.
--
-- <line 2, column 52> Could not infer the matching function for org.apache.pig.builtin.COUNT as multiple or none of them fit. Please use an explicit cast.
	#!/bin/bash

	# my configuration
	# elasticsearch-1.0.0
	# elasticsearch-hadoop-yarn.jar from 1.3.0.M2
	# hadoop-2.2.0-bin
	# hive-0.12.0-bin
	# pig-0.12.0/ with recompiled pig-0.12.0-withouthadoop.jar for yarn

	ES_CLUSTER="localhost:9200"

	# Endpoints used below, both derived from ES_CLUSTER.
	index_url="http://${ES_CLUSTER}/hread"
	type_url="${index_url}/doc"

	# Request bodies. Each one is passed to curl -d exactly as written here,
	# including the leading newline and the leading tab on every line.
	index_settings='{
	"settings" : {
	"index" : {
	"number_of_shards" : 1,
	"number_of_replicas" : 0
	}
	}
	}'
	type_mapping='
	{
	"doc" : {
	"properties" : {
	"my_id" : {
	"type" : "string"
	},
	"my_array" : {
	"type" : "string"
	}
	}
	}
	}'
	first_doc='
	{
	"my_id":"doc1",
	"my_array": ["1.a", "1.b"]
	}'
	second_doc='
	{
	"my_id":"doc2",
	"my_array": ["2.a", "2.b"]
	}'

	# Drop whatever a previous run left in the hread index.
	curl -XDELETE "${index_url}?pretty"
	# Recreate the index with one shard and no replicas.
	curl -XPOST "${index_url}/?pretty" -d "${index_settings}"
	# Declare my_id and my_array as string fields of type doc.
	curl -XPOST "${type_url}/_mapping?pretty" -d "${type_mapping}"

	# Index the two sample documents; my_array holds two strings in each.
	for doc_body in "${first_doc}" "${second_doc}"; do
	  curl -XPOST "${type_url}/?pretty" -d "${doc_body}"
	done
	# Ask ES to refresh the index (the author's note: wait for ES to be synced).
	curl -XPOST "${index_url}/_refresh?pretty"

	# Leave bash here: what follows the script is Pig Latin, not shell.
	exit

	######################################
	# PIG COMMANDS
	######################################

	-- Case 1: load hread/doc through EsStorage, declaring my_array as a bag of
	-- one-field tuples: { the_tuple: ( the_item: chararray ) }.
	es_read = LOAD 'hread/doc' USING org.elasticsearch.hadoop.pig.EsStorage('') AS ( my_id: chararray, my_array:{ the_tuple: ( the_item: chararray ) } );
	DUMP es_read;
	-- Recorded result of the DUMP above: the job fails, and the innermost cause
	-- in the trace is java.lang.OutOfMemoryError: Java heap space.
	--
	-- Unexpected System Error Occured: java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
	-- at org.apache.pig.backend.hadoop23.PigJobControl.submit(PigJobControl.java:130)
	-- at org.apache.pig.backend.hadoop23.PigJobControl.run(PigJobControl.java:191)
	-- at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher$1.run(MapReduceLauncher.java:270)
	-- Caused by: java.lang.reflect.InvocationTargetException
	-- at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	-- at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	-- Caused by: java.lang.OutOfMemoryError: Java heap space
	-- at java.util.Arrays.copyOf(Arrays.java:2367)
	-- at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)

	-- Case 2: same LOAD, but my_array is declared as a tuple with no field
	-- schema: ().
	es_read = LOAD 'hread/doc' USING org.elasticsearch.hadoop.pig.EsStorage('') AS ( my_id: chararray, my_array:() );
	DUMP es_read;
	-- Recorded result of the DUMP above: it succeeds, and each document's array
	-- is printed as a tuple of its items.
	--
	-- OK
	-- (doc1,(1.a,1.b))
	-- (doc2,(2.a,2.b))

	-- Case 3: keep the schema-less tuple declaration for my_array and try to
	-- count its items per document with COUNT.
	es_read = LOAD 'hread/doc' USING org.elasticsearch.hadoop.pig.EsStorage('') AS ( my_id: chararray, my_array:( ) );
	the_gen = FOREACH es_read GENERATE my_id AS the_id, COUNT(my_array) AS the_count;
	DUMP the_gen;
	-- Recorded result: Pig rejects the script because it cannot select a COUNT
	-- implementation for this argument.
	-- NOTE(review): presumably because COUNT expects a bag while my_array is
	-- declared as a tuple here -- confirm against the Pig built-in function docs.
	--
	-- <line 2, column 52> Could not infer the matching function for org.apache.pig.builtin.COUNT as multiple or none of them fit. Please use an explicit cast.