Cascading for the Impatient, Part 5
package impatient;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.Insert;
import cascading.operation.expression.ExpressionFunction;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.CountBy;
import cascading.pipe.assembly.Rename;
import cascading.pipe.assembly.Retain;
import cascading.pipe.assembly.SumBy;
import cascading.pipe.assembly.Unique;
import cascading.pipe.joiner.LeftJoin;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class
  Main
  {
  public static void
  main( String[] args )
    {
    String docPath = args[ 0 ];
    String wcPath = args[ 1 ];
    String stopPath = args[ 2 ];
    String tfidfPath = args[ 3 ];

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    Fields stop = new Fields( "stop" );
    Tap stopTap = new Hfs( new TextDelimited( stop, true, "\t" ), stopPath );
    Tap tfidfTap = new Hfs( new TextDelimited( true, "\t" ), tfidfPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    Fields fieldSelector = new Fields( "doc_id", "token" );
    Pipe docPipe = new Each( "token", text, splitter, fieldSelector );

    // define "ScrubFunction" to clean up the token stream
    Fields scrubArguments = new Fields( "doc_id", "token" );
    docPipe = new Each( docPipe, scrubArguments, new ScrubFunction( scrubArguments ), Fields.RESULTS );

    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after left join
    Pipe stopPipe = new Pipe( "stop" );
    Pipe tokenPipe = new HashJoin( docPipe, token, stopPipe, stop, new LeftJoin() );
    tokenPipe = new Each( tokenPipe, stop, new RegexFilter( "^$" ) );
    tokenPipe = new Retain( tokenPipe, fieldSelector );

    // one branch of the flow tallies the token counts for term frequency (TF)
    Pipe tfPipe = new Pipe( "TF", tokenPipe );
    Fields tf_count = new Fields( "tf_count" );
    tfPipe = new CountBy( tfPipe, new Fields( "doc_id", "token" ), tf_count );
    Fields tf_token = new Fields( "tf_token" );
    tfPipe = new Rename( tfPipe, token, tf_token );

    // one branch counts the number of documents (D)
    Fields doc_id = new Fields( "doc_id" );
    Fields tally = new Fields( "tally" );
    Fields rhs_join = new Fields( "rhs_join" );
    Fields n_docs = new Fields( "n_docs" );
    Pipe dPipe = new Unique( "D", tokenPipe, doc_id );
    dPipe = new Each( dPipe, new Insert( tally, 1 ), Fields.ALL );
    dPipe = new Each( dPipe, new Insert( rhs_join, 1 ), Fields.ALL );
    dPipe = new SumBy( dPipe, rhs_join, tally, n_docs, long.class );

    // one branch tallies the token counts for document frequency (DF)
    Pipe dfPipe = new Unique( "DF", tokenPipe, Fields.ALL );
    Fields df_count = new Fields( "df_count" );
    dfPipe = new CountBy( dfPipe, token, df_count );
    Fields df_token = new Fields( "df_token" );
    Fields lhs_join = new Fields( "lhs_join" );
    dfPipe = new Rename( dfPipe, token, df_token );
    dfPipe = new Each( dfPipe, new Insert( lhs_join, 1 ), Fields.ALL );

    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    Pipe idfPipe = new HashJoin( dfPipe, lhs_join, dPipe, rhs_join );

    // the IDF side of the join is smaller, so it goes on the RHS
    Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfPipe, df_token );

    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields( "tfidf" );
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction( tfidf, expression, Double.class );
    Fields tfidfArguments = new Fields( "tf_count", "df_count", "n_docs" );
    tfidfPipe = new Each( tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL );

    fieldSelector = new Fields( "tf_token", "doc_id", "tfidf" );
    tfidfPipe = new Retain( tfidfPipe, fieldSelector );
    tfidfPipe = new Rename( tfidfPipe, tf_token, token );

    // keep track of the word counts, which are useful for QA
    Pipe wcPipe = new Pipe( "wc", tfPipe );
    Fields count = new Fields( "count" );
    wcPipe = new SumBy( wcPipe, tf_token, tf_count, count, long.class );
    wcPipe = new Rename( wcPipe, tf_token, token );

    // additionally, sort by count
    wcPipe = new GroupBy( wcPipe, count, count );

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
      .setName( "tfidf" )
      .addSource( docPipe, docTap )
      .addSource( stopPipe, stopTap )
      .addTailSink( tfidfPipe, tfidfTap )
      .addTailSink( wcPipe, wcTap );

    // write a DOT file and run the flow
    Flow tfidfFlow = flowConnector.connect( flowDef );
    tfidfFlow.writeDOT( "dot/tfidf.dot" );
    tfidfFlow.complete();
    }
  }
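
The ScrubFunction used above lives alongside Main in the part5 source tree and is not included in this gist. As a minimal sketch of its shape — assuming the scrub only trims and lowercases tokens, whereas the class in the Impatient repo does a bit more cleanup — a custom Cascading Function looks like:

package impatient;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

public class
  ScrubFunction extends BaseOperation implements Function
  {
  public
  ScrubFunction( Fields fieldDeclaration )
    {
    // expects two arguments per tuple: doc_id and token
    super( 2, fieldDeclaration );
    }

  @Override
  public void
  operate( FlowProcess flowProcess, FunctionCall functionCall )
    {
    TupleEntry argument = functionCall.getArguments();
    String docId = argument.getString( 0 );
    // sketch of the scrub: trim whitespace and normalize case
    String token = argument.getString( 1 ).trim().toLowerCase();

    // drop empty tokens; emit the declared (doc_id, token) result fields
    if( token.length() > 0 )
      functionCall.getOutputCollector().add( new Tuple( docId, token ) );
    }
  }

Because Main applies it with Fields.RESULTS, only the two declared result fields flow downstream.
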
bash-3.2$ ls
LICENSE.txt README.md build build.gradle data docs src
bash-3.2$ hadoop version
Warning: $HADOOP_HOME is deprecated.
Hadoop 1.0.3
Subversion https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.0 -r 1335192
Compiled by hortonfo on Tue May 8 20:31:25 UTC 2012
From source with checksum e6b0c1e23dcf76907c5fecb4b832f3be
bash-3.2$ gradle -version
------------------------------------------------------------
Gradle 1.4
------------------------------------------------------------
Gradle build time: Monday, January 28, 2013 3:42:46 AM UTC
Groovy: 1.8.6
Ant: Apache Ant(TM) version 1.8.4 compiled on May 22 2012
Ivy: 2.2.0
JVM: 1.6.0_43 (Apple Inc. 20.14-b01-447)
OS: Mac OS X 10.7.5 x86_64
bash-3.2$ gradle clean jar
:clean
:compileJava
:processResources UP-TO-DATE
:classes
:jar
BUILD SUCCESSFUL
Total time: 4.073 secs
bash-3.2$ hadoop jar ./build/libs/impatient.jar data/rain.txt output/wc data/en.stop output/tfidf
Warning: $HADOOP_HOME is deprecated.
13/04/04 17:16:02 INFO util.HadoopUtil: resolving application jar from found main method on: impatient.Main
13/04/04 17:16:02 INFO planner.HadoopPlanner: using application jar: /Users/ceteri/src/concur/Impatient/part5/./build/libs/impatient.jar
13/04/04 17:16:02 INFO property.AppProps: using app.id: A9973837DCB25E00D2E73B5447252121
2013-04-04 17:16:02.629 java[11208:1903] Unable to load realm info from SCDynamicStore
13/04/04 17:16:02 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
13/04/04 17:16:02 WARN snappy.LoadSnappy: Snappy native library not loaded
13/04/04 17:16:02 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:03 INFO util.Version: Concurrent, Inc - Cascading 2.0.1
13/04/04 17:16:03 INFO flow.Flow: [tfidf] starting
13/04/04 17:16:03 INFO flow.Flow: [tfidf] source: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] source: Hfs["TextDelimited[['doc_id', 'text']->[ALL]]"]["data/rain.txt"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] sink: Hfs["TextDelimited[[UNKNOWN]->['doc_id', 'tfidf', 'token']]"]["output/tfidf"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] sink: Hfs["TextDelimited[[UNKNOWN]->['count', 'token']]"]["output/wc"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] parallel execution is enabled: false
13/04/04 17:16:03 INFO flow.Flow: [tfidf] starting jobs: 9
13/04/04 17:16:03 INFO flow.Flow: [tfidf] allocating threads: 1
13/04/04 17:16:03 INFO flow.FlowStep: [tfidf] starting step: (1/9)
13/04/04 17:16:03 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:03 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0001
13/04/04 17:16:03 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:03 INFO io.MultiInputSplit: current split input path: file:/Users/ceteri/src/concur/Impatient/part5/data/rain.txt
13/04/04 17:16:03 INFO mapred.MapTask: numReduceTasks: 0
13/04/04 17:16:03 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['doc_id', 'text']->[ALL]]"]["data/rain.txt"]"]
13/04/04 17:16:03 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"]
13/04/04 17:16:03 INFO hadoop.FlowMapper: sinking to: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:03 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:03 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:03 INFO mapred.LocalJobRunner:
13/04/04 17:16:03 INFO mapred.Task: Task attempt_local_0001_m_000000_0 is allowed to commit now
13/04/04 17:16:03 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_m_000000_0' to file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA
13/04/04 17:16:06 INFO mapred.LocalJobRunner: file:/Users/ceteri/src/concur/Impatient/part5/data/rain.txt:0+510
13/04/04 17:16:06 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/04/04 17:16:08 INFO flow.FlowStep: [tfidf] starting step: (2/9)
13/04/04 17:16:08 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:08 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0002
13/04/04 17:16:08 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:08 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000
13/04/04 17:16:08 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:08 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:08 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:08 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:08 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:08 INFO hadoop.FlowMapper: sinking to: GroupBy(TF)[by:[{2}:'doc_id', 'token']]
13/04/04 17:16:08 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:16:08 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:08 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:08 INFO mapred.Task: Task:attempt_local_0002_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:11 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000:0+1539
13/04/04 17:16:11 INFO mapred.Task: Task 'attempt_local_0002_m_000000_0' done.
13/04/04 17:16:11 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:11 INFO mapred.LocalJobRunner:
13/04/04 17:16:11 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:11 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 1321 bytes
13/04/04 17:16:11 INFO mapred.LocalJobRunner:
13/04/04 17:16:11 INFO hadoop.FlowReducer: sourcing from: GroupBy(TF)[by:[{2}:'doc_id', 'token']]
13/04/04 17:16:11 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['doc_id', 'tf_count', 'tf_token']]"][TF/16560/]
13/04/04 17:16:11 INFO mapred.Task: Task:attempt_local_0002_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:11 INFO mapred.LocalJobRunner:
13/04/04 17:16:11 INFO mapred.Task: Task attempt_local_0002_r_000000_0 is allowed to commit now
13/04/04 17:16:11 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0002_r_000000_0' to file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB
13/04/04 17:16:14 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:14 INFO mapred.Task: Task 'attempt_local_0002_r_000000_0' done.
13/04/04 17:16:18 INFO flow.FlowStep: [tfidf] starting step: (4/9)
13/04/04 17:16:18 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:18 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0003
13/04/04 17:16:18 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:18 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000
13/04/04 17:16:18 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:18 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:18 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:18 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:18 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:18 INFO hadoop.FlowMapper: sinking to: GroupBy(DF)[by:[{?}:ALL]]
13/04/04 17:16:18 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:18 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:18 INFO mapred.Task: Task:attempt_local_0003_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:21 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000:0+1539
13/04/04 17:16:21 INFO mapred.Task: Task 'attempt_local_0003_m_000000_0' done.
13/04/04 17:16:21 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:21 INFO mapred.LocalJobRunner:
13/04/04 17:16:21 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:21 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 1229 bytes
13/04/04 17:16:21 INFO mapred.LocalJobRunner:
13/04/04 17:16:21 INFO hadoop.FlowReducer: sourcing from: GroupBy(DF)[by:[{?}:ALL]]
13/04/04 17:16:21 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['token', 'df_count']]"][DF/8315/]
13/04/04 17:16:21 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:16:21 INFO mapred.Task: Task:attempt_local_0003_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:21 INFO mapred.LocalJobRunner:
13/04/04 17:16:21 INFO mapred.Task: Task attempt_local_0003_r_000000_0 is allowed to commit now
13/04/04 17:16:21 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0003_r_000000_0' to file:/tmp/hadoop-ceteri/DF_8315_17032D064A215335E3182ADA4ABC5EE3
13/04/04 17:16:24 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:24 INFO mapred.Task: Task 'attempt_local_0003_r_000000_0' done.
13/04/04 17:16:28 INFO flow.FlowStep: [tfidf] starting step: (6/9)
13/04/04 17:16:28 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:28 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0004
13/04/04 17:16:28 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:28 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000
13/04/04 17:16:28 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:28 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:28 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:28 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:28 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'tf_count', 'tf_token']]"][TF/16560/]
13/04/04 17:16:28 INFO hadoop.FlowMapper: sinking to: GroupBy(wc)[by:[{1}:'tf_token']]
13/04/04 17:16:28 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:16:28 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:28 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:28 INFO mapred.Task: Task:attempt_local_0004_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:31 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000:0+1573
13/04/04 17:16:31 INFO mapred.Task: Task 'attempt_local_0004_m_000000_0' done.
13/04/04 17:16:31 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:31 INFO mapred.LocalJobRunner:
13/04/04 17:16:31 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:31 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 561 bytes
13/04/04 17:16:31 INFO mapred.LocalJobRunner:
13/04/04 17:16:31 INFO hadoop.FlowReducer: sourcing from: GroupBy(wc)[by:[{1}:'tf_token']]
13/04/04 17:16:31 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['count', 'token']]"][wc/96581/]
13/04/04 17:16:31 INFO mapred.Task: Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:31 INFO mapred.LocalJobRunner:
13/04/04 17:16:31 INFO mapred.Task: Task attempt_local_0004_r_000000_0 is allowed to commit now
13/04/04 17:16:31 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0004_r_000000_0' to file:/tmp/hadoop-ceteri/wc_96581_B5483ECB7D3FFB07529931D0BAB0FE52
13/04/04 17:16:34 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:34 INFO mapred.Task: Task 'attempt_local_0004_r_000000_0' done.
13/04/04 17:16:38 INFO flow.FlowStep: [tfidf] starting step: (8/9)
13/04/04 17:16:38 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:38 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0005
13/04/04 17:16:38 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:38 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/DF_8315_17032D064A215335E3182ADA4ABC5EE3/part-00000
13/04/04 17:16:38 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:38 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:38 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:38 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:38 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['token', 'df_count']]"][DF/8315/]
13/04/04 17:16:38 INFO hadoop.FlowMapper: sinking to: GroupBy(DF)[by:[{1}:'token']]
13/04/04 17:16:38 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:38 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:38 INFO mapred.Task: Task:attempt_local_0005_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:41 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/DF_8315_17032D064A215335E3182ADA4ABC5EE3/part-00000:0+784
13/04/04 17:16:41 INFO mapred.Task: Task 'attempt_local_0005_m_000000_0' done.
13/04/04 17:16:41 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:41 INFO mapred.LocalJobRunner:
13/04/04 17:16:41 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:41 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 561 bytes
13/04/04 17:16:41 INFO mapred.LocalJobRunner:
13/04/04 17:16:41 INFO hadoop.FlowReducer: sourcing from: GroupBy(DF)[by:[{1}:'token']]
13/04/04 17:16:41 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['df_count', 'df_token', 'lhs_join']]"][DF/37139/]
13/04/04 17:16:41 INFO mapred.Task: Task:attempt_local_0005_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:41 INFO mapred.LocalJobRunner:
13/04/04 17:16:41 INFO mapred.Task: Task attempt_local_0005_r_000000_0 is allowed to commit now
13/04/04 17:16:41 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0005_r_000000_0' to file:/tmp/hadoop-ceteri/DF_37139_5D60E73D038B7A1DFA46ECF5292A68B4
13/04/04 17:16:44 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:44 INFO mapred.Task: Task 'attempt_local_0005_r_000000_0' done.
13/04/04 17:16:48 INFO flow.FlowStep: [tfidf] starting step: (9/9) output/wc
13/04/04 17:16:48 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:48 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0006
13/04/04 17:16:48 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:48 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/wc_96581_B5483ECB7D3FFB07529931D0BAB0FE52/part-00000
13/04/04 17:16:48 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:48 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:48 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:48 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:48 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['count', 'token']]"][wc/96581/]
13/04/04 17:16:48 INFO hadoop.FlowMapper: sinking to: GroupBy(wc)[by:[{1}:'count']]
13/04/04 17:16:48 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:48 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:48 INFO mapred.Task: Task:attempt_local_0006_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:51 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/wc_96581_B5483ECB7D3FFB07529931D0BAB0FE52/part-00000:0+784
13/04/04 17:16:51 INFO mapred.Task: Task 'attempt_local_0006_m_000000_0' done.
13/04/04 17:16:51 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:51 INFO mapred.LocalJobRunner:
13/04/04 17:16:51 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:51 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 654 bytes
13/04/04 17:16:51 INFO mapred.LocalJobRunner:
13/04/04 17:16:51 INFO hadoop.FlowReducer: sourcing from: GroupBy(wc)[by:[{1}:'count']]
13/04/04 17:16:51 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['count', 'token']]"]["output/wc"]"]
13/04/04 17:16:51 INFO mapred.Task: Task:attempt_local_0006_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:51 INFO mapred.LocalJobRunner:
13/04/04 17:16:51 INFO mapred.Task: Task attempt_local_0006_r_000000_0 is allowed to commit now
13/04/04 17:16:51 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0006_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part5/output/wc
13/04/04 17:16:54 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:54 INFO mapred.Task: Task 'attempt_local_0006_r_000000_0' done.
13/04/04 17:16:58 INFO flow.FlowStep: [tfidf] starting step: (3/9)
13/04/04 17:16:58 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:58 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0007
13/04/04 17:16:58 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:58 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000
13/04/04 17:16:58 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:58 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:58 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:58 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:58 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:58 INFO hadoop.FlowMapper: sinking to: GroupBy(D)[by:[{1}:'doc_id']]
13/04/04 17:16:58 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:58 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:58 INFO mapred.Task: Task:attempt_local_0007_m_000000_0 is done. And is in the process of commiting
13/04/04 17:17:01 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000:0+1539
13/04/04 17:17:01 INFO mapred.Task: Task 'attempt_local_0007_m_000000_0' done.
13/04/04 17:17:01 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:01 INFO mapred.LocalJobRunner:
13/04/04 17:17:01 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:17:01 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 127 bytes
13/04/04 17:17:01 INFO mapred.LocalJobRunner:
13/04/04 17:17:01 INFO hadoop.FlowReducer: sourcing from: GroupBy(D)[by:[{1}:'doc_id']]
13/04/04 17:17:01 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/38403/]
13/04/04 17:17:01 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:17:01 INFO mapred.Task: Task:attempt_local_0007_r_000000_0 is done. And is in the process of commiting
13/04/04 17:17:01 INFO mapred.LocalJobRunner:
13/04/04 17:17:01 INFO mapred.Task: Task attempt_local_0007_r_000000_0 is allowed to commit now
13/04/04 17:17:01 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0007_r_000000_0' to file:/tmp/hadoop-ceteri/D_38403_D3972FDA75B1A76BFF4DF639470CB650
13/04/04 17:17:04 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:17:04 INFO mapred.Task: Task 'attempt_local_0007_r_000000_0' done.
13/04/04 17:17:08 INFO flow.FlowStep: [tfidf] starting step: (7/9)
13/04/04 17:17:08 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:08 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0008
13/04/04 17:17:08 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:08 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/D_38403_D3972FDA75B1A76BFF4DF639470CB650/part-00000
13/04/04 17:17:08 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:17:08 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:17:08 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:17:08 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:17:08 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/38403/]
13/04/04 17:17:08 INFO hadoop.FlowMapper: sinking to: GroupBy(D)[by:[{1}:'rhs_join']]
13/04/04 17:17:08 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:17:08 INFO mapred.MapTask: Finished spill 0
13/04/04 17:17:08 INFO mapred.Task: Task:attempt_local_0008_m_000000_0 is done. And is in the process of commiting
13/04/04 17:17:11 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/D_38403_D3972FDA75B1A76BFF4DF639470CB650/part-00000:0+84
13/04/04 17:17:11 INFO mapred.Task: Task 'attempt_local_0008_m_000000_0' done.
13/04/04 17:17:11 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:11 INFO mapred.LocalJobRunner:
13/04/04 17:17:11 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:17:11 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 11 bytes
13/04/04 17:17:11 INFO mapred.LocalJobRunner:
13/04/04 17:17:11 INFO hadoop.FlowReducer: sourcing from: GroupBy(D)[by:[{1}:'rhs_join']]
13/04/04 17:17:11 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/43305/]
13/04/04 17:17:11 INFO mapred.Task: Task:attempt_local_0008_r_000000_0 is done. And is in the process of commiting
13/04/04 17:17:11 INFO mapred.LocalJobRunner:
13/04/04 17:17:11 INFO mapred.Task: Task attempt_local_0008_r_000000_0 is allowed to commit now
13/04/04 17:17:11 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0008_r_000000_0' to file:/tmp/hadoop-ceteri/D_43305_F519DB9808753107EC31FF765111D5CD
13/04/04 17:17:14 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:17:14 INFO mapred.Task: Task 'attempt_local_0008_r_000000_0' done.
13/04/04 17:17:18 INFO flow.FlowStep: [tfidf] starting step: (5/9) output/tfidf
13/04/04 17:17:18 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:18 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:18 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0009
13/04/04 17:17:18 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:18 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000
13/04/04 17:17:18 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:17:18 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:17:18 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:17:18 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:17:18 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'tf_count', 'tf_token']]"][TF/16560/]
13/04/04 17:17:18 INFO hadoop.FlowMapper: sinking to: CoGroup(TF*DF*D)[by:TF:[{1}:'tf_token']DF*D:[{1}:'df_token']]
13/04/04 17:17:18 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:17:18 INFO mapred.MapTask: Finished spill 0
13/04/04 17:17:18 INFO mapred.Task: Task:attempt_local_0009_m_000000_0 is done. And is in the process of commiting
13/04/04 17:17:21 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000:0+1573
13/04/04 17:17:21 INFO mapred.Task: Task 'attempt_local_0009_m_000000_0' done.
13/04/04 17:17:21 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:21 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/DF_37139_5D60E73D038B7A1DFA46ECF5292A68B4/part-00000
13/04/04 17:17:21 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:17:21 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:17:21 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:17:21 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:17:21 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['df_count', 'df_token', 'lhs_join']]"][DF/37139/]
13/04/04 17:17:21 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/43305/]
13/04/04 17:17:21 INFO hadoop.FlowMapper: sinking to: CoGroup(TF*DF*D)[by:TF:[{1}:'tf_token']DF*D:[{1}:'df_token']]
13/04/04 17:17:21 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:21 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:17:21 INFO mapred.MapTask: Finished spill 0
13/04/04 17:17:21 INFO mapred.Task: Task:attempt_local_0009_m_000001_0 is done. And is in the process of commiting
13/04/04 17:17:24 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/DF_37139_5D60E73D038B7A1DFA46ECF5292A68B4/part-00000:0+846
13/04/04 17:17:24 INFO mapred.Task: Task 'attempt_local_0009_m_000001_0' done.
13/04/04 17:17:24 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:24 INFO mapred.LocalJobRunner:
13/04/04 17:17:24 INFO mapred.Merger: Merging 2 sorted segments
13/04/04 17:17:24 INFO mapred.Merger: Down to the last merge-pass, with 2 segments left of total size: 2176 bytes
13/04/04 17:17:24 INFO mapred.LocalJobRunner:
13/04/04 17:17:24 INFO hadoop.FlowReducer: sourcing from: CoGroup(TF*DF*D)[by:TF:[{1}:'tf_token']DF*D:[{1}:'df_token']]
13/04/04 17:17:24 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['doc_id', 'tfidf', 'token']]"]["output/tfidf"]"]
13/04/04 17:17:24 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:24 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:24 INFO mapred.Task: Task:attempt_local_0009_r_000000_0 is done. And is in the process of commiting
13/04/04 17:17:24 INFO mapred.LocalJobRunner:
13/04/04 17:17:24 INFO mapred.Task: Task attempt_local_0009_r_000000_0 is allowed to commit now
13/04/04 17:17:24 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0009_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part5/output/tfidf
13/04/04 17:17:27 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:17:27 INFO mapred.Task: Task 'attempt_local_0009_r_000000_0' done.
13/04/04 17:17:28 INFO util.Hadoop18TapUtil: deleting temp path output/wc/_temporary
13/04/04 17:17:28 INFO util.Hadoop18TapUtil: deleting temp path output/tfidf/_temporary
bash-3.2$ more output/wc/part-00000
count token
1 women
1 australia
1 broken
1 california's
1 cause
1 cloudcover
1 death
1 deserts
1 downwind
1 dvd
1 effect
1 known
1 air
1 less
1 lies
1 mountainous
1 primary
1 produces
1 ranges
1 secrets
1 sinking
1 such
1 valley
2 land
2 leeward
2 lee
3 dry
3 mountain
4 area
4 shadow
5 rain
bash-3.2$ more output/tfidf/part-00000
doc_id tfidf token
doc02 0.9162907318741551 air
doc01 0.44628710262841953 area
doc03 0.22314355131420976 area
doc02 0.22314355131420976 area
doc05 0.9162907318741551 australia
doc05 0.9162907318741551 broken
doc04 0.9162907318741551 california's
doc04 0.9162907318741551 cause
doc02 0.9162907318741551 cloudcover
doc04 0.9162907318741551 death
doc04 0.9162907318741551 deserts
doc03 0.9162907318741551 downwind
doc02 0.22314355131420976 dry
doc01 0.22314355131420976 dry
doc03 0.22314355131420976 dry
doc05 0.9162907318741551 dvd
doc04 0.9162907318741551 effect
doc04 0.9162907318741551 known
doc05 0.5108256237659907 land
doc03 0.5108256237659907 land
doc01 0.5108256237659907 lee
doc02 0.5108256237659907 lee
doc04 0.5108256237659907 leeward
doc03 0.5108256237659907 leeward
doc02 0.9162907318741551 less
doc03 0.9162907318741551 lies
doc03 0.22314355131420976 mountain
doc02 0.22314355131420976 mountain
doc04 0.22314355131420976 mountain
doc01 0.9162907318741551 mountainous
doc04 0.9162907318741551 primary
doc02 0.9162907318741551 produces
doc01 0.0 rain
doc02 0.0 rain
doc04 0.0 rain
doc03 0.0 rain
doc04 0.9162907318741551 ranges
doc05 0.9162907318741551 secrets
doc04 0.0 shadow
doc02 0.0 shadow
doc01 0.0 shadow
doc03 0.0 shadow
doc02 0.9162907318741551 sinking
doc04 0.9162907318741551 such
doc04 0.9162907318741551 valley
doc05 0.9162907318741551 women
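
A quick spot-check of these weights (mine, not part of the gist): with five documents, the expression in Main reduces to tf_count * ln( 5 / ( 1 + df_count ) ). For "air", which appears once in a single document, that is ln( 2.5 ); "rain" occurs five times across four documents (per the wc output above), so its IDF term is ln( 5 / 5 ) = 0, which is why every "rain" row is 0.0:

public class
  TfIdfCheck
  {
  public static void
  main( String[] args )
    {
    // "air": tf_count = 1, df_count = 1, n_docs = 5
    System.out.println( 1 * Math.log( 5.0 / ( 1.0 + 1 ) ) ); // 0.9162907318741551
    // "area" in doc01: tf_count = 2, df_count = 3
    System.out.println( 2 * Math.log( 5.0 / ( 1.0 + 3 ) ) ); // 0.44628710262841953
    // "rain": df_count = 4, so the log term is ln( 1.0 ) = 0
    System.out.println( 1 * Math.log( 5.0 / ( 1.0 + 4 ) ) ); // 0.0
    }
  }

The printed values match the rows in output/tfidf/part-00000 above.
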
bash-3.2$
bash-3.2$ pig -version
Warning: $HADOOP_HOME is deprecated.
Apache Pig version 0.10.0 (r1328203)
compiled Apr 19 2012, 22:54:12
bash-3.2$ pig -p docPath=./data/rain.txt -p wcPath=./output/wc -p stopPath=./data/en.stop -p tfidfPath=./output/tfidf ./src/scripts/tfidf.pig
Warning: $HADOOP_HOME is deprecated.
2012-08-28 10:29:44,460 [main] INFO org.apache.pig.Main - Apache Pig version 0.10.0 (r1328203) compiled Apr 19 2012, 22:54:12
2012-08-28 10:29:44,460 [main] INFO org.apache.pig.Main - Logging error messages to: /Users/ceteri/src/concur/Impatient/part5/pig_1346174984458.log
2012-08-28 10:29:44.558 java[74234:1903] Unable to load realm info from SCDynamicStore
2012-08-28 10:29:44,760 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: file:///
2012-08-28 10:29:45,539 [main] WARN org.apache.pig.PigServer - Encountered Warning USING_OVERLOADED_FUNCTION 1 time(s).
2012-08-28 10:29:45,539 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 2 time(s).
2012-08-28 10:29:45,715 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2012-08-28 10:29:45,728 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,731 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,732 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,743 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,743 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 8
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 map-reduce splittees.
2012-08-28 10:29:45,755 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 out of total 4 MR operators.
2012-08-28 10:29:45,755 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 8
2012-08-28 10:29:45,815 [main] WARN org.apache.pig.PigServer - Encountered Warning USING_OVERLOADED_FUNCTION 1 time(s).
2012-08-28 10:29:45,815 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 2 time(s).
2012-08-28 10:29:45,818 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig features used in the script: HASH_JOIN,GROUP_BY,DISTINCT,FILTER,CROSS
2012-08-28 10:29:45,858 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2012-08-28 10:29:45,862 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,863 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,864 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,866 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,866 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 8
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 map-reduce splittees.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 out of total 4 MR operators.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 8
2012-08-28 10:29:45,885 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:29:45,894 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:29:45,897 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job911310665554333174.jar
2012-08-28 10:29:50,739 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job911310665554333174.jar created
2012-08-28 10:29:50,747 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:29:50,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1054
2012-08-28 10:29:50,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:29:50,798 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2012-08-28 10:29:50,807 [Thread-6] WARN org.apache.hadoop.util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2012-08-28 10:29:50,902 [Thread-6] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:29:50,902 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:29:50,907 [Thread-6] WARN org.apache.hadoop.io.compress.snappy.LoadSnappy - Snappy native library not loaded
2012-08-28 10:29:50,909 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:29:50,915 [Thread-6] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:29:50,915 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:29:50,915 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:29:51,088 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:29:51,100 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/Users/ceteri/src/concur/Impatient/part5/data/en.stop:0+544
2012-08-28 10:29:51,105 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:29:51,211 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:29:51,213 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:29:51,252 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_en.stop
2012-08-28 10:29:51,264 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:29:51,273 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:29:51,274 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:29:51,299 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0001
2012-08-28 10:29:51,300 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 0% complete
2012-08-28 10:29:54,073 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:54,074 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_m_000000_0' done.
2012-08-28 10:29:54,077 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:29:54,080 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/Users/ceteri/src/concur/Impatient/part5/data/rain.txt:0+510
2012-08-28 10:29:54,080 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:29:54,142 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:29:54,142 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:29:54,156 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_rain.txt
2012-08-28 10:29:54,163 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:29:54,165 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:29:54,170 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_m_000001_0 is done. And is in the process of commiting
2012-08-28 10:29:57,076 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,076 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_m_000001_0' done.
2012-08-28 10:29:57,087 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:29:57,087 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,090 [Thread-7] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments
2012-08-28 10:29:57,097 [Thread-7] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 3284 bytes
2012-08-28 10:29:57,097 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,129 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:29:57,129 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,130 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0001_r_000000_0 is allowed to commit now
2012-08-28 10:29:57,132 [Thread-7] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0001_r_000000_0' to file:/tmp/temp-2375117/tmp-1797990455
2012-08-28 10:30:00,088 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:00,089 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_r_000000_0' done.
2012-08-28 10:30:00,091 [Thread-7] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:01,316 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 12% complete
2012-08-28 10:30:01,318 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0001
2012-08-28 10:30:01,320 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:01,320 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:01,320 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job6866114866035432926.jar
2012-08-28 10:30:05,039 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job6866114866035432926.jar created
2012-08-28 10:30:05,043 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:05,043 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting identity combiner class.
2012-08-28 10:30:05,044 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-08-28 10:30:05,044 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:05,050 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:05,051 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:05,051 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job750153399916515834.jar
2012-08-28 10:30:08,741 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job750153399916515834.jar created
2012-08-28 10:30:08,744 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:08,748 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-08-28 10:30:08,748 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:08,756 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:08,757 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:08,757 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job461043214572580021.jar
2012-08-28 10:30:12,371 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job461043214572580021.jar created
2012-08-28 10:30:12,375 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:12,376 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting identity combiner class.
2012-08-28 10:30:12,377 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-08-28 10:30:12,377 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:12,383 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 3 map-reduce job(s) waiting for submission.
2012-08-28 10:30:12,431 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:12,431 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:12,431 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:12,508 [Thread-12] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:12,513 [Thread-12] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1797990455/part-r-00000:0+1037
2012-08-28 10:30:12,515 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:12,526 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:12,526 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:12,543 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:12,546 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:12,548 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0002_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:12,558 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:12,558 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:12,558 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:12,605 [Thread-15] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:12,607 [Thread-15] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1797990455/part-r-00000:0+1037
2012-08-28 10:30:12,607 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:12,672 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:12,673 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:12,693 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:12,705 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:12,708 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0003_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:12,720 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:12,720 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:12,720 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:12,767 [Thread-18] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:12,769 [Thread-18] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1797990455/part-r-00000:0+1037
2012-08-28 10:30:12,769 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:12,885 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0002
2012-08-28 10:30:12,885 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0003
2012-08-28 10:30:12,886 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0004
2012-08-28 10:30:12,890 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:12,890 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:12,910 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:12,914 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:12,917 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0004_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,504 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,504 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0002_m_000000_0' done.
2012-08-28 10:30:15,512 [Thread-12] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:15,512 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,513 [Thread-12] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:15,513 [Thread-12] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 87 bytes
2012-08-28 10:30:15,513 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,520 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0002_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,522 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,522 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0002_r_000000_0 is allowed to commit now
2012-08-28 10:30:15,524 [Thread-12] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0002_r_000000_0' to file:/tmp/temp-2375117/tmp-1840385936
2012-08-28 10:30:15,604 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,605 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0003_m_000000_0' done.
2012-08-28 10:30:15,614 [Thread-15] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:15,614 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,615 [Thread-15] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:15,616 [Thread-15] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 1689 bytes
2012-08-28 10:30:15,616 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,637 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0003_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,639 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,640 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0003_r_000000_0 is allowed to commit now
2012-08-28 10:30:15,643 [Thread-15] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0003_r_000000_0' to file:/tmp/temp-2375117/tmp-633078531
2012-08-28 10:30:15,765 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,765 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0004_m_000000_0' done.
2012-08-28 10:30:15,773 [Thread-18] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:15,773 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,773 [Thread-18] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:15,774 [Thread-18] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 1229 bytes
2012-08-28 10:30:15,774 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,787 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,789 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,789 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0004_r_000000_0 is allowed to commit now
2012-08-28 10:30:15,792 [Thread-18] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0004_r_000000_0' to file:/tmp/temp-2375117/tmp-185348010
2012-08-28 10:30:18,515 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:18,516 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0002_r_000000_0' done.
2012-08-28 10:30:18,517 [Thread-12] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:18,608 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:18,609 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0003_r_000000_0' done.
2012-08-28 10:30:18,610 [Thread-15] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:18,767 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:18,768 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0004_r_000000_0' done.
2012-08-28 10:30:18,769 [Thread-18] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:22,898 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 50% complete
2012-08-28 10:30:22,899 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0002
2012-08-28 10:30:22,900 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0003
2012-08-28 10:30:22,901 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0004
2012-08-28 10:30:22,901 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:22,902 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:22,902 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job497102663141037932.jar
2012-08-28 10:30:26,702 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job497102663141037932.jar created
2012-08-28 10:30:26,703 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:26,710 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:26,710 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:26,710 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job416157849282719047.jar
2012-08-28 10:30:30,383 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job416157849282719047.jar created
2012-08-28 10:30:30,384 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:30,387 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=997
2012-08-28 10:30:30,387 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:30,392 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 2 map-reduce job(s) waiting for submission.
2012-08-28 10:30:30,442 [Thread-21] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:30,442 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:30,442 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:30,491 [Thread-22] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:30,495 [Thread-22] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-185348010/part-r-00000:0+997
2012-08-28 10:30:30,495 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:30,541 [Thread-21] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:30,541 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:30,541 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:30,550 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:30,550 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:30,579 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:30,590 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:30,593 [Thread-25] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:30,593 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0005_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:30,596 [Thread-25] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1840385936/part-r-00000:0+60
2012-08-28 10:30:30,596 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:30,685 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:30,685 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:30,698 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:30,705 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:30,706 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0006_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:30,893 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0005
2012-08-28 10:30:30,893 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0006
2012-08-28 10:30:33,487 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,487 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0005_m_000000_0' done.
2012-08-28 10:30:33,493 [Thread-22] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:33,493 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,493 [Thread-22] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:33,494 [Thread-22] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 809 bytes
2012-08-28 10:30:33,494 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,508 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0005_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:33,509 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,509 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0005_r_000000_0 is allowed to commit now
2012-08-28 10:30:33,511 [Thread-22] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0005_r_000000_0' to file:/tmp/temp-2375117/tmp155933281
2012-08-28 10:30:33,590 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,591 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0006_m_000000_0' done.
2012-08-28 10:30:33,597 [Thread-25] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:33,597 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,598 [Thread-25] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:33,598 [Thread-25] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 25 bytes
2012-08-28 10:30:33,598 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,608 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0006_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:33,609 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:33,610 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0006_r_000000_0 is allowed to commit now
2012-08-28 10:30:33,611 [Thread-25] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0006_r_000000_0' to file:/tmp/temp-2375117/tmp-1694028400
2012-08-28 10:30:36,490 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:36,490 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0005_r_000000_0' done.
2012-08-28 10:30:36,491 [Thread-22] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:36,593 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:36,593 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0006_r_000000_0' done.
2012-08-28 10:30:36,594 [Thread-25] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:40,914 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 75% complete
2012-08-28 10:30:40,915 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0005
2012-08-28 10:30:40,916 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0006
2012-08-28 10:30:40,917 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:40,917 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:40,917 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job6401439778182906434.jar
2012-08-28 10:30:44,617 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job6401439778182906434.jar created
2012-08-28 10:30:44,619 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:44,621 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=792
2012-08-28 10:30:44,621 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:44,624 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2012-08-28 10:30:44,672 [Thread-28] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:44,672 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:44,672 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:44,675 [Thread-28] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:44,675 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:44,675 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:44,710 [Thread-29] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:44,713 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp155933281/part-r-00000:0+776
2012-08-28 10:30:44,716 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:44,731 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:44,731 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:44,738 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_tmp155933281
2012-08-28 10:30:44,741 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:44,743 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:44,745 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0007_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:45,125 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0007
2012-08-28 10:30:47,706 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:47,707 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0007_m_000000_0' done.
2012-08-28 10:30:47,711 [Thread-29] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:47,714 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1694028400/part-r-00000:0+16
2012-08-28 10:30:47,714 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:47,724 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:47,724 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:47,730 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_tmp-1694028400
2012-08-28 10:30:47,731 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:47,732 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:47,734 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0007_m_000001_0 is done. And is in the process of commiting
2012-08-28 10:30:50,709 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:50,709 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0007_m_000001_0' done.
2012-08-28 10:30:50,719 [Thread-29] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:50,719 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:50,720 [Thread-29] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments
2012-08-28 10:30:50,720 [Thread-29] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 956 bytes
2012-08-28 10:30:50,720 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:50,734 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0007_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:50,736 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:50,736 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0007_r_000000_0 is allowed to commit now
2012-08-28 10:30:50,738 [Thread-29] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0007_r_000000_0' to file:/tmp/temp-2375117/tmp277297603
2012-08-28 10:30:53,715 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:53,716 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0007_r_000000_0' done.
2012-08-28 10:30:53,717 [Thread-29] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:55,142 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 87% complete
2012-08-28 10:30:55,143 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0007
2012-08-28 10:30:55,143 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:55,143 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:55,144 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job1907337562856891341.jar
2012-08-28 10:30:58,780 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job1907337562856891341.jar created
2012-08-28 10:30:58,782 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:58,783 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=2358
2012-08-28 10:30:58,783 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:58,790 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2012-08-28 10:30:58,827 [Thread-33] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:58,827 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:58,828 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:58,829 [Thread-33] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:58,829 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:58,829 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:58,858 [Thread-34] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:58,860 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-633078531/part-r-00000:0+1365
2012-08-28 10:30:58,860 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:58,873 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:58,873 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:58,878 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_tmp-633078531
2012-08-28 10:30:58,881 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:58,883 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:58,884 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0008_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:59,292 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0008
2012-08-28 10:31:01,855 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:31:01,856 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0008_m_000000_0' done.
2012-08-28 10:31:01,860 [Thread-34] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:31:01,862 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp277297603/part-r-00000:0+993
2012-08-28 10:31:01,863 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:31:01,874 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:31:01,874 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:31:01,879 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_tmp277297603
2012-08-28 10:31:01,881 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:31:01,883 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:31:01,884 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0008_m_000001_0 is done. And is in the process of commiting
2012-08-28 10:31:04,857 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:31:04,858 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0008_m_000001_0' done.
2012-08-28 10:31:04,866 [Thread-34] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:31:04,866 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:31:04,867 [Thread-34] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments
2012-08-28 10:31:04,867 [Thread-34] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 2439 bytes
2012-08-28 10:31:04,867 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:31:04,888 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0008_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:31:04,889 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:31:04,890 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0008_r_000000_0 is allowed to commit now
2012-08-28 10:31:04,892 [Thread-34] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0008_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part5/output/tfidf
2012-08-28 10:31:07,862 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:31:07,863 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0008_r_000000_0' done.
2012-08-28 10:31:07,865 [Thread-34] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:31:09,312 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0008
2012-08-28 10:31:09,314 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 100% complete
2012-08-28 10:31:09,317 [main] INFO org.apache.pig.tools.pigstats.SimplePigStats - Script Statistics:
HadoopVersion PigVersion UserId StartedAt FinishedAt Features
1.0.3 0.10.0 ceteri 2012-08-28 10:29:45 2012-08-28 10:31:09 HASH_JOIN,GROUP_BY,DISTINCT,FILTER,CROSS
Success!
Job Stats (time in seconds):
JobId Maps Reduces MaxMapTime MinMapTIme AvgMapTime MaxReduceTime MinReduceTime AvgReduceTime Alias Feature Outputs
job_local_0001 1 1 n/a n/a n/a n/a n/a n/a docPipe,stopPipe,tokenPipe HASH_JOIN
job_local_0002 1 1 n/a n/a n/a n/a n/a n/a dPipe DISTINCT
job_local_0003 1 1 n/a n/a n/a n/a n/a n/a tfGroups,tfPipe GROUP_BY,COMBINER
job_local_0004 1 1 n/a n/a n/a n/a n/a n/a DISTINCT
job_local_0005 1 1 n/a n/a n/a n/a n/a n/a dfGroups,dfPipe,idfPipe GROUP_BY,COMBINER
job_local_0006 1 1 n/a n/a n/a n/a n/a n/a dGroups,dPipe,idfPipe GROUP_BY,COMBINER
job_local_0007 1 1 n/a n/a n/a n/a n/a n/a idfPipe
job_local_0008 1 1 n/a n/a n/a n/a n/a n/a tfidfPipe HASH_JOIN file:///Users/ceteri/src/concur/Impatient/part5/output/tfidf,
Input(s):
Successfully read 0 records from: "file:///Users/ceteri/src/concur/Impatient/part5/data/en.stop"
Successfully read 0 records from: "file:///Users/ceteri/src/concur/Impatient/part5/data/rain.txt"
Output(s):
Successfully stored 0 records in: "file:///Users/ceteri/src/concur/Impatient/part5/output/tfidf"
Counters:
Total records written : 0
Total bytes written : 0
Spillable Memory Manager spill count : 0
Total bags proactively spilled: 0
Total records proactively spilled: 0
Job DAG:
job_local_0001 -> job_local_0003,job_local_0002,job_local_0004,
job_local_0003 -> job_local_0008,
job_local_0002 -> job_local_0006,
job_local_0006 -> job_local_0007,
job_local_0004 -> job_local_0005,
job_local_0005 -> job_local_0007,
job_local_0007 -> job_local_0008,
job_local_0008
2012-08-28 10:31:09,317 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!
bash-3.2$ cat output/tfidf/part-r-00000
doc02 0.9162907318741551 air
doc02 0.22314355131420976 dry
doc01 0.22314355131420976 dry
doc03 0.22314355131420976 dry
doc05 0.9162907318741551 dvd
doc02 0.5108256237659907 lee
doc01 0.5108256237659907 lee
doc03 0.22314355131420976 area
doc01 0.44628710262841953 area
doc02 0.22314355131420976 area
doc05 0.5108256237659907 land
doc03 0.5108256237659907 land
doc02 0.9162907318741551 less
doc03 0.9162907318741551 lies
doc02 0.0 rain
doc04 0.0 rain
doc01 0.0 rain
doc03 0.0 rain
doc04 0.9162907318741551 such
doc04 0.9162907318741551 cause
doc04 0.9162907318741551 death
doc04 0.9162907318741551 known
doc05 0.9162907318741551 women
doc05 0.9162907318741551 broken
doc04 0.9162907318741551 effect
doc04 0.9162907318741551 ranges
doc04 0.0 shadow
doc03 0.0 shadow
doc02 0.0 shadow
doc01 0.0 shadow
doc04 0.9162907318741551 valley
doc04 0.9162907318741551 deserts
doc03 0.5108256237659907 leeward
doc04 0.5108256237659907 leeward
doc04 0.9162907318741551 primary
doc05 0.9162907318741551 secrets
doc02 0.9162907318741551 sinking
doc03 0.9162907318741551 downwind
doc04 0.22314355131420976 mountain
doc02 0.22314355131420976 mountain
doc03 0.22314355131420976 mountain
doc02 0.9162907318741551 produces
doc05 0.9162907318741551 australia
doc02 0.9162907318741551 cloudcover
doc01 0.9162907318741551 mountainous
doc04 0.9162907318741551 california's
bash-3.2$
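A quick sanity check on those scores (my arithmetic, not part of the run): the Pig script below computes

    tfidf = tf_count * ln( n_docs / (1 + df_count) )

and Pig's LOG is the natural log. With n_docs = 5 and tf_count = 1 for these tokens: "air" appears in 1 document, giving ln(5/2) ≈ 0.9163; "lee" appears in 2, giving ln(5/3) ≈ 0.5108; "dry" appears in 3, giving ln(5/4) ≈ 0.2231; and "rain" appears in 4, giving ln(5/5) = 0. All of these match the output above.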
-- load the documents and the stop word list, filtering out the TSV header rows
docPipe = LOAD '$docPath' USING PigStorage('\t', 'tagsource') AS (doc_id, text);
docPipe = FILTER docPipe BY doc_id != 'doc_id';
stopPipe = LOAD '$stopPath' USING PigStorage('\t', 'tagsource') AS (stop:chararray);
stopPipe = FILTER stopPipe BY stop != 'stop';
-- specify a regex operation to split the "document" text lines into a token stream
tokenPipe = FOREACH docPipe GENERATE doc_id, FLATTEN(TOKENIZE(LOWER(text), ' [](),.')) AS token;
tokenPipe = FILTER tokenPipe BY token MATCHES '\\w.*'; -- drop empty and punctuation-only tokens
-- perform a left join to remove stop words, discarding the rows
-- which joined with stop words, i.e., were non-null after left join
tokenPipe = JOIN tokenPipe BY token LEFT, stopPipe BY stop;
tokenPipe = FILTER tokenPipe BY stopPipe::stop is NULL;
-- DUMP tokenPipe;
-- one branch of the flow tallies the token counts for term frequency (TF)
tfGroups = GROUP tokenPipe BY (doc_id, token);
tfPipe = FOREACH tfGroups GENERATE FLATTEN(group) AS (doc_id, tf_token), COUNT(tokenPipe) AS tf_count;
-- DUMP tfPipe;
-- one branch counts the number of documents (D)
dPipe = FOREACH tokenPipe GENERATE doc_id;
dPipe = DISTINCT dPipe;
dGroups = GROUP dPipe ALL;
dPipe = FOREACH dGroups GENERATE COUNT(dPipe) AS n_docs;
-- DUMP dPipe;
-- one branch tallies the token counts for document frequency (DF)
dfPipe = DISTINCT tokenPipe;
dfGroups = GROUP dfPipe BY token;
dfPipe = FOREACH dfGroups GENERATE group AS df_token, COUNT(dfPipe) AS df_count;
-- DUMP dfPipe;
-- join to bring together all the components for calculating TF-IDF;
-- dPipe holds a single row, so CROSS simply appends n_docs to every row of dfPipe
idfPipe = CROSS dfPipe, dPipe;
tfidfPipe = JOIN tfPipe BY tf_token, idfPipe BY df_token;
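-- compute the weight as tf_count * ln( n_docs / (1 + df_count) ); Pig's LOG is the natural log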
tfidfPipe = FOREACH tfidfPipe GENERATE doc_id, (double) tf_count * LOG( (double) n_docs / ( 1.0 + (double) df_count ) ) AS tfidf, tf_token AS token;
-- output
STORE tfidfPipe INTO '$tfidfPath' USING PigStorage('\t', 'tagsource');
EXPLAIN -out dot/tfidf_pig.dot -dot tfidfPipe;
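-- the EXPLAIN statement above writes the plan for tfidfPipe to dot/tfidf_pig.dot;
-- assuming Graphviz is installed, it can be rendered for inspection, e.g.:
--   dot -Tpng dot/tfidf_pig.dot -o dot/tfidf_pig.png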
-- determine the word counts
-- NOTE: this part dies in Apache Pig without helpful exception messages,
-- so it is left commented out
--tokenGroups = GROUP tokenPipe BY token;
--wcPipe = FOREACH tokenGroups GENERATE COUNT(tokenPipe) AS count, group AS token;
--wcPipe = ORDER wcPipe BY count DESC;
--STORE wcPipe INTO '$wcPath' USING PigStorage('\t', 'tagsource');
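-- one possible culprit, strictly an untested guess: the alias "count" shadows
-- the COUNT built-in, which some Pig releases handle poorly; a sketch of the
-- same step with a neutral alias, left commented out to match the above
-- (uncomment to try):
--tokenGroups = GROUP tokenPipe BY token;
--wcPipe = FOREACH tokenGroups GENERATE COUNT(tokenPipe) AS tally, group AS token;
--wcPipe = ORDER wcPipe BY tally DESC;
--STORE wcPipe INTO '$wcPath' USING PigStorage('\t', 'tagsource');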