Cascading for the Impatient, Part 5
package impatient;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.Insert;
import cascading.operation.expression.ExpressionFunction;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.CountBy;
import cascading.pipe.assembly.Rename;
import cascading.pipe.assembly.Retain;
import cascading.pipe.assembly.SumBy;
import cascading.pipe.assembly.Unique;
import cascading.pipe.joiner.LeftJoin;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class
  Main
  {
  public static void
  main( String[] args )
    {
    String docPath = args[ 0 ];
    String wcPath = args[ 1 ];
    String stopPath = args[ 2 ];
    String tfidfPath = args[ 3 ];

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    Fields stop = new Fields( "stop" );
    Tap stopTap = new Hfs( new TextDelimited( stop, true, "\t" ), stopPath );
    Tap tfidfTap = new Hfs( new TextDelimited( true, "\t" ), tfidfPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    Fields fieldSelector = new Fields( "doc_id", "token" );
    Pipe docPipe = new Each( "token", text, splitter, fieldSelector );

    // define "ScrubFunction" to clean up the token stream
    Fields scrubArguments = new Fields( "doc_id", "token" );
    docPipe = new Each( docPipe, scrubArguments, new ScrubFunction( scrubArguments ), Fields.RESULTS );

    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after left join
    Pipe stopPipe = new Pipe( "stop" );
    Pipe tokenPipe = new HashJoin( docPipe, token, stopPipe, stop, new LeftJoin() );
    tokenPipe = new Each( tokenPipe, stop, new RegexFilter( "^$" ) );
    tokenPipe = new Retain( tokenPipe, fieldSelector );

    // one branch of the flow tallies the token counts for term frequency (TF)
    Pipe tfPipe = new Pipe( "TF", tokenPipe );
    Fields tf_count = new Fields( "tf_count" );
    tfPipe = new CountBy( tfPipe, new Fields( "doc_id", "token" ), tf_count );
    Fields tf_token = new Fields( "tf_token" );
    tfPipe = new Rename( tfPipe, token, tf_token );

    // one branch counts the number of documents (D)
    Fields doc_id = new Fields( "doc_id" );
    Fields tally = new Fields( "tally" );
    Fields rhs_join = new Fields( "rhs_join" );
    Fields n_docs = new Fields( "n_docs" );
    Pipe dPipe = new Unique( "D", tokenPipe, doc_id );
    dPipe = new Each( dPipe, new Insert( tally, 1 ), Fields.ALL );
    dPipe = new Each( dPipe, new Insert( rhs_join, 1 ), Fields.ALL );
    dPipe = new SumBy( dPipe, rhs_join, tally, n_docs, long.class );

    // one branch tallies the token counts for document frequency (DF)
    Pipe dfPipe = new Unique( "DF", tokenPipe, Fields.ALL );
    Fields df_count = new Fields( "df_count" );
    dfPipe = new CountBy( dfPipe, token, df_count );
    Fields df_token = new Fields( "df_token" );
    Fields lhs_join = new Fields( "lhs_join" );
    dfPipe = new Rename( dfPipe, token, df_token );
    dfPipe = new Each( dfPipe, new Insert( lhs_join, 1 ), Fields.ALL );

    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    Pipe idfPipe = new HashJoin( dfPipe, lhs_join, dPipe, rhs_join );

    // the IDF side of the join is smaller, so it goes on the RHS
    Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfPipe, df_token );

    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields( "tfidf" );
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction( tfidf, expression, Double.class );
    Fields tfidfArguments = new Fields( "tf_count", "df_count", "n_docs" );
    tfidfPipe = new Each( tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL );

    fieldSelector = new Fields( "tf_token", "doc_id", "tfidf" );
    tfidfPipe = new Retain( tfidfPipe, fieldSelector );
    tfidfPipe = new Rename( tfidfPipe, tf_token, token );

    // keep track of the word counts, which are useful for QA
    Pipe wcPipe = new Pipe( "wc", tfPipe );
    Fields count = new Fields( "count" );
    wcPipe = new SumBy( wcPipe, tf_token, tf_count, count, long.class );
    wcPipe = new Rename( wcPipe, tf_token, token );

    // additionally, sort by count
    wcPipe = new GroupBy( wcPipe, count, count );

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
      .setName( "tfidf" )
      .addSource( docPipe, docTap )
      .addSource( stopPipe, stopTap )
      .addTailSink( tfidfPipe, tfidfTap )
      .addTailSink( wcPipe, wcTap );

    // write a DOT file and run the flow
    Flow tfidfFlow = flowConnector.connect( flowDef );
    tfidfFlow.writeDOT( "dot/tfidf.dot" );
    tfidfFlow.complete();
    }
  }
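As a sanity check on that expression, the weights in the output/tfidf listing below can be reproduced by hand: with five documents total, the token "area" occurs in three of them, so one occurrence in a document scores 1 * ln( 5 / ( 1 + 3 ) ) ≈ 0.2231 and two occurrences score ≈ 0.4463, exactly the values reported for doc01, doc02, and doc03. Here is a minimal sketch in plain Java, outside of Cascading (TfIdfCheck is a hypothetical helper, not part of this gist; the counts are hard-coded from the sample data):

public class
  TfIdfCheck
  {
  // same formula as the ExpressionFunction in Main above
  static double tfidf( long tf_count, long n_docs, long df_count )
    {
    return (double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) );
    }

  public static void
  main( String[] args )
    {
    System.out.println( tfidf( 2, 5, 3 ) ); // 0.44628710262841953 -> "area" in doc01
    System.out.println( tfidf( 1, 5, 3 ) ); // 0.22314355131420976 -> "area" in doc02, doc03
    System.out.println( tfidf( 1, 5, 1 ) ); // 0.9162907318741551  -> tokens unique to one document
    System.out.println( tfidf( 1, 5, 4 ) ); // 0.0 -> "rain" and "shadow", found in four of five docs
    }
  }

Note how the 1.0 + df_count smoothing makes a token that appears in four of the five documents score exactly zero, which is why "rain" and "shadow" carry 0.0 weights in the results below.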
bash-3.2$ ls
LICENSE.txt README.md build build.gradle data docs src
bash-3.2$ hadoop version
Warning: $HADOOP_HOME is deprecated.
Hadoop 1.0.3
Subversion https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.0 -r 1335192
Compiled by hortonfo on Tue May 8 20:31:25 UTC 2012
From source with checksum e6b0c1e23dcf76907c5fecb4b832f3be
bash-3.2$ gradle -version
------------------------------------------------------------
Gradle 1.4
------------------------------------------------------------
Gradle build time: Monday, January 28, 2013 3:42:46 AM UTC
Groovy: 1.8.6
Ant: Apache Ant(TM) version 1.8.4 compiled on May 22 2012
Ivy: 2.2.0
JVM: 1.6.0_43 (Apple Inc. 20.14-b01-447)
OS: Mac OS X 10.7.5 x86_64
bash-3.2$ gradle clean jar
:clean
:compileJava
:processResources UP-TO-DATE
:classes
:jar
BUILD SUCCESSFUL
Total time: 4.073 secs
bash-3.2$ hadoop jar ./build/libs/impatient.jar data/rain.txt output/wc data/en.stop output/tfidf
Warning: $HADOOP_HOME is deprecated.
13/04/04 17:16:02 INFO util.HadoopUtil: resolving application jar from found main method on: impatient.Main
13/04/04 17:16:02 INFO planner.HadoopPlanner: using application jar: /Users/ceteri/src/concur/Impatient/part5/./build/libs/impatient.jar
13/04/04 17:16:02 INFO property.AppProps: using app.id: A9973837DCB25E00D2E73B5447252121
2013-04-04 17:16:02.629 java[11208:1903] Unable to load realm info from SCDynamicStore
13/04/04 17:16:02 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
13/04/04 17:16:02 WARN snappy.LoadSnappy: Snappy native library not loaded
13/04/04 17:16:02 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:03 INFO util.Version: Concurrent, Inc - Cascading 2.0.1
13/04/04 17:16:03 INFO flow.Flow: [tfidf] starting
13/04/04 17:16:03 INFO flow.Flow: [tfidf] source: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] source: Hfs["TextDelimited[['doc_id', 'text']->[ALL]]"]["data/rain.txt"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] sink: Hfs["TextDelimited[[UNKNOWN]->['doc_id', 'tfidf', 'token']]"]["output/tfidf"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] sink: Hfs["TextDelimited[[UNKNOWN]->['count', 'token']]"]["output/wc"]"]
13/04/04 17:16:03 INFO flow.Flow: [tfidf] parallel execution is enabled: false
13/04/04 17:16:03 INFO flow.Flow: [tfidf] starting jobs: 9
13/04/04 17:16:03 INFO flow.Flow: [tfidf] allocating threads: 1
13/04/04 17:16:03 INFO flow.FlowStep: [tfidf] starting step: (1/9)
13/04/04 17:16:03 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:03 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0001
13/04/04 17:16:03 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:03 INFO io.MultiInputSplit: current split input path: file:/Users/ceteri/src/concur/Impatient/part5/data/rain.txt
13/04/04 17:16:03 INFO mapred.MapTask: numReduceTasks: 0
13/04/04 17:16:03 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['doc_id', 'text']->[ALL]]"]["data/rain.txt"]"]
13/04/04 17:16:03 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"]
13/04/04 17:16:03 INFO hadoop.FlowMapper: sinking to: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:03 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:03 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:16:03 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:03 INFO mapred.LocalJobRunner:
13/04/04 17:16:03 INFO mapred.Task: Task attempt_local_0001_m_000000_0 is allowed to commit now
13/04/04 17:16:03 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_m_000000_0' to file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA
13/04/04 17:16:06 INFO mapred.LocalJobRunner: file:/Users/ceteri/src/concur/Impatient/part5/data/rain.txt:0+510
13/04/04 17:16:06 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/04/04 17:16:08 INFO flow.FlowStep: [tfidf] starting step: (2/9)
13/04/04 17:16:08 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:08 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0002
13/04/04 17:16:08 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:08 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000
13/04/04 17:16:08 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:08 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:08 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:08 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:08 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:08 INFO hadoop.FlowMapper: sinking to: GroupBy(TF)[by:[{2}:'doc_id', 'token']]
13/04/04 17:16:08 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:16:08 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:08 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:08 INFO mapred.Task: Task:attempt_local_0002_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:11 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000:0+1539
13/04/04 17:16:11 INFO mapred.Task: Task 'attempt_local_0002_m_000000_0' done.
13/04/04 17:16:11 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:11 INFO mapred.LocalJobRunner:
13/04/04 17:16:11 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:11 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 1321 bytes
13/04/04 17:16:11 INFO mapred.LocalJobRunner:
13/04/04 17:16:11 INFO hadoop.FlowReducer: sourcing from: GroupBy(TF)[by:[{2}:'doc_id', 'token']]
13/04/04 17:16:11 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['doc_id', 'tf_count', 'tf_token']]"][TF/16560/]
13/04/04 17:16:11 INFO mapred.Task: Task:attempt_local_0002_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:11 INFO mapred.LocalJobRunner:
13/04/04 17:16:11 INFO mapred.Task: Task attempt_local_0002_r_000000_0 is allowed to commit now
13/04/04 17:16:11 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0002_r_000000_0' to file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB
13/04/04 17:16:14 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:14 INFO mapred.Task: Task 'attempt_local_0002_r_000000_0' done.
13/04/04 17:16:18 INFO flow.FlowStep: [tfidf] starting step: (4/9)
13/04/04 17:16:18 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:18 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0003
13/04/04 17:16:18 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:18 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000
13/04/04 17:16:18 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:18 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:18 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:18 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:18 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:18 INFO hadoop.FlowMapper: sinking to: GroupBy(DF)[by:[{?}:ALL]]
13/04/04 17:16:18 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:18 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:18 INFO mapred.Task: Task:attempt_local_0003_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:21 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000:0+1539
13/04/04 17:16:21 INFO mapred.Task: Task 'attempt_local_0003_m_000000_0' done.
13/04/04 17:16:21 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:21 INFO mapred.LocalJobRunner:
13/04/04 17:16:21 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:21 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 1229 bytes
13/04/04 17:16:21 INFO mapred.LocalJobRunner:
13/04/04 17:16:21 INFO hadoop.FlowReducer: sourcing from: GroupBy(DF)[by:[{?}:ALL]]
13/04/04 17:16:21 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['token', 'df_count']]"][DF/8315/]
13/04/04 17:16:21 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:16:21 INFO mapred.Task: Task:attempt_local_0003_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:21 INFO mapred.LocalJobRunner:
13/04/04 17:16:21 INFO mapred.Task: Task attempt_local_0003_r_000000_0 is allowed to commit now
13/04/04 17:16:21 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0003_r_000000_0' to file:/tmp/hadoop-ceteri/DF_8315_17032D064A215335E3182ADA4ABC5EE3
13/04/04 17:16:24 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:24 INFO mapred.Task: Task 'attempt_local_0003_r_000000_0' done.
13/04/04 17:16:28 INFO flow.FlowStep: [tfidf] starting step: (6/9)
13/04/04 17:16:28 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:28 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0004
13/04/04 17:16:28 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:28 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000
13/04/04 17:16:28 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:28 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:28 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:28 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:28 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'tf_count', 'tf_token']]"][TF/16560/]
13/04/04 17:16:28 INFO hadoop.FlowMapper: sinking to: GroupBy(wc)[by:[{1}:'tf_token']]
13/04/04 17:16:28 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:16:28 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:28 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:28 INFO mapred.Task: Task:attempt_local_0004_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:31 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000:0+1573
13/04/04 17:16:31 INFO mapred.Task: Task 'attempt_local_0004_m_000000_0' done.
13/04/04 17:16:31 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:31 INFO mapred.LocalJobRunner:
13/04/04 17:16:31 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:31 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 561 bytes
13/04/04 17:16:31 INFO mapred.LocalJobRunner:
13/04/04 17:16:31 INFO hadoop.FlowReducer: sourcing from: GroupBy(wc)[by:[{1}:'tf_token']]
13/04/04 17:16:31 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['count', 'token']]"][wc/96581/]
13/04/04 17:16:31 INFO mapred.Task: Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:31 INFO mapred.LocalJobRunner:
13/04/04 17:16:31 INFO mapred.Task: Task attempt_local_0004_r_000000_0 is allowed to commit now
13/04/04 17:16:31 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0004_r_000000_0' to file:/tmp/hadoop-ceteri/wc_96581_B5483ECB7D3FFB07529931D0BAB0FE52
13/04/04 17:16:34 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:34 INFO mapred.Task: Task 'attempt_local_0004_r_000000_0' done.
13/04/04 17:16:38 INFO flow.FlowStep: [tfidf] starting step: (8/9)
13/04/04 17:16:38 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:38 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0005
13/04/04 17:16:38 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:38 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/DF_8315_17032D064A215335E3182ADA4ABC5EE3/part-00000
13/04/04 17:16:38 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:38 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:38 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:38 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:38 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['token', 'df_count']]"][DF/8315/]
13/04/04 17:16:38 INFO hadoop.FlowMapper: sinking to: GroupBy(DF)[by:[{1}:'token']]
13/04/04 17:16:38 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:38 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:38 INFO mapred.Task: Task:attempt_local_0005_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:41 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/DF_8315_17032D064A215335E3182ADA4ABC5EE3/part-00000:0+784
13/04/04 17:16:41 INFO mapred.Task: Task 'attempt_local_0005_m_000000_0' done.
13/04/04 17:16:41 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:41 INFO mapred.LocalJobRunner:
13/04/04 17:16:41 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:41 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 561 bytes
13/04/04 17:16:41 INFO mapred.LocalJobRunner:
13/04/04 17:16:41 INFO hadoop.FlowReducer: sourcing from: GroupBy(DF)[by:[{1}:'token']]
13/04/04 17:16:41 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['df_count', 'df_token', 'lhs_join']]"][DF/37139/]
13/04/04 17:16:41 INFO mapred.Task: Task:attempt_local_0005_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:41 INFO mapred.LocalJobRunner:
13/04/04 17:16:41 INFO mapred.Task: Task attempt_local_0005_r_000000_0 is allowed to commit now
13/04/04 17:16:41 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0005_r_000000_0' to file:/tmp/hadoop-ceteri/DF_37139_5D60E73D038B7A1DFA46ECF5292A68B4
13/04/04 17:16:44 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:44 INFO mapred.Task: Task 'attempt_local_0005_r_000000_0' done.
13/04/04 17:16:48 INFO flow.FlowStep: [tfidf] starting step: (9/9) output/wc
13/04/04 17:16:48 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:48 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0006
13/04/04 17:16:48 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:48 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/wc_96581_B5483ECB7D3FFB07529931D0BAB0FE52/part-00000
13/04/04 17:16:48 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:48 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:48 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:48 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:48 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['count', 'token']]"][wc/96581/]
13/04/04 17:16:48 INFO hadoop.FlowMapper: sinking to: GroupBy(wc)[by:[{1}:'count']]
13/04/04 17:16:48 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:48 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:48 INFO mapred.Task: Task:attempt_local_0006_m_000000_0 is done. And is in the process of commiting
13/04/04 17:16:51 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/wc_96581_B5483ECB7D3FFB07529931D0BAB0FE52/part-00000:0+784
13/04/04 17:16:51 INFO mapred.Task: Task 'attempt_local_0006_m_000000_0' done.
13/04/04 17:16:51 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:51 INFO mapred.LocalJobRunner:
13/04/04 17:16:51 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:16:51 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 654 bytes
13/04/04 17:16:51 INFO mapred.LocalJobRunner:
13/04/04 17:16:51 INFO hadoop.FlowReducer: sourcing from: GroupBy(wc)[by:[{1}:'count']]
13/04/04 17:16:51 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['count', 'token']]"]["output/wc"]"]
13/04/04 17:16:51 INFO mapred.Task: Task:attempt_local_0006_r_000000_0 is done. And is in the process of commiting
13/04/04 17:16:51 INFO mapred.LocalJobRunner:
13/04/04 17:16:51 INFO mapred.Task: Task attempt_local_0006_r_000000_0 is allowed to commit now
13/04/04 17:16:51 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0006_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part5/output/wc
13/04/04 17:16:54 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:16:54 INFO mapred.Task: Task 'attempt_local_0006_r_000000_0' done.
13/04/04 17:16:58 INFO flow.FlowStep: [tfidf] starting step: (3/9)
13/04/04 17:16:58 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:16:58 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0007
13/04/04 17:16:58 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:16:58 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000
13/04/04 17:16:58 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:16:58 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:16:58 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:16:58 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:16:58 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'token']]"][token_stop/83257/]
13/04/04 17:16:58 INFO hadoop.FlowMapper: sinking to: GroupBy(D)[by:[{1}:'doc_id']]
13/04/04 17:16:58 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:16:58 INFO mapred.MapTask: Finished spill 0
13/04/04 17:16:58 INFO mapred.Task: Task:attempt_local_0007_m_000000_0 is done. And is in the process of commiting
13/04/04 17:17:01 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/token_stop_83257_D1C356A79553FA44BF1B4A5F0C7E25BA/part-00000:0+1539
13/04/04 17:17:01 INFO mapred.Task: Task 'attempt_local_0007_m_000000_0' done.
13/04/04 17:17:01 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:01 INFO mapred.LocalJobRunner:
13/04/04 17:17:01 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:17:01 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 127 bytes
13/04/04 17:17:01 INFO mapred.LocalJobRunner:
13/04/04 17:17:01 INFO hadoop.FlowReducer: sourcing from: GroupBy(D)[by:[{1}:'doc_id']]
13/04/04 17:17:01 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/38403/]
13/04/04 17:17:01 INFO assembly.AggregateBy: using threshold value: 10000
13/04/04 17:17:01 INFO mapred.Task: Task:attempt_local_0007_r_000000_0 is done. And is in the process of commiting
13/04/04 17:17:01 INFO mapred.LocalJobRunner:
13/04/04 17:17:01 INFO mapred.Task: Task attempt_local_0007_r_000000_0 is allowed to commit now
13/04/04 17:17:01 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0007_r_000000_0' to file:/tmp/hadoop-ceteri/D_38403_D3972FDA75B1A76BFF4DF639470CB650
13/04/04 17:17:04 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:17:04 INFO mapred.Task: Task 'attempt_local_0007_r_000000_0' done.
13/04/04 17:17:08 INFO flow.FlowStep: [tfidf] starting step: (7/9)
13/04/04 17:17:08 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:08 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0008
13/04/04 17:17:08 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:08 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/D_38403_D3972FDA75B1A76BFF4DF639470CB650/part-00000
13/04/04 17:17:08 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:17:08 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:17:08 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:17:08 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:17:08 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/38403/]
13/04/04 17:17:08 INFO hadoop.FlowMapper: sinking to: GroupBy(D)[by:[{1}:'rhs_join']]
13/04/04 17:17:08 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:17:08 INFO mapred.MapTask: Finished spill 0
13/04/04 17:17:08 INFO mapred.Task: Task:attempt_local_0008_m_000000_0 is done. And is in the process of commiting
13/04/04 17:17:11 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/D_38403_D3972FDA75B1A76BFF4DF639470CB650/part-00000:0+84
13/04/04 17:17:11 INFO mapred.Task: Task 'attempt_local_0008_m_000000_0' done.
13/04/04 17:17:11 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:11 INFO mapred.LocalJobRunner:
13/04/04 17:17:11 INFO mapred.Merger: Merging 1 sorted segments
13/04/04 17:17:11 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 11 bytes
13/04/04 17:17:11 INFO mapred.LocalJobRunner:
13/04/04 17:17:11 INFO hadoop.FlowReducer: sourcing from: GroupBy(D)[by:[{1}:'rhs_join']]
13/04/04 17:17:11 INFO hadoop.FlowReducer: sinking to: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/43305/]
13/04/04 17:17:11 INFO mapred.Task: Task:attempt_local_0008_r_000000_0 is done. And is in the process of commiting
13/04/04 17:17:11 INFO mapred.LocalJobRunner:
13/04/04 17:17:11 INFO mapred.Task: Task attempt_local_0008_r_000000_0 is allowed to commit now
13/04/04 17:17:11 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0008_r_000000_0' to file:/tmp/hadoop-ceteri/D_43305_F519DB9808753107EC31FF765111D5CD
13/04/04 17:17:14 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:17:14 INFO mapred.Task: Task 'attempt_local_0008_r_000000_0' done.
13/04/04 17:17:18 INFO flow.FlowStep: [tfidf] starting step: (5/9) output/tfidf
13/04/04 17:17:18 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:18 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:18 INFO flow.FlowStep: [tfidf] submitted hadoop job: job_local_0009
13/04/04 17:17:18 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:18 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000
13/04/04 17:17:18 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:17:18 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:17:18 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:17:18 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:17:18 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['doc_id', 'tf_count', 'tf_token']]"][TF/16560/]
13/04/04 17:17:18 INFO hadoop.FlowMapper: sinking to: CoGroup(TF*DF*D)[by:TF:[{1}:'tf_token']DF*D:[{1}:'df_token']]
13/04/04 17:17:18 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:17:18 INFO mapred.MapTask: Finished spill 0
13/04/04 17:17:18 INFO mapred.Task: Task:attempt_local_0009_m_000000_0 is done. And is in the process of commiting
13/04/04 17:17:21 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/TF_16560_AE8298DC43B9EFCC0B8ADF5375434EBB/part-00000:0+1573
13/04/04 17:17:21 INFO mapred.Task: Task 'attempt_local_0009_m_000000_0' done.
13/04/04 17:17:21 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:21 INFO io.MultiInputSplit: current split input path: file:/tmp/hadoop-ceteri/DF_37139_5D60E73D038B7A1DFA46ECF5292A68B4/part-00000
13/04/04 17:17:21 INFO mapred.MapTask: numReduceTasks: 1
13/04/04 17:17:21 INFO mapred.MapTask: io.sort.mb = 100
13/04/04 17:17:21 INFO mapred.MapTask: data buffer = 79691776/99614720
13/04/04 17:17:21 INFO mapred.MapTask: record buffer = 262144/327680
13/04/04 17:17:21 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['df_count', 'df_token', 'lhs_join']]"][DF/37139/]
13/04/04 17:17:21 INFO hadoop.FlowMapper: sourcing from: TempHfs["SequenceFile[['rhs_join', 'n_docs']]"][D/43305/]
13/04/04 17:17:21 INFO hadoop.FlowMapper: sinking to: CoGroup(TF*DF*D)[by:TF:[{1}:'tf_token']DF*D:[{1}:'df_token']]
13/04/04 17:17:21 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO mapred.FileInputFormat: Total input paths to process : 1
13/04/04 17:17:21 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:21 INFO mapred.MapTask: Starting flush of map output
13/04/04 17:17:21 INFO mapred.MapTask: Finished spill 0
13/04/04 17:17:21 INFO mapred.Task: Task:attempt_local_0009_m_000001_0 is done. And is in the process of commiting
13/04/04 17:17:24 INFO mapred.LocalJobRunner: file:/tmp/hadoop-ceteri/DF_37139_5D60E73D038B7A1DFA46ECF5292A68B4/part-00000:0+846
13/04/04 17:17:24 INFO mapred.Task: Task 'attempt_local_0009_m_000001_0' done.
13/04/04 17:17:24 INFO mapred.Task: Using ResourceCalculatorPlugin : null
13/04/04 17:17:24 INFO mapred.LocalJobRunner:
13/04/04 17:17:24 INFO mapred.Merger: Merging 2 sorted segments
13/04/04 17:17:24 INFO mapred.Merger: Down to the last merge-pass, with 2 segments left of total size: 2176 bytes
13/04/04 17:17:24 INFO mapred.LocalJobRunner:
13/04/04 17:17:24 INFO hadoop.FlowReducer: sourcing from: CoGroup(TF*DF*D)[by:TF:[{1}:'tf_token']DF*D:[{1}:'df_token']]
13/04/04 17:17:24 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['doc_id', 'tfidf', 'token']]"]["output/tfidf"]"]
13/04/04 17:17:24 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:24 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
13/04/04 17:17:24 INFO mapred.Task: Task:attempt_local_0009_r_000000_0 is done. And is in the process of commiting
13/04/04 17:17:24 INFO mapred.LocalJobRunner:
13/04/04 17:17:24 INFO mapred.Task: Task attempt_local_0009_r_000000_0 is allowed to commit now
13/04/04 17:17:24 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0009_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part5/output/tfidf
13/04/04 17:17:27 INFO mapred.LocalJobRunner: reduce > reduce
13/04/04 17:17:27 INFO mapred.Task: Task 'attempt_local_0009_r_000000_0' done.
13/04/04 17:17:28 INFO util.Hadoop18TapUtil: deleting temp path output/wc/_temporary
13/04/04 17:17:28 INFO util.Hadoop18TapUtil: deleting temp path output/tfidf/_temporary
bash-3.2$ more output/wc/part-00000
count	token
1	women
1	australia
1	broken
1	california's
1	cause
1	cloudcover
1	death
1	deserts
1	downwind
1	dvd
1	effect
1	known
1	air
1	less
1	lies
1	mountainous
1	primary
1	produces
1	ranges
1	secrets
1	sinking
1	such
1	valley
2	land
2	leeward
2	lee
3	dry
3	mountain
4	area
4	shadow
5	rain
bash-3.2$ more output/tfidf/part-00000
doc_id	tfidf	token
doc02	0.9162907318741551	air
doc01	0.44628710262841953	area
doc03	0.22314355131420976	area
doc02	0.22314355131420976	area
doc05	0.9162907318741551	australia
doc05	0.9162907318741551	broken
doc04	0.9162907318741551	california's
doc04	0.9162907318741551	cause
doc02	0.9162907318741551	cloudcover
doc04	0.9162907318741551	death
doc04	0.9162907318741551	deserts
doc03	0.9162907318741551	downwind
doc02	0.22314355131420976	dry
doc01	0.22314355131420976	dry
doc03	0.22314355131420976	dry
doc05	0.9162907318741551	dvd
doc04	0.9162907318741551	effect
doc04	0.9162907318741551	known
doc05	0.5108256237659907	land
doc03	0.5108256237659907	land
doc01	0.5108256237659907	lee
doc02	0.5108256237659907	lee
doc04	0.5108256237659907	leeward
doc03	0.5108256237659907	leeward
doc02	0.9162907318741551	less
doc03	0.9162907318741551	lies
doc03	0.22314355131420976	mountain
doc02	0.22314355131420976	mountain
doc04	0.22314355131420976	mountain
doc01	0.9162907318741551	mountainous
doc04	0.9162907318741551	primary
doc02	0.9162907318741551	produces
doc01	0.0	rain
doc02	0.0	rain
doc04	0.0	rain
doc03	0.0	rain
doc04	0.9162907318741551	ranges
doc05	0.9162907318741551	secrets
doc04	0.0	shadow
doc02	0.0	shadow
doc01	0.0	shadow
doc03	0.0	shadow
doc02	0.9162907318741551	sinking
doc04	0.9162907318741551	such
doc04	0.9162907318741551	valley
doc05	0.9162907318741551	women
bash-3.2$
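Since both sinks are written as TSV with header rows, the "wc" QA branch can be spot-checked with a few lines of plain Java. A quick sketch, assuming the run above has already populated output/wc (WordCountCheck is a hypothetical helper, not part of this gist):

import java.io.BufferedReader;
import java.io.FileReader;

public class
  WordCountCheck
  {
  public static void
  main( String[] args ) throws Exception
    {
    BufferedReader reader = new BufferedReader( new FileReader( "output/wc/part-00000" ) );
    reader.readLine(); // skip the header row

    long total = 0;
    String line;

    // sum the "count" column across all tokens
    while( ( line = reader.readLine() ) != null )
      total += Long.parseLong( line.split( "\t" )[ 0 ] );

    reader.close();
    System.out.println( "tokens kept after scrubbing and stop-word removal: " + total );
    }
  }

For the sample listing above that sum works out to 48, i.e., the total number of token tuples that survived the stop-word filter, since the wc branch sums tf_count per token.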
bash-3.2$ pig -version
Warning: $HADOOP_HOME is deprecated.
Apache Pig version 0.10.0 (r1328203)
compiled Apr 19 2012, 22:54:12
bash-3.2$ pig -p docPath=./data/rain.txt -p wcPath=./output/wc -p stopPath=./data/en.stop -p tfidfPath=./output/tfidf ./src/scripts/tfidf.pig
Warning: $HADOOP_HOME is deprecated.
2012-08-28 10:29:44,460 [main] INFO org.apache.pig.Main - Apache Pig version 0.10.0 (r1328203) compiled Apr 19 2012, 22:54:12
2012-08-28 10:29:44,460 [main] INFO org.apache.pig.Main - Logging error messages to: /Users/ceteri/src/concur/Impatient/part5/pig_1346174984458.log
2012-08-28 10:29:44.558 java[74234:1903] Unable to load realm info from SCDynamicStore
2012-08-28 10:29:44,760 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: file:///
2012-08-28 10:29:45,539 [main] WARN org.apache.pig.PigServer - Encountered Warning USING_OVERLOADED_FUNCTION 1 time(s).
2012-08-28 10:29:45,539 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 2 time(s).
2012-08-28 10:29:45,715 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2012-08-28 10:29:45,728 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,731 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,732 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,743 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,743 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 8
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 map-reduce splittees.
2012-08-28 10:29:45,755 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 out of total 4 MR operators.
2012-08-28 10:29:45,755 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 8
2012-08-28 10:29:45,815 [main] WARN org.apache.pig.PigServer - Encountered Warning USING_OVERLOADED_FUNCTION 1 time(s).
2012-08-28 10:29:45,815 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 2 time(s).
2012-08-28 10:29:45,818 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig features used in the script: HASH_JOIN,GROUP_BY,DISTINCT,FILTER,CROSS
2012-08-28 10:29:45,858 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2012-08-28 10:29:45,862 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,863 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,864 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-08-28 10:29:45,866 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,866 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LastInputStreamingOptimizer - Rewrite: POPackage->POForEach to POJoinPackage
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 8
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Cannot merge this splittee: it has distinct combiner.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 map-reduce splittees.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - Merged 0 out of total 4 MR operators.
2012-08-28 10:29:45,868 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 8
2012-08-28 10:29:45,885 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:29:45,894 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:29:45,897 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job911310665554333174.jar
2012-08-28 10:29:50,739 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job911310665554333174.jar created
2012-08-28 10:29:50,747 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:29:50,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1054
2012-08-28 10:29:50,754 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:29:50,798 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2012-08-28 10:29:50,807 [Thread-6] WARN org.apache.hadoop.util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2012-08-28 10:29:50,902 [Thread-6] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:29:50,902 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:29:50,907 [Thread-6] WARN org.apache.hadoop.io.compress.snappy.LoadSnappy - Snappy native library not loaded
2012-08-28 10:29:50,909 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:29:50,915 [Thread-6] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:29:50,915 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:29:50,915 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:29:51,088 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:29:51,100 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/Users/ceteri/src/concur/Impatient/part5/data/en.stop:0+544
2012-08-28 10:29:51,105 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:29:51,211 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:29:51,213 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:29:51,252 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_en.stop
2012-08-28 10:29:51,264 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:29:51,273 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:29:51,274 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:29:51,299 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0001
2012-08-28 10:29:51,300 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 0% complete
2012-08-28 10:29:54,073 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:54,074 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_m_000000_0' done.
2012-08-28 10:29:54,077 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:29:54,080 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/Users/ceteri/src/concur/Impatient/part5/data/rain.txt:0+510
2012-08-28 10:29:54,080 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:29:54,142 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:29:54,142 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:29:54,156 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_rain.txt
2012-08-28 10:29:54,163 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:29:54,165 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:29:54,170 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_m_000001_0 is done. And is in the process of commiting
2012-08-28 10:29:57,076 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,076 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_m_000001_0' done.
2012-08-28 10:29:57,087 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:29:57,087 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,090 [Thread-7] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments
2012-08-28 10:29:57,097 [Thread-7] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 3284 bytes
2012-08-28 10:29:57,097 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,129 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:29:57,129 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:29:57,130 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0001_r_000000_0 is allowed to commit now
2012-08-28 10:29:57,132 [Thread-7] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0001_r_000000_0' to file:/tmp/temp-2375117/tmp-1797990455
2012-08-28 10:30:00,088 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-08-28 10:30:00,089 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_r_000000_0' done.
2012-08-28 10:30:00,091 [Thread-7] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-08-28 10:30:01,316 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 12% complete
2012-08-28 10:30:01,318 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0001
2012-08-28 10:30:01,320 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:01,320 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:01,320 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job6866114866035432926.jar
2012-08-28 10:30:05,039 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job6866114866035432926.jar created
2012-08-28 10:30:05,043 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:05,043 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting identity combiner class.
2012-08-28 10:30:05,044 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-08-28 10:30:05,044 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:05,050 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:05,051 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:05,051 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job750153399916515834.jar
2012-08-28 10:30:08,741 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job750153399916515834.jar created
2012-08-28 10:30:08,744 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:08,748 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-08-28 10:30:08,748 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:08,756 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-08-28 10:30:08,757 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-08-28 10:30:08,757 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job461043214572580021.jar
2012-08-28 10:30:12,371 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job461043214572580021.jar created
2012-08-28 10:30:12,375 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-08-28 10:30:12,376 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting identity combiner class.
2012-08-28 10:30:12,377 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-08-28 10:30:12,377 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-08-28 10:30:12,383 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 3 map-reduce job(s) waiting for submission.
2012-08-28 10:30:12,431 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:12,431 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:12,431 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:12,508 [Thread-12] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:12,513 [Thread-12] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1797990455/part-r-00000:0+1037
2012-08-28 10:30:12,515 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:12,526 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:12,526 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:12,543 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:12,546 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:12,548 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0002_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:12,558 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:12,558 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:12,558 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:12,605 [Thread-15] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:12,607 [Thread-15] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1797990455/part-r-00000:0+1037
2012-08-28 10:30:12,607 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:12,672 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:12,673 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:12,693 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:12,705 [Thread-15] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:12,708 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0003_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:12,720 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-08-28 10:30:12,720 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-08-28 10:30:12,720 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-08-28 10:30:12,767 [Thread-18] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:12,769 [Thread-18] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1797990455/part-r-00000:0+1037
2012-08-28 10:30:12,769 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-08-28 10:30:12,885 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0002
2012-08-28 10:30:12,885 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0003
2012-08-28 10:30:12,886 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0004
2012-08-28 10:30:12,890 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-08-28 10:30:12,890 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-08-28 10:30:12,910 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-08-28 10:30:12,914 [Thread-18] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-08-28 10:30:12,917 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0004_m_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,504 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,504 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0002_m_000000_0' done.
2012-08-28 10:30:15,512 [Thread-12] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:15,512 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,513 [Thread-12] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:15,513 [Thread-12] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 87 bytes
2012-08-28 10:30:15,513 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,520 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0002_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,522 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,522 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0002_r_000000_0 is allowed to commit now
2012-08-28 10:30:15,524 [Thread-12] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0002_r_000000_0' to file:/tmp/temp-2375117/tmp-1840385936
2012-08-28 10:30:15,604 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,605 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0003_m_000000_0' done.
2012-08-28 10:30:15,614 [Thread-15] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-08-28 10:30:15,614 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,615 [Thread-15] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-08-28 10:30:15,616 [Thread-15] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 1689 bytes
2012-08-28 10:30:15,616 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,637 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0003_r_000000_0 is done. And is in the process of commiting
2012-08-28 10:30:15,639 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,640 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0003_r_000000_0 is allowed to commit now
2012-08-28 10:30:15,643 [Thread-15] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0003_r_000000_0' to file:/tmp/temp-2375117/tmp-633078531
2012-08-28 10:30:15,765 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-08-28 10:30:15,765 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0004_m_000000_0' done. | |
2012-08-28 10:30:15,773 [Thread-18] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:15,773 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:15,773 [Thread-18] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments | |
2012-08-28 10:30:15,774 [Thread-18] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 1229 bytes | |
2012-08-28 10:30:15,774 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:15,787 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:15,789 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:15,789 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0004_r_000000_0 is allowed to commit now | |
2012-08-28 10:30:15,792 [Thread-18] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0004_r_000000_0' to file:/tmp/temp-2375117/tmp-185348010 | |
2012-08-28 10:30:18,515 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:30:18,516 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0002_r_000000_0' done. | |
2012-08-28 10:30:18,517 [Thread-12] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:30:18,608 [Thread-15] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:30:18,609 [Thread-15] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0003_r_000000_0' done. | |
2012-08-28 10:30:18,610 [Thread-15] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:30:18,767 [Thread-18] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:30:18,768 [Thread-18] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0004_r_000000_0' done. | |
2012-08-28 10:30:18,769 [Thread-18] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:30:22,898 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 50% complete | |
2012-08-28 10:30:22,899 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0002 | |
2012-08-28 10:30:22,900 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0003 | |
2012-08-28 10:30:22,901 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0004 | |
2012-08-28 10:30:22,901 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job | |
2012-08-28 10:30:22,902 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3 | |
2012-08-28 10:30:22,902 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job497102663141037932.jar | |
2012-08-28 10:30:26,702 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job497102663141037932.jar created | |
2012-08-28 10:30:26,703 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job | |
2012-08-28 10:30:26,710 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job | |
2012-08-28 10:30:26,710 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3 | |
2012-08-28 10:30:26,710 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job416157849282719047.jar | |
2012-08-28 10:30:30,383 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job416157849282719047.jar created | |
2012-08-28 10:30:30,384 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job | |
2012-08-28 10:30:30,387 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=997 | |
2012-08-28 10:30:30,387 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1 | |
2012-08-28 10:30:30,392 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 2 map-reduce job(s) waiting for submission. | |
2012-08-28 10:30:30,442 [Thread-21] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1 | |
2012-08-28 10:30:30,442 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1 | |
2012-08-28 10:30:30,442 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1 | |
2012-08-28 10:30:30,491 [Thread-22] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:30,495 [Thread-22] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-185348010/part-r-00000:0+997 | |
2012-08-28 10:30:30,495 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100 | |
2012-08-28 10:30:30,541 [Thread-21] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1 | |
2012-08-28 10:30:30,541 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1 | |
2012-08-28 10:30:30,541 [Thread-21] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1 | |
2012-08-28 10:30:30,550 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720 | |
2012-08-28 10:30:30,550 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680 | |
2012-08-28 10:30:30,579 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output | |
2012-08-28 10:30:30,590 [Thread-22] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0 | |
2012-08-28 10:30:30,593 [Thread-25] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:30,593 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0005_m_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:30,596 [Thread-25] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1840385936/part-r-00000:0+60 | |
2012-08-28 10:30:30,596 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100 | |
2012-08-28 10:30:30,685 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720 | |
2012-08-28 10:30:30,685 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680 | |
2012-08-28 10:30:30,698 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output | |
2012-08-28 10:30:30,705 [Thread-25] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0 | |
2012-08-28 10:30:30,706 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0006_m_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:30,893 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0005 | |
2012-08-28 10:30:30,893 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0006 | |
2012-08-28 10:30:33,487 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,487 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0005_m_000000_0' done. | |
2012-08-28 10:30:33,493 [Thread-22] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:33,493 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,493 [Thread-22] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments | |
2012-08-28 10:30:33,494 [Thread-22] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 809 bytes | |
2012-08-28 10:30:33,494 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,508 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0005_r_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:33,509 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,509 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0005_r_000000_0 is allowed to commit now | |
2012-08-28 10:30:33,511 [Thread-22] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0005_r_000000_0' to file:/tmp/temp-2375117/tmp155933281 | |
2012-08-28 10:30:33,590 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,591 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0006_m_000000_0' done. | |
2012-08-28 10:30:33,597 [Thread-25] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:33,597 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,598 [Thread-25] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments | |
2012-08-28 10:30:33,598 [Thread-25] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 25 bytes | |
2012-08-28 10:30:33,598 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,608 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0006_r_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:33,609 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:33,610 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0006_r_000000_0 is allowed to commit now | |
2012-08-28 10:30:33,611 [Thread-25] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0006_r_000000_0' to file:/tmp/temp-2375117/tmp-1694028400 | |
2012-08-28 10:30:36,490 [Thread-22] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:30:36,490 [Thread-22] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0005_r_000000_0' done. | |
2012-08-28 10:30:36,491 [Thread-22] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:30:36,593 [Thread-25] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:30:36,593 [Thread-25] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0006_r_000000_0' done. | |
2012-08-28 10:30:36,594 [Thread-25] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:30:40,914 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 75% complete | |
2012-08-28 10:30:40,915 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0005 | |
2012-08-28 10:30:40,916 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0006 | |
2012-08-28 10:30:40,917 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job | |
2012-08-28 10:30:40,917 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3 | |
2012-08-28 10:30:40,917 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job6401439778182906434.jar | |
2012-08-28 10:30:44,617 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job6401439778182906434.jar created | |
2012-08-28 10:30:44,619 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job | |
2012-08-28 10:30:44,621 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=792 | |
2012-08-28 10:30:44,621 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1 | |
2012-08-28 10:30:44,624 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission. | |
2012-08-28 10:30:44,672 [Thread-28] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1 | |
2012-08-28 10:30:44,672 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1 | |
2012-08-28 10:30:44,672 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1 | |
2012-08-28 10:30:44,675 [Thread-28] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1 | |
2012-08-28 10:30:44,675 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1 | |
2012-08-28 10:30:44,675 [Thread-28] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1 | |
2012-08-28 10:30:44,710 [Thread-29] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:44,713 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp155933281/part-r-00000:0+776 | |
2012-08-28 10:30:44,716 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100 | |
2012-08-28 10:30:44,731 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720 | |
2012-08-28 10:30:44,731 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680 | |
2012-08-28 10:30:44,738 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_tmp155933281 | |
2012-08-28 10:30:44,741 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output | |
2012-08-28 10:30:44,743 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0 | |
2012-08-28 10:30:44,745 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0007_m_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:45,125 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0007 | |
2012-08-28 10:30:47,706 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:47,707 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0007_m_000000_0' done. | |
2012-08-28 10:30:47,711 [Thread-29] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:47,714 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-1694028400/part-r-00000:0+16 | |
2012-08-28 10:30:47,714 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100 | |
2012-08-28 10:30:47,724 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720 | |
2012-08-28 10:30:47,724 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680 | |
2012-08-28 10:30:47,730 [Thread-29] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_tmp-1694028400 | |
2012-08-28 10:30:47,731 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output | |
2012-08-28 10:30:47,732 [Thread-29] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0 | |
2012-08-28 10:30:47,734 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0007_m_000001_0 is done. And is in the process of commiting | |
2012-08-28 10:30:50,709 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:50,709 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0007_m_000001_0' done. | |
2012-08-28 10:30:50,719 [Thread-29] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:50,719 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:50,720 [Thread-29] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments | |
2012-08-28 10:30:50,720 [Thread-29] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 956 bytes | |
2012-08-28 10:30:50,720 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:50,734 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0007_r_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:50,736 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:30:50,736 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0007_r_000000_0 is allowed to commit now | |
2012-08-28 10:30:50,738 [Thread-29] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0007_r_000000_0' to file:/tmp/temp-2375117/tmp277297603 | |
2012-08-28 10:30:53,715 [Thread-29] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:30:53,716 [Thread-29] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0007_r_000000_0' done. | |
2012-08-28 10:30:53,717 [Thread-29] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:30:55,142 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 87% complete | |
2012-08-28 10:30:55,143 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0007 | |
2012-08-28 10:30:55,143 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job | |
2012-08-28 10:30:55,143 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3 | |
2012-08-28 10:30:55,144 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job1907337562856891341.jar | |
2012-08-28 10:30:58,780 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job1907337562856891341.jar created | |
2012-08-28 10:30:58,782 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job | |
2012-08-28 10:30:58,783 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=2358 | |
2012-08-28 10:30:58,783 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1 | |
2012-08-28 10:30:58,790 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission. | |
2012-08-28 10:30:58,827 [Thread-33] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1 | |
2012-08-28 10:30:58,827 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1 | |
2012-08-28 10:30:58,828 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1 | |
2012-08-28 10:30:58,829 [Thread-33] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1 | |
2012-08-28 10:30:58,829 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1 | |
2012-08-28 10:30:58,829 [Thread-33] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1 | |
2012-08-28 10:30:58,858 [Thread-34] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:30:58,860 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp-633078531/part-r-00000:0+1365 | |
2012-08-28 10:30:58,860 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100 | |
2012-08-28 10:30:58,873 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720 | |
2012-08-28 10:30:58,873 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680 | |
2012-08-28 10:30:58,878 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_tmp-633078531 | |
2012-08-28 10:30:58,881 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output | |
2012-08-28 10:30:58,883 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0 | |
2012-08-28 10:30:58,884 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0008_m_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:30:59,292 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0008 | |
2012-08-28 10:31:01,855 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:31:01,856 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0008_m_000000_0' done. | |
2012-08-28 10:31:01,860 [Thread-34] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:31:01,862 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp-2375117/tmp277297603/part-r-00000:0+993 | |
2012-08-28 10:31:01,863 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100 | |
2012-08-28 10:31:01,874 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720 | |
2012-08-28 10:31:01,874 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680 | |
2012-08-28 10:31:01,879 [Thread-34] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_tmp277297603 | |
2012-08-28 10:31:01,881 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output | |
2012-08-28 10:31:01,883 [Thread-34] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0 | |
2012-08-28 10:31:01,884 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0008_m_000001_0 is done. And is in the process of commiting | |
2012-08-28 10:31:04,857 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:31:04,858 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0008_m_000001_0' done. | |
2012-08-28 10:31:04,866 [Thread-34] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null | |
2012-08-28 10:31:04,866 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:31:04,867 [Thread-34] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments | |
2012-08-28 10:31:04,867 [Thread-34] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 2439 bytes | |
2012-08-28 10:31:04,867 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:31:04,888 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0008_r_000000_0 is done. And is in the process of commiting | |
2012-08-28 10:31:04,889 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - | |
2012-08-28 10:31:04,890 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0008_r_000000_0 is allowed to commit now | |
2012-08-28 10:31:04,892 [Thread-34] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0008_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part5/output/tfidf | |
2012-08-28 10:31:07,862 [Thread-34] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce | |
2012-08-28 10:31:07,863 [Thread-34] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0008_r_000000_0' done. | |
2012-08-28 10:31:07,865 [Thread-34] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup | |
2012-08-28 10:31:09,312 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0008 | |
2012-08-28 10:31:09,314 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 100% complete | |
2012-08-28 10:31:09,317 [main] INFO org.apache.pig.tools.pigstats.SimplePigStats - Script Statistics: | |
HadoopVersion PigVersion UserId StartedAt FinishedAt Features | |
1.0.3 0.10.0 ceteri 2012-08-28 10:29:45 2012-08-28 10:31:09 HASH_JOIN,GROUP_BY,DISTINCT,FILTER,CROSS | |
Success! | |
Job Stats (time in seconds): | |
JobId Maps Reduces MaxMapTime MinMapTIme AvgMapTime MaxReduceTime MinReduceTime AvgReduceTime Alias Feature Outputs | |
job_local_0001 1 1 n/a n/a n/a n/a n/a n/a docPipe,stopPipe,tokenPipe HASH_JOIN | |
job_local_0002 1 1 n/a n/a n/a n/a n/a n/a dPipe DISTINCT | |
job_local_0003 1 1 n/a n/a n/a n/a n/a n/a tfGroups,tfPipe GROUP_BY,COMBINER | |
job_local_0004 1 1 n/a n/a n/a n/a n/a n/a DISTINCT | |
job_local_0005 1 1 n/a n/a n/a n/a n/a n/a dfGroups,dfPipe,idfPipe GROUP_BY,COMBINER | |
job_local_0006 1 1 n/a n/a n/a n/a n/a n/a dGroups,dPipe,idfPipe GROUP_BY,COMBINER | |
job_local_0007 1 1 n/a n/a n/a n/a n/a n/a idfPipe | |
job_local_0008 1 1 n/a n/a n/a n/a n/a n/a tfidfPipe HASH_JOIN file:///Users/ceteri/src/concur/Impatient/part5/output/tfidf, | |
Input(s): | |
Successfully read 0 records from: "file:///Users/ceteri/src/concur/Impatient/part5/data/en.stop" | |
Successfully read 0 records from: "file:///Users/ceteri/src/concur/Impatient/part5/data/rain.txt" | |
Output(s): | |
Successfully stored 0 records in: "file:///Users/ceteri/src/concur/Impatient/part5/output/tfidf" | |
Counters: | |
Total records written : 0 | |
Total bytes written : 0 | |
Spillable Memory Manager spill count : 0 | |
Total bags proactively spilled: 0 | |
Total records proactively spilled: 0 | |
Job DAG: | |
job_local_0001 -> job_local_0003,job_local_0002,job_local_0004, | |
job_local_0003 -> job_local_0008, | |
job_local_0002 -> job_local_0006, | |
job_local_0006 -> job_local_0007, | |
job_local_0004 -> job_local_0005, | |
job_local_0005 -> job_local_0007, | |
job_local_0007 -> job_local_0008, | |
job_local_0008 | |
2012-08-28 10:31:09,317 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success! | |
bash-3.2$ cat output/tfidf/part-r-00000 | |
doc02 0.9162907318741551 air | |
doc02 0.22314355131420976 dry | |
doc01 0.22314355131420976 dry | |
doc03 0.22314355131420976 dry | |
doc05 0.9162907318741551 dvd | |
doc02 0.5108256237659907 lee | |
doc01 0.5108256237659907 lee | |
doc03 0.22314355131420976 area | |
doc01 0.44628710262841953 area | |
doc02 0.22314355131420976 area | |
doc05 0.5108256237659907 land | |
doc03 0.5108256237659907 land | |
doc02 0.9162907318741551 less | |
doc03 0.9162907318741551 lies | |
doc02 0.0 rain | |
doc04 0.0 rain | |
doc01 0.0 rain | |
doc03 0.0 rain | |
doc04 0.9162907318741551 such | |
doc04 0.9162907318741551 cause | |
doc04 0.9162907318741551 death | |
doc04 0.9162907318741551 known | |
doc05 0.9162907318741551 women | |
doc05 0.9162907318741551 broken | |
doc04 0.9162907318741551 effect | |
doc04 0.9162907318741551 ranges | |
doc04 0.0 shadow | |
doc03 0.0 shadow | |
doc02 0.0 shadow | |
doc01 0.0 shadow | |
doc04 0.9162907318741551 valley | |
doc04 0.9162907318741551 deserts | |
doc03 0.5108256237659907 leeward | |
doc04 0.5108256237659907 leeward | |
doc04 0.9162907318741551 primary | |
doc05 0.9162907318741551 secrets | |
doc02 0.9162907318741551 sinking | |
doc03 0.9162907318741551 downwind | |
doc04 0.22314355131420976 mountain | |
doc02 0.22314355131420976 mountain | |
doc03 0.22314355131420976 mountain | |
doc02 0.9162907318741551 produces | |
doc05 0.9162907318741551 australia | |
doc02 0.9162907318741551 cloudcover | |
doc01 0.9162907318741551 mountainous | |
doc04 0.9162907318741551 california's | |
bash-3.2$ |
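The distinct weights in that listing can be reproduced directly from the formula the script below uses, tfidf = tf_count * LOG( n_docs / ( 1.0 + df_count ) ), with n_docs = 5 for rain.txt (doc01..doc05). A minimal sanity-check sketch, not part of the gist itself — the class name and output formatting here are made up for illustration:

public class
 TfIdfCheck
 {
 public static void
 main( String[] args )
   {
   // n_docs = 5, from the five doc ids in the output listing above
   double nDocs = 5.0;

   // df_count ranges from 1 ("air", "dvd", ...) up to 4 ("rain", "shadow")
   for( int dfCount = 1; dfCount <= 4; dfCount++ )
     System.out.println( "df_count=" + dfCount + " -> " + Math.log( nDocs / ( 1.0 + dfCount ) ) );
   }
 }

That prints 0.9162907318741551, 0.5108256237659907, 0.22314355131420976, and 0.0 for df_count 1..4, matching the rows above; the one remaining value, 0.44628710262841953 for "area" in doc01, is just tf_count = 2 times the df_count = 3 weight. The Pig version of the same app follows.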
docPipe = LOAD '$docPath' USING PigStorage('\t', 'tagsource') AS (doc_id, text); | |
docPipe = FILTER docPipe BY doc_id != 'doc_id'; | |
stopPipe = LOAD '$stopPath' USING PigStorage('\t', 'tagsource') AS (stop:chararray); | |
stopPipe = FILTER stopPipe BY stop != 'stop'; | |
-- specify a regex operation to split the "document" text lines into a token stream | |
tokenPipe = FOREACH docPipe GENERATE doc_id, FLATTEN(TOKENIZE(LOWER(text), ' [](),.')) AS token; | |
tokenPipe = FILTER tokenPipe BY token MATCHES '\\w.*'; | |
-- perform a left join to remove stop words, discarding the rows | |
-- which joined with stop words, i.e., were non-null after left join | |
tokenPipe = JOIN tokenPipe BY token LEFT, stopPipe BY stop; | |
tokenPipe = FILTER tokenPipe BY stopPipe::stop is NULL; | |
-- DUMP tokenPipe; | |
-- one branch of the flow tallies the token counts for term frequency (TF) | |
tfGroups = GROUP tokenPipe BY (doc_id, token); | |
tfPipe = FOREACH tfGroups GENERATE FLATTEN(group) AS (doc_id, tf_token), COUNT(tokenPipe) AS tf_count; | |
-- DUMP tfPipe; | |
-- one branch counts the number of documents (D) | |
dPipe = FOREACH tokenPipe GENERATE doc_id; | |
dPipe = DISTINCT dPipe; | |
dGroups = GROUP dPipe ALL; | |
dPipe = FOREACH dGroups GENERATE COUNT(dPipe) AS n_docs; | |
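-- with rain.txt, dPipe collapses here to a single tuple: n_docs = 5 (doc01..doc05)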
-- DUMP dPipe; | |
-- one branch tallies the token counts for document frequency (DF) | |
dfPipe = DISTINCT tokenPipe; | |
dfGroups = GROUP dfPipe BY token; | |
dfPipe = FOREACH dfGroups GENERATE group AS df_token, COUNT(dfPipe) AS df_count; | |
-- DUMP dfPipe; | |
-- join to bring together all the components for calculating TF-IDF | |
idfPipe = CROSS dfPipe, dPipe; | |
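-- dPipe holds only that single n_docs tuple, so the CROSS just appends n_docs to every df row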
tfidfPipe = JOIN tfPipe BY tf_token, idfPipe BY df_token; | |
tfidfPipe = FOREACH tfidfPipe GENERATE doc_id, (double) tf_count * LOG( (double) n_docs / ( 1.0 + (double) df_count ) ) AS tfidf, tf_token AS token; | |
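-- e.g., tf_count = 1, df_count = 1 gives LOG( 5.0 / 2.0 ) = 0.9162907318741551, the "air" score in the output above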
-- output | |
STORE tfidfPipe INTO '$tfidfPath' USING PigStorage('\t', 'tagsource'); | |
EXPLAIN -out dot/tfidf_pig.dot -dot tfidfPipe; | |
-- determine the word counts | |
-- THIS PART DIES IN APACHE PIG W/O HELPFUL EXCEPTION MESSAGES | |
--tokenGroups = GROUP tokenPipe BY token; | |
--wcPipe = FOREACH tokenGroups GENERATE COUNT(tokenPipe) AS count, group AS token; | |
--wcPipe = ORDER wcPipe BY count DESC; | |
--STORE wcPipe INTO '$wcPath' using PigStorage('\t', 'tagsource'); |