Last active
December 10, 2015 02:58
-
-
Save ceteri/4371896 to your computer and use it in GitHub Desktop.
Cascading for the Impatient, Part 8 -- Scalding examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.twitter.scalding._

/**
 * Scalding word-count job ("Cascading for the Impatient", Part 8).
 *
 * Reads a TSV of documents with fields ('doc_id, 'text), tokenizes the
 * text, scrubs each token (trim + lowercase), drops empty tokens, then
 * counts occurrences of each token and writes ('token, 'count) as TSV.
 *
 * Expected args:
 *   --doc  input TSV path (header row: doc_id, text)
 *   --wc   output path for word counts (a header row is written)
 */
class Example3(args : Args) extends Job(args) {

  Tsv(args("doc"), ('doc_id, 'text), skipHeader = true)
    .read
    // split on spaces and common punctuation; the split may yield empty strings
    .flatMap('text -> 'token) { text : String => text.split("[ \\[\\]\\(\\),.]") }
    // mapTo discards all other fields, keeping only the scrubbed token
    .mapTo('token -> 'token) { token : String => scrub(token) }
    // drop the empty tokens produced by consecutive delimiters
    .filter('token) { token : String => token.length > 0 }
    .groupBy('token) { _.size('count) }
    .write(Tsv(args("wc"), writeHeader = true))

  /** Normalizes a raw token: trims surrounding whitespace and lowercases it. */
  def scrub(token : String) : String = {
    token
      .trim
      .toLowerCase
  }

  // kudos to Chris Severs for this workaround, when running "fat jars" -
  // avoids the "ClassNotFoundException cascading.*" exception on a Hadoop cluster
  override def config(implicit mode: Mode): Map[AnyRef, AnyRef] = {
    super.config(mode) ++ Map("cascading.app.appjar.class" -> classOf[Example3])
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.twitter.scalding._

/**
 * Scalding word-count job with stop-word filtering
 * ("Cascading for the Impatient", Part 8).
 *
 * Same pipeline as Example3, but additionally left-joins each token
 * against a stop-word list and keeps only tokens with no match
 * (i.e. the joined 'stop field is null).
 *
 * Expected args:
 *   --doc   input TSV path (header row: doc_id, text)
 *   --stop  stop-word list TSV (header row: stop)
 *   --wc    output path for word counts (a header row is written)
 */
class Example4(args : Args) extends Job(args) {

  // stop-word list; small enough to be replicated to every mapper
  val stopPipe = Tsv(args("stop"), ('stop), skipHeader = true)
    .read

  Tsv(args("doc"), ('doc_id, 'text), skipHeader = true)
    .read
    // split on spaces and common punctuation; the split may yield empty strings
    .flatMap('text -> 'token) { text : String => text.split("[ \\[\\]\\(\\),.]") }
    // mapTo discards all other fields, keeping only the scrubbed token
    .mapTo('token -> 'token) { token : String => scrub(token) }
    .filter('token) { token : String => token.length > 0 }
    // map-side join against the (tiny) stop-word pipe; non-matches get null
    .leftJoinWithTiny('token -> 'stop, stopPipe)
    // keep only tokens that did NOT match a stop word
    .filter('stop) { stop : String => stop == null }
    .groupBy('token) { _.size('count) }
    .write(Tsv(args("wc"), writeHeader = true))

  /** Normalizes a raw token: trims surrounding whitespace and lowercases it. */
  def scrub(token : String) : String = {
    token
      .trim
      .toLowerCase
  }

  // kudos to Chris Severs for this workaround, when running "fat jars" -
  // avoids the "ClassNotFoundException cascading.*" exception on a Hadoop cluster
  override def config(implicit mode: Mode): Map[AnyRef, AnyRef] = {
    super.config(mode) ++ Map("cascading.app.appjar.class" -> classOf[Example4])
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bash-3.2$ rm -rf output | |
bash-3.2$ scald.rb --hdfs-local src/main/scala/Example3.scala --doc data/rain.txt --wc output/wc | |
12/12/24 22:25:36 INFO util.HadoopUtil: resolving application jar from found main method on: com.twitter.scalding.Tool$ | |
12/12/24 22:25:36 INFO planner.HadoopPlanner: using application jar: /Users/ceteri/opt/scalding/target/scalding-assembly-0.8.2-SNAPSHOT.jar | |
12/12/24 22:25:36 INFO property.AppProps: using app.id: C981F81D95C117AC8481F87752188431 | |
12/12/24 22:25:36 INFO flow.Flow: [Example3] starting | |
12/12/24 22:25:36 INFO flow.Flow: [Example3] source: Hfs["TextDelimited[['doc_id', 'text']]"]["data/rain.txt"]"] | |
12/12/24 22:25:36 INFO flow.Flow: [Example3] sink: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"] | |
12/12/24 22:25:36 INFO flow.Flow: [Example3] parallel execution is enabled: false | |
12/12/24 22:25:36 INFO flow.Flow: [Example3] starting jobs: 1 | |
12/12/24 22:25:36 INFO flow.Flow: [Example3] allocating threads: 1 | |
12/12/24 22:25:36 INFO flow.FlowStep: [Example3] starting step: (1/1) output/wc | |
12/12/24 22:25:36 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= | |
12/12/24 22:25:36 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
12/12/24 22:25:36 INFO flow.FlowStep: [Example3] submitted hadoop job: job_local_0001 | |
12/12/24 22:25:36 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
12/12/24 22:25:36 INFO io.MultiInputSplit: current split input path: file:/Users/ceteri/src/concur/Impatient/part8/data/rain.txt | |
12/12/24 22:25:36 INFO mapred.MapTask: numReduceTasks: 1 | |
12/12/24 22:25:36 INFO mapred.MapTask: io.sort.mb = 100 | |
12/12/24 22:25:37 INFO mapred.MapTask: data buffer = 79691776/99614720 | |
12/12/24 22:25:37 INFO mapred.MapTask: record buffer = 262144/327680 | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['doc_id', 'text']]"]["data/rain.txt"]"] | |
12/12/24 22:25:37 INFO hadoop.FlowMapper: sinking to: GroupBy(Tsv(data/rain.txt,'doc_id', 'text',true,false))[by:[{1}:'token']] | |
12/12/24 22:25:37 INFO assembly.AggregateBy: using threshold value: 100000 | |
12/12/24 22:25:37 INFO mapred.MapTask: Starting flush of map output | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO mapred.MapTask: Finished spill 0 | |
12/12/24 22:25:37 INFO mapred.TaskRunner: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting | |
12/12/24 22:25:37 INFO mapred.LocalJobRunner: file:/Users/ceteri/src/concur/Impatient/part8/data/rain.txt:0+510 | |
12/12/24 22:25:37 INFO mapred.TaskRunner: Task 'attempt_local_0001_m_000000_0' done. | |
12/12/24 22:25:37 INFO mapred.LocalJobRunner: | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO mapred.Merger: Merging 1 sorted segments | |
12/12/24 22:25:37 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 797 bytes | |
12/12/24 22:25:37 INFO mapred.LocalJobRunner: | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO hadoop.FlowReducer: sourcing from: GroupBy(Tsv(data/rain.txt,'doc_id', 'text',true,false))[by:[{1}:'token']] | |
12/12/24 22:25:37 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"] | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/24 22:25:37 INFO mapred.TaskRunner: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting | |
12/12/24 22:25:37 INFO mapred.LocalJobRunner: | |
12/12/24 22:25:37 INFO mapred.TaskRunner: Task attempt_local_0001_r_000000_0 is allowed to commit now | |
12/12/24 22:25:37 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part8/output/wc | |
12/12/24 22:25:37 INFO mapred.LocalJobRunner: reduce > reduce | |
12/12/24 22:25:37 INFO mapred.TaskRunner: Task 'attempt_local_0001_r_000000_0' done. | |
12/12/24 22:25:41 INFO util.Hadoop18TapUtil: deleting temp path output/wc/_temporary | |
bash-3.2$ head output/wc/part-00000 | |
token count | |
a 8 | |
air 1 | |
an 1 | |
and 2 | |
area 4 | |
as 2 | |
australia 1 | |
back 1 | |
broken 1 | |
bash-3.2$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bash-3.2$ rm -rf output | |
bash-3.2$ scald.rb --hdfs-local src/main/scala/Example4.scala --doc data/rain.txt --stop data/en.stop --wc output/wc | |
compiling src/scala/Example4.scala | |
scalac -classpath /var/folders/bl/zrtbg3cd57lfsgzzllhnxxjc0000gn/T/maven/hadoop-core-0.20.2.jar:/var/folders/bl/zrtbg3cd57lfsgzzllhnxxjc0000gn/T/maven/log4j-1.2.15.jar:/var/folders/bl/zrtbg3cd57lfsgzzllhnxxjc0000gn/T/maven/commons-httpclient-3.1.jar:/var/folders/bl/zrtbg3cd57lfsgzzllhnxxjc0000gn/T/maven/commons-cli-1.2.jar:/var/folders/bl/zrtbg3cd57lfsgzzllhnxxjc0000gn/T/maven/zookeeper-3.3.4.jar:/Users/ceteri/.sbt/boot/scala-2.9.2/lib/scala-library.jar:/Users/ceteri/opt/scalding/target/scalding-assembly-0.8.2-SNAPSHOT.jar -d /var/folders/bl/zrtbg3cd57lfsgzzllhnxxjc0000gn/T/script-build src/scala/Example4.scala | |
12/12/25 22:38:35 INFO util.HadoopUtil: resolving application jar from found main method on: com.twitter.scalding.Tool$ | |
12/12/25 22:38:35 INFO planner.HadoopPlanner: using application jar: /Users/ceteri/opt/scalding/target/scalding-assembly-0.8.2-SNAPSHOT.jar | |
12/12/25 22:38:35 INFO property.AppProps: using app.id: 3B93725AC368DAB472DE063E7A6789A7 | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] starting | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] source: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"] | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] source: Hfs["TextDelimited[['doc_id', 'text']]"]["data/rain.txt"]"] | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] sink: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"] | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] parallel execution is enabled: false | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] starting jobs: 1 | |
12/12/25 22:38:35 INFO flow.Flow: [Example4] allocating threads: 1 | |
12/12/25 22:38:35 INFO flow.FlowStep: [Example4] starting step: (1/1) output/wc | |
12/12/25 22:38:35 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= | |
12/12/25 22:38:36 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
12/12/25 22:38:36 INFO flow.FlowStep: [Example4] submitted hadoop job: job_local_0001 | |
12/12/25 22:38:36 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
12/12/25 22:38:36 INFO io.MultiInputSplit: current split input path: file:/Users/ceteri/src/concur/Impatient/part8/data/rain.txt | |
12/12/25 22:38:36 INFO mapred.MapTask: numReduceTasks: 1 | |
12/12/25 22:38:36 INFO mapred.MapTask: io.sort.mb = 100 | |
12/12/25 22:38:36 INFO mapred.MapTask: data buffer = 79691776/99614720 | |
12/12/25 22:38:36 INFO mapred.MapTask: record buffer = 262144/327680 | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['doc_id', 'text']]"]["data/rain.txt"]"] | |
12/12/25 22:38:36 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"] | |
12/12/25 22:38:36 INFO hadoop.FlowMapper: sinking to: GroupBy(_pipe_0*_pipe_1)[by:[{1}:'token']] | |
12/12/25 22:38:36 INFO assembly.AggregateBy: using threshold value: 100000 | |
12/12/25 22:38:36 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec | |
12/12/25 22:38:36 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
12/12/25 22:38:36 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec | |
12/12/25 22:38:36 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO mapred.MapTask: Starting flush of map output | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO mapred.MapTask: Finished spill 0 | |
12/12/25 22:38:36 INFO mapred.TaskRunner: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting | |
12/12/25 22:38:36 INFO mapred.LocalJobRunner: file:/Users/ceteri/src/concur/Impatient/part8/data/rain.txt:0+510 | |
12/12/25 22:38:36 INFO mapred.TaskRunner: Task 'attempt_local_0001_m_000000_0' done. | |
12/12/25 22:38:36 INFO mapred.LocalJobRunner: | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO mapred.Merger: Merging 1 sorted segments | |
12/12/25 22:38:36 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 561 bytes | |
12/12/25 22:38:36 INFO mapred.LocalJobRunner: | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO hadoop.FlowReducer: sourcing from: GroupBy(_pipe_0*_pipe_1)[by:[{1}:'token']] | |
12/12/25 22:38:36 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"] | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
12/12/25 22:38:36 INFO mapred.TaskRunner: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting | |
12/12/25 22:38:36 INFO mapred.LocalJobRunner: | |
12/12/25 22:38:36 INFO mapred.TaskRunner: Task attempt_local_0001_r_000000_0 is allowed to commit now | |
12/12/25 22:38:36 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part8/output/wc | |
12/12/25 22:38:36 INFO mapred.LocalJobRunner: reduce > reduce | |
12/12/25 22:38:36 INFO mapred.TaskRunner: Task 'attempt_local_0001_r_000000_0' done. | |
12/12/25 22:38:41 INFO util.Hadoop18TapUtil: deleting temp path output/wc/_temporary | |
bash-3.2$ head output/wc/part-00000 | |
token count | |
air 1 | |
area 4 | |
australia 1 | |
broken 1 | |
california's 1 | |
cause 1 | |
cloudcover 1 | |
death 1 | |
deserts 1 | |
bash-3.2$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bash-3.2$ hadoop -version | |
Warning: $HADOOP_HOME is deprecated. | |
java version "1.6.0_37" | |
Java(TM) SE Runtime Environment (build 1.6.0_37-b06-434-11M3909) | |
Java HotSpot(TM) 64-Bit Server VM (build 20.12-b01-434, mixed mode) | |
bash-3.2$ gradle -version | |
------------------------------------------------------------ | |
Gradle 1.3 | |
------------------------------------------------------------ | |
Gradle build time: Tuesday, November 20, 2012 11:37:38 AM UTC | |
Groovy: 1.8.6 | |
Ant: Apache Ant(TM) version 1.8.4 compiled on May 22 2012 | |
Ivy: 2.2.0 | |
JVM: 1.6.0_37 (Apple Inc. 20.12-b01-434) | |
OS: Mac OS X 10.7.5 x86_64 | |
bash-3.2$ gradle clean jar | |
:clean | |
:compileJava UP-TO-DATE | |
:compileScala | |
:processResources UP-TO-DATE | |
:classes | |
:jar | |
BUILD SUCCESSFUL | |
Total time: 14.744 secs | |
bash-3.2$ rm -rf output/ | |
bash-3.2$ hadoop jar build/libs/impatient.jar Example3 --hdfs \ | |
> --doc data/rain.txt --wc output/wc | |
Warning: $HADOOP_HOME is deprecated. | |
2013-01-03 11:39:53.267 java[21043:1903] Unable to load realm info from SCDynamicStore | |
13/01/03 11:39:53 INFO util.HadoopUtil: resolving application jar from found main method on: com.twitter.scalding.Tool$ | |
13/01/03 11:39:53 INFO planner.HadoopPlanner: using application jar: /tmp/hadoop-ceteri/hadoop-unjar5395931113881089164/lib/scalding_2.9.2-0.8.1.jar | |
13/01/03 11:39:53 INFO property.AppProps: using app.id: C75CAAB35A506A370971626CC5069D95 | |
13/01/03 11:39:53 INFO flow.Flow: [Example3] starting | |
13/01/03 11:39:53 INFO flow.Flow: [Example3] source: Hfs["TextDelimited[['doc_id', 'text']]"]["data/rain.txt"]"] | |
13/01/03 11:39:53 INFO flow.Flow: [Example3] sink: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"] | |
13/01/03 11:39:53 INFO flow.Flow: [Example3] parallel execution is enabled: false | |
13/01/03 11:39:53 INFO flow.Flow: [Example3] starting jobs: 1 | |
13/01/03 11:39:53 INFO flow.Flow: [Example3] allocating threads: 1 | |
13/01/03 11:39:53 INFO flow.FlowStep: [Example3] starting step: (1/1) output/wc | |
13/01/03 11:39:53 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
13/01/03 11:39:53 WARN snappy.LoadSnappy: Snappy native library not loaded | |
13/01/03 11:39:53 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
13/01/03 11:39:53 INFO flow.FlowStep: [Example3] submitted hadoop job: job_local_0001 | |
13/01/03 11:39:53 INFO mapred.Task: Using ResourceCalculatorPlugin : null | |
13/01/03 11:39:53 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:53 INFO io.MultiInputSplit: current split input path: file:/Users/ceteri/src/concur/Impatient/part8/data/rain.txt | |
13/01/03 11:39:53 INFO mapred.MapTask: numReduceTasks: 1 | |
13/01/03 11:39:53 INFO mapred.MapTask: io.sort.mb = 100 | |
13/01/03 11:39:53 INFO mapred.MapTask: data buffer = 79691776/99614720 | |
13/01/03 11:39:53 INFO mapred.MapTask: record buffer = 262144/327680 | |
13/01/03 11:39:54 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:54 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:54 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['doc_id', 'text']]"]["data/rain.txt"]"] | |
13/01/03 11:39:54 INFO hadoop.FlowMapper: sinking to: GroupBy(Tsv(data/rain.txt,'doc_id', 'text',true,false))[by:[{1}:'token']] | |
13/01/03 11:39:54 INFO assembly.AggregateBy: using threshold value: 100000 | |
13/01/03 11:39:54 INFO mapred.MapTask: Starting flush of map output | |
13/01/03 11:39:54 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:54 INFO mapred.MapTask: Finished spill 0 | |
13/01/03 11:39:54 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting | |
13/01/03 11:39:56 INFO mapred.LocalJobRunner: file:/Users/ceteri/src/concur/Impatient/part8/data/rain.txt:0+510 | |
13/01/03 11:39:56 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done. | |
13/01/03 11:39:56 INFO mapred.Task: Using ResourceCalculatorPlugin : null | |
13/01/03 11:39:56 INFO mapred.LocalJobRunner: | |
13/01/03 11:39:56 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:56 INFO mapred.Merger: Merging 1 sorted segments | |
13/01/03 11:39:56 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 797 bytes | |
13/01/03 11:39:56 INFO mapred.LocalJobRunner: | |
13/01/03 11:39:56 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:56 INFO hadoop.FlowReducer: sourcing from: GroupBy(Tsv(data/rain.txt,'doc_id', 'text',true,false))[by:[{1}:'token']] | |
13/01/03 11:39:56 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"] | |
13/01/03 11:39:56 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:56 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:56 INFO hadoop.TupleSerialization: using default comparator: com.twitter.scalding.IntegralComparator | |
13/01/03 11:39:56 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting | |
13/01/03 11:39:56 INFO mapred.LocalJobRunner: | |
13/01/03 11:39:56 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now | |
13/01/03 11:39:56 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part8/output/wc | |
13/01/03 11:39:59 INFO mapred.LocalJobRunner: reduce > reduce | |
13/01/03 11:39:59 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done. | |
13/01/03 11:40:03 INFO util.Hadoop18TapUtil: deleting temp path output/wc/_temporary | |
bash-3.2$ cat output/wc/part-00000 | |
token count | |
a 8 | |
air 1 | |
an 1 | |
and 2 | |
area 4 | |
as 2 | |
australia 1 | |
back 1 | |
broken 1 | |
california's 1 | |
cause 1 | |
cloudcover 1 | |
death 1 | |
deserts 1 | |
downwind 1 | |
dry 3 | |
dvd 1 | |
effect 1 | |
in 1 | |
is 4 | |
known 1 | |
land 2 | |
lee 2 | |
leeward 2 | |
less 1 | |
lies 1 | |
mountain 3 | |
mountainous 1 | |
of 6 | |
on 2 | |
or 2 | |
primary 1 | |
produces 1 | |
rain 5 | |
ranges 1 | |
secrets 1 | |
shadow 4 | |
side 2 | |
sinking 1 | |
such 1 | |
that 1 | |
the 5 | |
this 2 | |
two 1 | |
valley 1 | |
with 1 | |
women 1 | |
bash-3.2$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bash-3.2$ mvn clean package | |
[INFO] Scanning for projects... | |
[INFO] | |
[INFO] ------------------------------------------------------------------------ | |
[INFO] Building part8 1.0.0 | |
[INFO] ------------------------------------------------------------------------ | |
[INFO] | |
[INFO] --- maven-clean-plugin:2.4.1:clean (default-clean) @ part8 --- | |
[INFO] Deleting /Users/ceteri/src/concur/Impatient/part8/target | |
[INFO] | |
[INFO] --- maven-resources-plugin:2.5:resources (default-resources) @ part8 --- | |
[debug] execute contextualize | |
[INFO] Using 'UTF-8' encoding to copy filtered resources. | |
[INFO] Copying 0 resource | |
[INFO] | |
[INFO] --- maven-compiler-plugin:2.3.2:compile (default-compile) @ part8 --- | |
[INFO] No sources to compile | |
[INFO] | |
[INFO] --- scala-maven-plugin:3.1.1:compile (default) @ part8 --- | |
[INFO] /Users/ceteri/src/concur/Impatient/part8/src/main/scala:-1: info: compiling | |
[INFO] Compiling 2 source files to /Users/ceteri/src/concur/Impatient/part8/target/classes at 1357963343144 | |
[INFO] prepare-compile in 0 s | |
[INFO] compile in 5 s | |
[INFO] | |
[INFO] --- maven-resources-plugin:2.5:testResources (default-testResources) @ part8 --- | |
[debug] execute contextualize | |
[INFO] Using 'UTF-8' encoding to copy filtered resources. | |
[INFO] Copying 0 resource | |
[INFO] | |
[INFO] --- maven-compiler-plugin:2.3.2:testCompile (default-testCompile) @ part8 --- | |
[INFO] No sources to compile | |
[INFO] | |
[INFO] --- scala-maven-plugin:3.1.1:testCompile (default) @ part8 --- | |
[WARNING] No source files found. | |
[INFO] | |
[INFO] --- maven-surefire-plugin:2.10:test (default-test) @ part8 --- | |
[INFO] Surefire report directory: /Users/ceteri/src/concur/Impatient/part8/target/surefire-reports | |
------------------------------------------------------- | |
T E S T S | |
------------------------------------------------------- | |
Results : | |
Tests run: 0, Failures: 0, Errors: 0, Skipped: 0 | |
[INFO] | |
[INFO] --- maven-dependency-plugin:2.6:copy-dependencies (copy-dependencies) @ part8 --- | |
[INFO] Copying chill_2.9.2-0.0.3.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/chill_2.9.2-0.0.3.jar | |
[INFO] Copying jsr305-1.3.9.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/jsr305-1.3.9.jar | |
[INFO] Copying antlr-runtime-3.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/antlr-runtime-3.2.jar | |
[INFO] Copying log4j-1.2.16.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/log4j-1.2.16.jar | |
[INFO] Copying scalding_2.9.2-0.8.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/scalding_2.9.2-0.8.1.jar | |
[INFO] Copying guava-10.0.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/guava-10.0.1.jar | |
[INFO] Copying algebird_2.9.2-0.1.3.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/algebird_2.9.2-0.1.3.jar | |
[INFO] Copying ical4j-1.0.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/ical4j-1.0.2.jar | |
[INFO] Copying commons-lang-2.4.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/commons-lang-2.4.jar | |
[INFO] Copying minlog-1.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/minlog-1.2.jar | |
[INFO] Copying commons-logging-1.0.3.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/commons-logging-1.0.3.jar | |
[INFO] Copying cascading-core-2.0.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/cascading-core-2.0.2.jar | |
[INFO] Copying jerkson_2.9.2-0.7.0.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/jerkson_2.9.2-0.7.0.jar | |
[INFO] Copying commons-codec-1.4.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/commons-codec-1.4.jar | |
[INFO] Copying reflectasm-1.07-shaded.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/reflectasm-1.07-shaded.jar | |
[INFO] Copying jgrapht-jdk1.6-0.8.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/jgrapht-jdk1.6-0.8.1.jar | |
[INFO] Copying jackson-annotations-2.0.5.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/jackson-annotations-2.0.5.jar | |
[INFO] Copying meat-locker-0.3.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/meat-locker-0.3.1.jar | |
[INFO] Copying natty-0.7.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/natty-0.7.jar | |
[INFO] Copying objenesis-1.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/objenesis-1.2.jar | |
[INFO] Copying stringtemplate-3.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/stringtemplate-3.2.jar | |
[INFO] Copying janino-2.5.16.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/janino-2.5.16.jar | |
[INFO] Copying slf4j-api-1.6.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/slf4j-api-1.6.1.jar | |
[INFO] Copying riffle-0.1-dev.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/riffle-0.1-dev.jar | |
[INFO] Copying kryo-2.17.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/kryo-2.17.jar | |
[INFO] Copying slf4j-log4j12-1.6.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/slf4j-log4j12-1.6.1.jar | |
[INFO] Copying backport-util-concurrent-3.1.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/backport-util-concurrent-3.1.jar | |
[INFO] Copying cascading.kryo-0.4.5.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/cascading.kryo-0.4.5.jar | |
[INFO] Copying jackson-databind-2.0.5.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/jackson-databind-2.0.5.jar | |
[INFO] Copying antlr-2.7.7.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/antlr-2.7.7.jar | |
[INFO] Copying cascading-local-2.0.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/cascading-local-2.0.2.jar | |
[INFO] Copying scala-library-2.9.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/scala-library-2.9.2.jar | |
[INFO] Copying maple-0.2.4.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/maple-0.2.4.jar | |
[INFO] Copying jackson-core-2.0.5.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/jackson-core-2.0.5.jar | |
[INFO] Copying cascading-hadoop-2.0.2.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/cascading-hadoop-2.0.2.jar | |
[INFO] Copying asm-4.0.jar to /Users/ceteri/src/concur/Impatient/part8/target/classes/lib/asm-4.0.jar | |
[INFO] | |
[INFO] --- maven-jar-plugin:2.4:jar (default-jar) @ part8 --- | |
[INFO] Building jar: /Users/ceteri/src/concur/Impatient/part8/target/part8-1.0.0.jar | |
[INFO] ------------------------------------------------------------------------ | |
[INFO] BUILD SUCCESS | |
[INFO] ------------------------------------------------------------------------ | |
[INFO] Total time: 10.012s | |
[INFO] Finished at: Fri Jan 11 20:02:30 PST 2013 | |
[INFO] Final Memory: 10M/81M | |
[INFO] ------------------------------------------------------------------------ | |
bash-3.2$ |
When I run a Maven-built fat jar on a Hadoop cluster, I get a ClassNotFoundException.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When I run the examples, I get the flow.Flow INFO logging, but I'm not seeing any other logging (especially hadoop and mapred). I checked the logging conf for my Hadoop and it has a console appender on the root logger set to INFO. I thought that would get me the output I was looking for in Scalding. Any idea what I need to set to get Hadoop to log at info? BTW, this is all running with --local.