public
Last active

cascading.avro wordcount example

  • Download Gist
AvroReadExample.java
Java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
package cascading.avro.examples;
 
import java.util.Properties;
 
import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.avro.AvroScheme;
 
import org.apache.avro.Schema;
 
 
public class
AvroReadExample
{
public static void
main( String[] args ) throws Exception
{
String docPath = args[ 0 ];
String wcPath = args[ 1 ];
 
 
Properties properties = new Properties();
AppProps.setApplicationJarClass( properties, AvroReadExample.class );
HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );
 
// create source and sink taps
// Source is Avro, note there is no schema needed.
Tap docTap = new Hfs( new AvroScheme(), docPath );
Tap wcTap = new Hfs( new TextDelimited(), wcPath, true );
 
Pipe wcPipe = new Pipe( "wordcount" );
wcPipe = new GroupBy( wcPipe, new Fields("count") );
wcPipe = new Every( wcPipe, Fields.ALL, new Count(new Fields("countcount")), Fields.ALL );
 
 
// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef()
.setName( "wc" )
.addSource( wcPipe, docTap )
.addTailSink( wcPipe, wcTap );
 
// write a DOT file and run the flow
Flow wcFlow = flowConnector.connect( flowDef );
wcFlow.writeDOT( "dot/wcr.dot" );
wcFlow.complete();
}
}
WordCountAvroWrite.java
Java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
package cascading.avro.examples;
 
import java.io.InputStream;
import java.util.Properties;
 
import cascading.avro.AvroScheme;
import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
 
import org.apache.avro.Schema;
 
 
public class
WordCountAvroWrite
{
public static void
main( String[] args ) throws Exception
{
String docPath = args[ 0 ];
String wcPath = args[ 1 ];
 
// Get the schema from a file
Schema schema = new Schema.Parser().parse(WordCountAvroWrite.class.getResourceAsStream("/wc.avsc"));
 
Properties properties = new Properties();
AppProps.setApplicationJarClass( properties, WordCountAvroWrite.class );
HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );
 
 
// create source and sink taps
Tap docTap = new Hfs( new TextLine( new Fields("text")), docPath );
// Create the output tap with AvroScheme and the schema we read up.
Tap wcTap = new Hfs( new AvroScheme( schema ), wcPath );
 
//Everything from here on is the same. No changes needed.
 
// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields( "token" );
Fields text = new Fields( "text" );
RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
// only returns "token"
Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );
 
// determine the word counts
Pipe wcPipe = new Pipe( "wc", docPipe );
wcPipe = new GroupBy( wcPipe, token );
wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );
 
// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef()
.setName( "wc" )
.addSource( docPipe, docTap )
.addTailSink( wcPipe, wcTap );
 
// write a DOT file and run the flow
Flow wcFlow = flowConnector.connect( flowDef );
wcFlow.writeDOT( "dot/wc.dot" );
wcFlow.complete();
}
}
wc.avsc
1 2 3 4 5 6 7 8 9
{
"type":"record",
"name":"wordcount",
"namespace":"cascading.avro.examples",
"fields":[
{"name":"token", "type":"string"},
{"name":"count", "type":"long"}
]
}

Count words from your favorite source and write the results to Avro with the WordCountAvroWrite job. Then read the Avro files back with the AvroReadExample job.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.