ccsevers/AvroReadExample.java

## AvroReadExample.java
package cascading.avro.examples;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.avro.AvroScheme;

import org.apache.avro.Schema;


public class
  AvroReadExample
  {
  public static void
  main( String[] args ) throws Exception
    {
    String docPath = args[ 0 ];
    String wcPath = args[ 1 ];


    Properties properties = new Properties();

    AppProps.setApplicationJarClass( properties, AvroReadExample.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create source and sink taps
    // Source is Avro, note there is no schema needed.
    Tap docTap = new Hfs( new AvroScheme(), docPath );
    Tap wcTap = new Hfs( new TextDelimited(), wcPath, true );

    Pipe wcPipe = new Pipe( "wordcount" );
    wcPipe = new GroupBy( wcPipe, new Fields("count") );
    wcPipe = new Every( wcPipe, Fields.ALL, new Count(new Fields("countcount")), Fields.ALL );


    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
     .setName( "wc" )
     .addSource( wcPipe, docTap )
     .addTailSink( wcPipe, wcTap );

    // write a DOT file and run the flow
    Flow wcFlow = flowConnector.connect( flowDef );
    wcFlow.writeDOT( "dot/wcr.dot" );
    wcFlow.complete();
    }
  }

## AvroWordCountWrite.java
package cascading.avro.examples;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.avro.AvroScheme;

import org.apache.avro.Schema;


public class
  WordCountAvroWrite
  {
  public static void
  main( String[] args ) throws Exception
    {
    String docPath = args[ 0 ];
    String wcPath = args[ 1 ];

    // Get the schema from a file
    Schema schema = new Schema.Parser().parse(WordCountAvroWrite.class.getResourceAsStream("/wc.avsc"));

    Properties properties = new Properties();

    AppProps.setApplicationJarClass( properties, WordCountAvroWrite.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );


    // create source and sink taps
    Tap docTap = new Hfs( new TextLine( new Fields("text")), docPath );
    // Create the output tap with AvroScheme and the schema we read up.
    Tap wcTap = new Hfs( new AvroScheme( schema ), wcPath );

    //Everything from here on is the same. No changes needed.

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    // only returns "token"
    Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

    // determine the word counts
    Pipe wcPipe = new Pipe( "wc", docPipe );
    wcPipe = new GroupBy( wcPipe, token );
    wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
     .setName( "wc" )
     .addSource( docPipe, docTap )
     .addTailSink( wcPipe, wcTap );

    // write a DOT file and run the flow
    Flow wcFlow = flowConnector.connect( flowDef );
    wcFlow.writeDOT( "dot/wc.dot" );
    wcFlow.complete();
    }
  }

## wc.avsc
{
	"type":"record",
	"name":"wordcount",
	"namespace":"cascading.avro.examples",
	"fields":[
		{"name":"token", "type":"string"},
		{"name":"count", "type":"long"}
	]
}
	package cascading.avro.examples;

	import java.util.Properties;

	import cascading.flow.Flow;
	import cascading.flow.FlowDef;
	import cascading.flow.hadoop.HadoopFlowConnector;
	import cascading.operation.aggregator.Count;
	import cascading.operation.regex.RegexFilter;
	import cascading.operation.regex.RegexSplitGenerator;
	import cascading.pipe.Each;
	import cascading.pipe.Every;
	import cascading.pipe.GroupBy;
	import cascading.pipe.Pipe;
	import cascading.property.AppProps;
	import cascading.scheme.Scheme;
	import cascading.scheme.hadoop.TextDelimited;
	import cascading.scheme.hadoop.TextLine;
	import cascading.tap.Tap;
	import cascading.tap.hadoop.Hfs;
	import cascading.tuple.Fields;
	import cascading.avro.AvroScheme;

	import org.apache.avro.Schema;


	public class
	AvroReadExample
	{
	public static void
	main( String[] args ) throws Exception
	{
	String docPath = args[ 0 ];
	String wcPath = args[ 1 ];


	Properties properties = new Properties();

	AppProps.setApplicationJarClass( properties, AvroReadExample.class );
	HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

	// create source and sink taps
	// Source is Avro, note there is no schema needed.
	Tap docTap = new Hfs( new AvroScheme(), docPath );
	Tap wcTap = new Hfs( new TextDelimited(), wcPath, true );

	Pipe wcPipe = new Pipe( "wordcount" );
	wcPipe = new GroupBy( wcPipe, new Fields("count") );
	wcPipe = new Every( wcPipe, Fields.ALL, new Count(new Fields("countcount")), Fields.ALL );


	// connect the taps, pipes, etc., into a flow
	FlowDef flowDef = FlowDef.flowDef()
	.setName( "wc" )
	.addSource( wcPipe, docTap )
	.addTailSink( wcPipe, wcTap );

	// write a DOT file and run the flow
	Flow wcFlow = flowConnector.connect( flowDef );
	wcFlow.writeDOT( "dot/wcr.dot" );
	wcFlow.complete();
	}
	}
	{
	"type":"record",
	"name":"wordcount",
	"namespace":"cascading.avro.examples",
	"fields":[
	{"name":"token", "type":"string"},
	{"name":"count", "type":"long"}
	]
	}