public
Created

Avro Reducer that fails with "No field named chromosome in: null"

  • Download Gist
Avro Reducer
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
public static class AvroBamCoverageReducer
extends AvroReducer<String, Integer, Pair<GenericRecord, Integer>> {
 
public void reduce(String key, Iterator<Integer> values,
AvroCollector<Pair<GenericRecord, Integer>> collector, Reporter reporter)
throws IOException {
int sum = 0;
while(values.hasNext()) {
Integer value = values.next();
sum += value.intValue();
}
 
GenericRecord record = new GenericData.Record(coverageSchema);
String[] parts = key.toString().split(":");
record.put("chromosome", parts[0]);
record.put("position", parts[1]);
record.put("depth", sum);
 
// I couldn't figure out how to have null as the second part of the pair
Pair pair = new Pair(record, new Integer(parts[0]));
collector.collect(pair);
}
}
 
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: BAMCoverage <in> <out>");
System.exit(2);
}
 
Job job = new Job(conf, "BAM Coverage");
job.setJarByClass(BAMCoverage.class);
 
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
Path outputPath = new Path(otherArgs[1]);
outputPath.getFileSystem(conf).delete(outputPath);
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
 
job.setMapperClass(BamCoverageMapper.class);
job.setInputFormatClass(AnySAMInputFormat.class);
Schema stringSchema = Schema.create(Schema.Type.STRING);
Schema intSchema = Schema.create(Schema.Type.INT);
Schema mapPairSchema = Pair.getPairSchema(stringSchema, intSchema);
AvroJob.setMapOutputSchema(conf, mapPairSchema);
 
AvroJob.setReducerClass(conf, AvroBamCoverageReducer.class);
Schema reducePairSchema = Pair.getPairSchema(coverageSchema, intSchema);
AvroJob.setOutputSchema(conf, reducePairSchema);
 
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Avro Schema
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
{
"type" : "record",
"name" : "PerPositionCoverage",
"namespace" : "edu.wustl.genome.hadoop.coverage",
"fields" : [ {
"name" : "chromosome",
"type" : "string"
}, {
"name" : "position",
"type" : "int"
}, {
"name" : "depth",
"type" : "int"
} ]
}
Stack Trace
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
2012-11-01 16:27:26,906 WARN org.apache.hadoop.mapred.Child: Error running child
org.apache.avro.file.DataFileWriter$AppendWriteException: org.apache.avro.AvroRuntimeException: No field named chromosome in: null
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:261)
at org.apache.avro.mapred.AvroOutputFormat$1.write(AvroOutputFormat.java:116)
at org.apache.avro.mapred.AvroOutputFormat$1.write(AvroOutputFormat.java:113)
at org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:446)
at org.apache.avro.mapred.HadoopReducer$ReduceCollector.collect(HadoopReducer.java:50)
at org.apache.avro.mapred.AvroReducer.reduce(AvroReducer.java:50)
at org.apache.avro.mapred.HadoopReducerBase.reduce(HadoopReducerBase.java:61)
at org.apache.avro.mapred.HadoopReducerBase.reduce(HadoopReducerBase.java:30)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:469)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:417)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:416)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1232)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
Caused by: org.apache.avro.AvroRuntimeException: No field named chromosome in: null
at org.apache.avro.reflect.ReflectData.findField(ReflectData.java:194)
at org.apache.avro.reflect.ReflectData.getField(ReflectData.java:179)
at org.apache.avro.reflect.ReflectData.getField(ReflectData.java:96)
at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:102)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:65)
at org.apache.avro.reflect.ReflectDatumWriter.write(ReflectDatumWriter.java:102)
at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:104)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:65)
at org.apache.avro.reflect.ReflectDatumWriter.write(ReflectDatumWriter.java:102)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:57)
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:255)
... 14 more

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.