Skip to content

Instantly share code, notes, and snippets.

@bderickson
Created November 1, 2012 21:49
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save bderickson/3996847 to your computer and use it in GitHub Desktop.
Avro Reducer that fails with "No field named chromosome in: null"
public static class AvroBamCoverageReducer
extends AvroReducer<String, Integer, Pair<GenericRecord, Integer>> {
public void reduce(String key, Iterator<Integer> values,
AvroCollector<Pair<GenericRecord, Integer>> collector, Reporter reporter)
throws IOException {
int sum = 0;
while(values.hasNext()) {
Integer value = values.next();
sum += value.intValue();
}
GenericRecord record = new GenericData.Record(coverageSchema);
String[] parts = key.toString().split(":");
record.put("chromosome", parts[0]);
record.put("position", parts[1]);
record.put("depth", sum);
// I couldn't figure out how to have null as the second part of the pair
Pair pair = new Pair(record, new Integer(parts[0]));
collector.collect(pair);
}
}
/**
 * Configures and runs the BAM coverage job.
 *
 * <p>Expects exactly two arguments after the generic Hadoop options: the input
 * path and the output path. Any existing output path is deleted before the
 * job is submitted.
 *
 * @param args command-line arguments: generic Hadoop options, then {@code <in> <out>}
 * @throws Exception if job setup or execution fails
 */
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: BAMCoverage <in> <out>");
    System.exit(2);
  }

  // Apply every AvroJob setting to conf BEFORE the Job is constructed. Job
  // takes its own copy of the configuration, so changes made to conf after
  // that point never reach the submitted job.
  Schema stringSchema = Schema.create(Schema.Type.STRING);
  Schema intSchema = Schema.create(Schema.Type.INT);
  Schema mapPairSchema = Pair.getPairSchema(stringSchema, intSchema);
  AvroJob.setMapOutputSchema(conf, mapPairSchema);
  AvroJob.setReducerClass(conf, AvroBamCoverageReducer.class);
  Schema reducePairSchema = Pair.getPairSchema(coverageSchema, intSchema);
  AvroJob.setOutputSchema(conf, reducePairSchema);

  // NOTE(review): this still pairs a new-API mapper and input format with the
  // old-API Avro reducer helpers; confirm the mapper emits the AvroKey/AvroValue
  // wrappers the Avro shuffle configuration expects.
  Job job = new Job(conf, "BAM Coverage");
  job.setJarByClass(BAMCoverage.class);
  job.setMapperClass(BamCoverageMapper.class);
  job.setInputFormatClass(AnySAMInputFormat.class);

  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  Path outputPath = new Path(otherArgs[1]);
  // Explicit recursive delete replaces the deprecated single-argument form.
  outputPath.getFileSystem(conf).delete(outputPath, true);
  FileOutputFormat.setOutputPath(job, outputPath);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
{
"type" : "record",
"name" : "PerPositionCoverage",
"namespace" : "edu.wustl.genome.hadoop.coverage",
"fields" : [ {
"name" : "chromosome",
"type" : "string"
}, {
"name" : "position",
"type" : "int"
}, {
"name" : "depth",
"type" : "int"
} ]
}
2012-11-01 16:27:26,906 WARN org.apache.hadoop.mapred.Child: Error running child
org.apache.avro.file.DataFileWriter$AppendWriteException: org.apache.avro.AvroRuntimeException: No field named chromosome in: null
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:261)
at org.apache.avro.mapred.AvroOutputFormat$1.write(AvroOutputFormat.java:116)
at org.apache.avro.mapred.AvroOutputFormat$1.write(AvroOutputFormat.java:113)
at org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:446)
at org.apache.avro.mapred.HadoopReducer$ReduceCollector.collect(HadoopReducer.java:50)
at org.apache.avro.mapred.AvroReducer.reduce(AvroReducer.java:50)
at org.apache.avro.mapred.HadoopReducerBase.reduce(HadoopReducerBase.java:61)
at org.apache.avro.mapred.HadoopReducerBase.reduce(HadoopReducerBase.java:30)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:469)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:417)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:416)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1232)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
Caused by: org.apache.avro.AvroRuntimeException: No field named chromosome in: null
at org.apache.avro.reflect.ReflectData.findField(ReflectData.java:194)
at org.apache.avro.reflect.ReflectData.getField(ReflectData.java:179)
at org.apache.avro.reflect.ReflectData.getField(ReflectData.java:96)
at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:102)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:65)
at org.apache.avro.reflect.ReflectDatumWriter.write(ReflectDatumWriter.java:102)
at org.apache.avro.generic.GenericDatumWriter.writeRecord(GenericDatumWriter.java:104)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:65)
at org.apache.avro.reflect.ReflectDatumWriter.write(ReflectDatumWriter.java:102)
at org.apache.avro.generic.GenericDatumWriter.write(GenericDatumWriter.java:57)
at org.apache.avro.file.DataFileWriter.append(DataFileWriter.java:255)
... 14 more
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment