Skip to content

Instantly share code, notes, and snippets.

@lindenb
Last active December 10, 2015 01:09
Show Gist options
  • Save lindenb/4356276 to your computer and use it in GitHub Desktop.
Save lindenb/4356276 to your computer and use it in GitHub Desktop.
/*
Author: Pierre Lindenbaum PhD.
*/
import java.io.File;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordIterator;
public class Biostar59647
{
public static void main(String[] args)
throws Exception
{
if(args.length<3)
{
System.err.println("Usage: ref bam chr:start-end chr:start-end ...");
return;
}
File refFile=new File(args[0]);
File bamFile=new File(args[1]);
SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT);
ReferenceSequenceFile reference=ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
SAMFileReader samFileReader=new SAMFileReader(bamFile);
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
w.writeStartDocument("UTF-8","1.0");
w.writeStartElement("biostar59647");
w.writeAttribute("ref", args[0]);
w.writeAttribute("bam", args[1]);
for(int optind=2;optind<args.length;++optind)
{
String s=args[optind];
int colon=s.indexOf(':'); if(colon==-1) continue;
String chrom=s.substring(0,colon);
int t=s.indexOf('-',colon+1);if(t==-1) continue;
int start=Integer.parseInt(s.substring(colon+1,t));
int end=Integer.parseInt(s.substring(t+1));
ReferenceSequence refseq=reference.getSubsequenceAt(chrom, start, end);
if(refseq==null) continue;
byte dna[]=refseq.getBases();
w.writeStartElement("interval");
w.writeAttribute("chrom", chrom);
w.writeAttribute("start", String.valueOf(start));
w.writeAttribute("end", String.valueOf(end));
SAMRecordIterator iter=samFileReader.queryOverlapping(chrom, start, end);
while(iter.hasNext())
{
SAMRecord rec=iter.next();
final byte readbases[]=rec.getReadBases();
w.writeStartElement("read");
w.writeStartElement("name");
w.writeCharacters(rec.getReadName());
w.writeEndElement();
w.writeStartElement("sequence");
w.writeCharacters(new String(readbases));
w.writeEndElement();
w.writeStartElement("flags");
w.writeCharacters(String.valueOf(rec.getFlags()));
w.writeEndElement();
w.writeStartElement("chrom");
w.writeCharacters(rec.getReferenceName());
w.writeEndElement();
w.writeStartElement("pos");
w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
w.writeEndElement();
w.writeStartElement("cigar");
w.writeCharacters(rec.getCigarString());
w.writeEndElement();
w.writeStartElement("align");
int readIndex = 0;
int refIndex = rec.getAlignmentStart();
for (final CigarElement e : rec.getCigar().getCigarElements())
{
switch (e.getOperator())
{
case H : break; // ignore hard clips
case P : break; // ignore pads
case I : //cont.
case S :
{
final int length = e.getLength();
for(int i=0;i<length;++i)
{
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("read-index",String.valueOf(readIndex+1));
if(readIndex>=0 && readIndex< readbases.length)
{
w.writeAttribute("read-base",String.valueOf((char)(readbases[readIndex])));
}
readIndex++;
}
break;
}
case N : //cont. -- reference skip
case D :
{
final int length = e.getLength();
for(int i=0;i<length;++i)
{
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("ref-index",String.valueOf(refIndex));
int idx=refIndex-start;
if(idx>=0 && idx< dna.length)
{
w.writeAttribute("ref-base",String.valueOf((char)(dna[idx])));
}
refIndex++;
}
break;
}
case M :
case EQ :
case X :
{
final int length = e.getLength();
for(int i=0;i<length;++i)
{
w.writeEmptyElement(e.getOperator().name());
char baseRead='\0';
if(readIndex>=0 && readIndex< readbases.length)
{
baseRead=(char)(rec.getReadBases()[readIndex]);
w.writeAttribute("read-index",String.valueOf(readIndex+1));
w.writeAttribute("read-base",String.valueOf(baseRead));
}
w.writeAttribute("ref-index",String.valueOf(refIndex));
int idx2=refIndex-start;
if(idx2>=0 && idx2< dna.length)
{
char baseRef=(char)(dna[idx2]);
w.writeAttribute("ref-base",String.valueOf(baseRef));
if(Character.toUpperCase(baseRef)!=Character.toUpperCase(baseRead))
{
w.writeAttribute("mismatch","true");
}
}
refIndex++;
readIndex++;
}
break;
}
default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
}
}
w.writeEndElement();
w.writeEndElement();
}
iter.close();
w.writeEndElement();
}
samFileReader.close();
w.writeEndElement();
w.writeEndDocument();
w.flush();
w.close();
}
}
<?xml version="1.0" encoding="UTF-8"?>
<biostar59647 ref="/home/lindenb/samtools-0.1.18/examples/toy.fa" bam="/home/lindenb/samtools-0.1.18/examples/toy.bam">
<interval chrom="ref" start="9" end="20">
<read>
<name>r001</name>
<sequence>TTAGATAAAGAGGATACTG</sequence>
<flags>163</flags>
<chrom>ref</chrom>
<pos>7</pos>
<cigar>8M4I4M1D3M</cigar>
<align>
<M read-index="1" read-base="T" ref-index="7"/>
<M read-index="2" read-base="T" ref-index="8"/>
<M read-index="3" read-base="A" ref-index="9" ref-base="A"/>
<M read-index="4" read-base="G" ref-index="10" ref-base="G"/>
<M read-index="5" read-base="A" ref-index="11" ref-base="A"/>
<M read-index="6" read-base="T" ref-index="12" ref-base="T"/>
<M read-index="7" read-base="A" ref-index="13" ref-base="A"/>
<M read-index="8" read-base="A" ref-index="14" ref-base="A"/>
<I read-index="9" read-base="A"/>
<I read-index="10" read-base="G"/>
<I read-index="11" read-base="A"/>
<I read-index="12" read-base="G"/>
<M read-index="13" read-base="G" ref-index="15" ref-base="G"/>
<M read-index="14" read-base="A" ref-index="16" ref-base="A"/>
<M read-index="15" read-base="T" ref-index="17" ref-base="T"/>
<M read-index="16" read-base="A" ref-index="18" ref-base="A"/>
<D ref-index="19" ref-base="G"/>
<M read-index="17" read-base="C" ref-index="20" ref-base="C"/>
<M read-index="18" read-base="T" ref-index="21"/>
<M read-index="19" read-base="G" ref-index="22"/>
</align>
</read>
<read>
<name>r002</name>
<sequence>AAAAGATAAGGGATAAA</sequence>
<flags>0</flags>
<chrom>ref</chrom>
<pos>9</pos>
<cigar>1S2I6M1P1I1P1I4M2I</cigar>
<align>
<S read-index="1" read-base="A"/>
<I read-index="2" read-base="A"/>
<I read-index="3" read-base="A"/>
<M read-index="4" read-base="A" ref-index="9" ref-base="A"/>
<M read-index="5" read-base="G" ref-index="10" ref-base="G"/>
<M read-index="6" read-base="A" ref-index="11" ref-base="A"/>
<M read-index="7" read-base="T" ref-index="12" ref-base="T"/>
<M read-index="8" read-base="A" ref-index="13" ref-base="A"/>
<M read-index="9" read-base="A" ref-index="14" ref-base="A"/>
<I read-index="10" read-base="G"/>
<I read-index="11" read-base="G"/>
<M read-index="12" read-base="G" ref-index="15" ref-base="G"/>
<M read-index="13" read-base="A" ref-index="16" ref-base="A"/>
<M read-index="14" read-base="T" ref-index="17" ref-base="T"/>
<M read-index="15" read-base="A" ref-index="18" ref-base="A"/>
<I read-index="16" read-base="A"/>
<I read-index="17" read-base="A"/>
</align>
</read>
<read>
<name>r003</name>
<sequence>AGCTAA</sequence>
<flags>0</flags>
<chrom>ref</chrom>
<pos>9</pos>
<cigar>5H6M</cigar>
<align>
<M read-index="1" read-base="A" ref-index="9" ref-base="A"/>
<M read-index="2" read-base="G" ref-index="10" ref-base="G"/>
<M read-index="3" read-base="C" ref-index="11" ref-base="A" mismatch="true"/>
<M read-index="4" read-base="T" ref-index="12" ref-base="T"/>
<M read-index="5" read-base="A" ref-index="13" ref-base="A"/>
<M read-index="6" read-base="A" ref-index="14" ref-base="A"/>
</align>
</read>
<read>
<name>r004</name>
<sequence>ATAGCTCTCAGC</sequence>
<fl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment