Last active
December 10, 2015 01:09
-
-
Save lindenb/4356276 to your computer and use it in GitHub Desktop.
Code for http://www.biostars.org/p/59647/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Author: Pierre Lindenbaum PhD. | |
*/ | |
import java.io.File; | |
import javax.xml.stream.XMLOutputFactory; | |
import javax.xml.stream.XMLStreamWriter; | |
import net.sf.picard.reference.ReferenceSequence; | |
import net.sf.picard.reference.ReferenceSequenceFile; | |
import net.sf.picard.reference.ReferenceSequenceFileFactory; | |
import net.sf.samtools.CigarElement; | |
import net.sf.samtools.SAMFileReader; | |
import net.sf.samtools.SAMFileReader.ValidationStringency; | |
import net.sf.samtools.SAMRecord; | |
import net.sf.samtools.SAMRecordIterator; | |
public class Biostar59647 | |
{ | |
public static void main(String[] args) | |
throws Exception | |
{ | |
if(args.length<3) | |
{ | |
System.err.println("Usage: ref bam chr:start-end chr:start-end ..."); | |
return; | |
} | |
File refFile=new File(args[0]); | |
File bamFile=new File(args[1]); | |
SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); | |
ReferenceSequenceFile reference=ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); | |
SAMFileReader samFileReader=new SAMFileReader(bamFile); | |
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance(); | |
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8"); | |
w.writeStartDocument("UTF-8","1.0"); | |
w.writeStartElement("biostar59647"); | |
w.writeAttribute("ref", args[0]); | |
w.writeAttribute("bam", args[1]); | |
for(int optind=2;optind<args.length;++optind) | |
{ | |
String s=args[optind]; | |
int colon=s.indexOf(':'); if(colon==-1) continue; | |
String chrom=s.substring(0,colon); | |
int t=s.indexOf('-',colon+1);if(t==-1) continue; | |
int start=Integer.parseInt(s.substring(colon+1,t)); | |
int end=Integer.parseInt(s.substring(t+1)); | |
ReferenceSequence refseq=reference.getSubsequenceAt(chrom, start, end); | |
if(refseq==null) continue; | |
byte dna[]=refseq.getBases(); | |
w.writeStartElement("interval"); | |
w.writeAttribute("chrom", chrom); | |
w.writeAttribute("start", String.valueOf(start)); | |
w.writeAttribute("end", String.valueOf(end)); | |
SAMRecordIterator iter=samFileReader.queryOverlapping(chrom, start, end); | |
while(iter.hasNext()) | |
{ | |
SAMRecord rec=iter.next(); | |
final byte readbases[]=rec.getReadBases(); | |
w.writeStartElement("read"); | |
w.writeStartElement("name"); | |
w.writeCharacters(rec.getReadName()); | |
w.writeEndElement(); | |
w.writeStartElement("sequence"); | |
w.writeCharacters(new String(readbases)); | |
w.writeEndElement(); | |
w.writeStartElement("flags"); | |
w.writeCharacters(String.valueOf(rec.getFlags())); | |
w.writeEndElement(); | |
w.writeStartElement("chrom"); | |
w.writeCharacters(rec.getReferenceName()); | |
w.writeEndElement(); | |
w.writeStartElement("pos"); | |
w.writeCharacters(String.valueOf(rec.getAlignmentStart())); | |
w.writeEndElement(); | |
w.writeStartElement("cigar"); | |
w.writeCharacters(rec.getCigarString()); | |
w.writeEndElement(); | |
w.writeStartElement("align"); | |
int readIndex = 0; | |
int refIndex = rec.getAlignmentStart(); | |
for (final CigarElement e : rec.getCigar().getCigarElements()) | |
{ | |
switch (e.getOperator()) | |
{ | |
case H : break; // ignore hard clips | |
case P : break; // ignore pads | |
case I : //cont. | |
case S : | |
{ | |
final int length = e.getLength(); | |
for(int i=0;i<length;++i) | |
{ | |
w.writeEmptyElement(e.getOperator().name()); | |
w.writeAttribute("read-index",String.valueOf(readIndex+1)); | |
if(readIndex>=0 && readIndex< readbases.length) | |
{ | |
w.writeAttribute("read-base",String.valueOf((char)(readbases[readIndex]))); | |
} | |
readIndex++; | |
} | |
break; | |
} | |
case N : //cont. -- reference skip | |
case D : | |
{ | |
final int length = e.getLength(); | |
for(int i=0;i<length;++i) | |
{ | |
w.writeEmptyElement(e.getOperator().name()); | |
w.writeAttribute("ref-index",String.valueOf(refIndex)); | |
int idx=refIndex-start; | |
if(idx>=0 && idx< dna.length) | |
{ | |
w.writeAttribute("ref-base",String.valueOf((char)(dna[idx]))); | |
} | |
refIndex++; | |
} | |
break; | |
} | |
case M : | |
case EQ : | |
case X : | |
{ | |
final int length = e.getLength(); | |
for(int i=0;i<length;++i) | |
{ | |
w.writeEmptyElement(e.getOperator().name()); | |
char baseRead='\0'; | |
if(readIndex>=0 && readIndex< readbases.length) | |
{ | |
baseRead=(char)(rec.getReadBases()[readIndex]); | |
w.writeAttribute("read-index",String.valueOf(readIndex+1)); | |
w.writeAttribute("read-base",String.valueOf(baseRead)); | |
} | |
w.writeAttribute("ref-index",String.valueOf(refIndex)); | |
int idx2=refIndex-start; | |
if(idx2>=0 && idx2< dna.length) | |
{ | |
char baseRef=(char)(dna[idx2]); | |
w.writeAttribute("ref-base",String.valueOf(baseRef)); | |
if(Character.toUpperCase(baseRef)!=Character.toUpperCase(baseRead)) | |
{ | |
w.writeAttribute("mismatch","true"); | |
} | |
} | |
refIndex++; | |
readIndex++; | |
} | |
break; | |
} | |
default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); | |
} | |
} | |
w.writeEndElement(); | |
w.writeEndElement(); | |
} | |
iter.close(); | |
w.writeEndElement(); | |
} | |
samFileReader.close(); | |
w.writeEndElement(); | |
w.writeEndDocument(); | |
w.flush(); | |
w.close(); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<biostar59647 ref="/home/lindenb/samtools-0.1.18/examples/toy.fa" bam="/home/lindenb/samtools-0.1.18/examples/toy.bam"> | |
<interval chrom="ref" start="9" end="20"> | |
<read> | |
<name>r001</name> | |
<sequence>TTAGATAAAGAGGATACTG</sequence> | |
<flags>163</flags> | |
<chrom>ref</chrom> | |
<pos>7</pos> | |
<cigar>8M4I4M1D3M</cigar> | |
<align> | |
<M read-index="1" read-base="T" ref-index="7"/> | |
<M read-index="2" read-base="T" ref-index="8"/> | |
<M read-index="3" read-base="A" ref-index="9" ref-base="A"/> | |
<M read-index="4" read-base="G" ref-index="10" ref-base="G"/> | |
<M read-index="5" read-base="A" ref-index="11" ref-base="A"/> | |
<M read-index="6" read-base="T" ref-index="12" ref-base="T"/> | |
<M read-index="7" read-base="A" ref-index="13" ref-base="A"/> | |
<M read-index="8" read-base="A" ref-index="14" ref-base="A"/> | |
<I read-index="9" read-base="A"/> | |
<I read-index="10" read-base="G"/> | |
<I read-index="11" read-base="A"/> | |
<I read-index="12" read-base="G"/> | |
<M read-index="13" read-base="G" ref-index="15" ref-base="G"/> | |
<M read-index="14" read-base="A" ref-index="16" ref-base="A"/> | |
<M read-index="15" read-base="T" ref-index="17" ref-base="T"/> | |
<M read-index="16" read-base="A" ref-index="18" ref-base="A"/> | |
<D ref-index="19" ref-base="G"/> | |
<M read-index="17" read-base="C" ref-index="20" ref-base="C"/> | |
<M read-index="18" read-base="T" ref-index="21"/> | |
<M read-index="19" read-base="G" ref-index="22"/> | |
</align> | |
</read> | |
<read> | |
<name>r002</name> | |
<sequence>AAAAGATAAGGGATAAA</sequence> | |
<flags>0</flags> | |
<chrom>ref</chrom> | |
<pos>9</pos> | |
<cigar>1S2I6M1P1I1P1I4M2I</cigar> | |
<align> | |
<S read-index="1" read-base="A"/> | |
<I read-index="2" read-base="A"/> | |
<I read-index="3" read-base="A"/> | |
<M read-index="4" read-base="A" ref-index="9" ref-base="A"/> | |
<M read-index="5" read-base="G" ref-index="10" ref-base="G"/> | |
<M read-index="6" read-base="A" ref-index="11" ref-base="A"/> | |
<M read-index="7" read-base="T" ref-index="12" ref-base="T"/> | |
<M read-index="8" read-base="A" ref-index="13" ref-base="A"/> | |
<M read-index="9" read-base="A" ref-index="14" ref-base="A"/> | |
<I read-index="10" read-base="G"/> | |
<I read-index="11" read-base="G"/> | |
<M read-index="12" read-base="G" ref-index="15" ref-base="G"/> | |
<M read-index="13" read-base="A" ref-index="16" ref-base="A"/> | |
<M read-index="14" read-base="T" ref-index="17" ref-base="T"/> | |
<M read-index="15" read-base="A" ref-index="18" ref-base="A"/> | |
<I read-index="16" read-base="A"/> | |
<I read-index="17" read-base="A"/> | |
</align> | |
</read> | |
<read> | |
<name>r003</name> | |
<sequence>AGCTAA</sequence> | |
<flags>0</flags> | |
<chrom>ref</chrom> | |
<pos>9</pos> | |
<cigar>5H6M</cigar> | |
<align> | |
<M read-index="1" read-base="A" ref-index="9" ref-base="A"/> | |
<M read-index="2" read-base="G" ref-index="10" ref-base="G"/> | |
<M read-index="3" read-base="C" ref-index="11" ref-base="A" mismatch="true"/> | |
<M read-index="4" read-base="T" ref-index="12" ref-base="T"/> | |
<M read-index="5" read-base="A" ref-index="13" ref-base="A"/> | |
<M read-index="6" read-base="A" ref-index="14" ref-base="A"/> | |
</align> | |
</read> | |
<read> | |
<name>r004</name> | |
<sequence>ATAGCTCTCAGC</sequence> | |
<fl | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment