Created
December 14, 2017 18:00
-
-
Save nh13/9094165d2225039a07f8b220bb0547a3 to your computer and use it in GitHub Desktop.
Bioinformatic Tool Series: SAM to FASTQ with UMIs and Barcodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* The MIT License | |
* | |
* Copyright (c) 2017 Fulcrum Genomics LLC | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
* | |
*/ | |
package com.fulcrumgenomics.bam | |
import com.fulcrumgenomics.FgBioDef._ | |
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource} | |
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} | |
import com.fulcrumgenomics.commons.CommonsDef.PathToBam | |
import com.fulcrumgenomics.commons.collection.SelfClosingIterator | |
import com.fulcrumgenomics.commons.io.{Io, PathUtil} | |
import com.fulcrumgenomics.commons.util.LazyLogging | |
import com.fulcrumgenomics.fastq.{FastqRecord, FastqWriter} | |
import com.fulcrumgenomics.sopt._ | |
import com.fulcrumgenomics.util.ProgressLogger | |
import htsjdk.samtools.util.SequenceUtil | |
object SamToFastq { | |
/** The tag that stores bases to prepend to SEQ. */ | |
val FivePrimeBasesTag: String = "S5" | |
/** The tag that stores qualities to prepend to SEQ. */ | |
val FivePrimeQualitiesTag: String = "Q5" | |
/** The tag that stores bases to append to SEQ. */ | |
val ThreePrimeBasesTag: String = "S3" | |
/** The tag that stores qualities to append to SEQ. */ | |
val ThreePrimeQualitiesTag: String = "Q3" | |
} | |
@clp(description = | |
""" | |
|Converts a SAM or BAM to FASTQ. | |
| | |
|Three outputs will be created: | |
|1. Fragment reads will be written to the file `<output>.fragments.fq.gz` | |
|2. First of pair reads will be written to the file `<output>.R1.fq.gz` | |
|3. Second of pair reads will be written to the file `<output>.R2.fq.gz` | |
| | |
|Bases and qualities stored in the five prime and three prime bases/qualities tags will be prepended and appended | |
|to the bases/qualities found in the SEQ/QUAL field respectively. The SEQ field will be reverse complemented if | |
|mapped to the reverse strand. | |
""", | |
group = ClpGroups.SamOrBam) | |
class SamToFastq | |
( @arg(flag='i', doc="Input BAM file.") val input: PathToBam, | |
@arg(flag='o', doc="Output FASTQ file prefix.") val output: FilePath, | |
@arg(doc="The SAM tag storing the 5' bases to prepend to SEQ.") val fivePrimeBasesTag: String = SamToFastq.FivePrimeBasesTag, | |
@arg(doc="The SAM tag storing the 5' qualities to prepend to SEQ.") val fivePrimeQualitiesTag: String = SamToFastq.FivePrimeQualitiesTag, | |
@arg(doc="The SAM tag storing the 3' bases to append to SEQ.") val threePrimeBasesTag: String = SamToFastq.ThreePrimeBasesTag, | |
@arg(doc="The SAM tag storing the 3' qualities to append to SEQ.") val threePrimeQualitiesTag: String = SamToFastq.ThreePrimeQualitiesTag | |
) extends FgBioTool with LazyLogging { | |
Io.assertReadable(input) | |
Io.assertCanWriteFile(output) | |
validate(fivePrimeBasesTag.length == 2, s"--five-prime-bases-tag must be length two, was '${fivePrimeBasesTag.length}'") | |
validate(fivePrimeQualitiesTag.length == 2, s"--five-prime-qualities-tag must be length two, was '${fivePrimeQualitiesTag.length}'") | |
validate(threePrimeBasesTag.length == 2, s"--three-prime-bases-tag must be length two, was '${threePrimeBasesTag.length}'") | |
validate(threePrimeQualitiesTag.length == 2, s"--three-prime-qualities-tag must be length two, was '${threePrimeQualitiesTag.length}'") | |
override def execute(): Unit = { | |
val writers: Map[ReadType, FastqWriter] = ReadType.values.map { | |
case _: ReadType.Fragment.type => ReadType.Fragment -> FastqWriter(PathUtil.pathTo(output + ".fragments.fq.gz")) | |
case _: ReadType.ReadOne.type => ReadType.ReadOne -> FastqWriter(PathUtil.pathTo(output + ".R1.fq.gz")) | |
case _: ReadType.ReadTwo.type => ReadType.ReadTwo -> FastqWriter(PathUtil.pathTo(output + ".R2.fq.gz")) | |
}.toMap | |
val iterator = buildIterator() | |
val progress = ProgressLogger(logger, verb="written", unit=5e6.toInt) | |
iterator.foreach { rec => | |
val readType = ReadType.from(rec) | |
val writer = writers(readType) | |
val fastq = toFastqRecord(rec, readType) | |
writer += fastq | |
progress.record() | |
} | |
writers.values.foreach(_.safelyClose) | |
} | |
/** Builds an iterator ensuring the records are returned in queryname order. If the input is not queryname sorted, | |
* the records are sorted. */ | |
private def buildIterator(): Iterator[SamRecord] = { | |
val in = SamSource(input) | |
val order = SamOrder(in.header).getOrElse(SamOrder.Unsorted) | |
if (order == SamOrder.Queryname) { | |
new SelfClosingIterator(in.iterator.bufferBetter, () => in.safelyClose()) | |
} | |
else { | |
logger.info("Sorting into queryname order.") | |
val progress = ProgressLogger(this.logger, "Queryname sorted") | |
val sort = Bams.sorter(SamOrder.Queryname, in.header) | |
in.foreach { rec => | |
sort += rec | |
progress.record(rec) | |
} | |
new SelfClosingIterator(sort.iterator.bufferBetter, () => sort.close()) | |
} | |
} | |
/** Converts a [[SamRecord]] into a [[FastqRecord]]. Prepends and appends to the bases the sequence stored in the | |
* five prime and three prime bases tags respectively. Similarly, prepends and appends to the qualities the qualities | |
* stored in the five prime and three prime qualities tags respectively. */ | |
private def toFastqRecord(rec: SamRecord, readType: ReadType): FastqRecord = { | |
val bases = rec.bases | |
val quals = if (rec.positiveStrand) rec.qualsString else rec.qualsString.reverse | |
if (rec.negativeStrand) SequenceUtil.reverseComplement(bases) | |
val fivePrimeBases = rec.get(this.fivePrimeBasesTag).getOrElse("") | |
val fivePrimeQuals = rec.get(this.fivePrimeQualitiesTag).getOrElse("") | |
val threePrimeBases = rec.get(this.threePrimeBasesTag).getOrElse("") | |
val threePrimeQuals = rec.get(this.threePrimeQualitiesTag).getOrElse("") | |
val readNumber = readType match { | |
case _: ReadType.Fragment.type => None | |
case _: ReadType.ReadOne.type => Some(1) | |
case _: ReadType.ReadTwo.type => Some(2) | |
} | |
FastqRecord( | |
name = rec.name, | |
bases = fivePrimeBases + new String(bases) + threePrimeBases, | |
quals = fivePrimeQuals + quals + threePrimeQuals, | |
readNumber = readNumber | |
) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* The MIT License | |
* | |
* Copyright (c) 2017 Fulcrum Genomics LLC | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
* | |
*/ | |
package com.fulcrumgenomics.bam | |
import java.nio.file.Files | |
import com.fulcrumgenomics.FgBioDef.{DirPath, FilePath} | |
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} | |
import com.fulcrumgenomics.commons.io.PathUtil | |
import com.fulcrumgenomics.fastq.{FastqRecord, FastqSource} | |
import com.fulcrumgenomics.testing.SamBuilder._ | |
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} | |
import org.scalatest.OptionValues | |
class SamToFastqTest extends UnitSpec with OptionValues { | |
private def addTags(rec: SamRecord, b5: Option[String] = None, q5: Option[String] = None, b3: Option[String] = None, q3: Option[String] = None): Unit = { | |
b5.foreach { value => rec(SamToFastq.FivePrimeBasesTag) = value} | |
q5.foreach { value => rec(SamToFastq.FivePrimeQualitiesTag) = value} | |
b3.foreach { value => rec(SamToFastq.ThreePrimeBasesTag) = value} | |
q3.foreach { value => rec(SamToFastq.ThreePrimeQualitiesTag) = value} | |
} | |
private object FastqResults { | |
def apply(path: DirPath): FastqResults = { | |
FastqResults( | |
frags = FastqSource(PathUtil.pathTo(path + ".fragments.fq.gz")).toSeq, | |
r1s = FastqSource(PathUtil.pathTo(path + ".R1.fq.gz")).toSeq, | |
r2s = FastqSource(PathUtil.pathTo(path + ".R2.fq.gz")).toSeq | |
) | |
} | |
} | |
private case class FastqResults(frags: Seq[FastqRecord], r1s: Seq[FastqRecord], r2s: Seq[FastqRecord]) | |
private def checkFastqRecord(rec: FastqRecord, name: String, bases: String, quals: String): Unit = { | |
rec.name shouldBe name | |
rec.bases shouldBe bases | |
rec.quals shouldBe quals | |
} | |
private def toOutput: FilePath = { | |
val dir = Files.createTempDirectory("SamToFastq") | |
dir.toFile.deleteOnExit() | |
PathUtil.pathTo(dir.toAbsolutePath.toString, "fastqs") | |
} | |
"SamToFastq" should "convert fragment records" in { | |
val builder = new SamBuilder(sort=Some(SamOrder.Queryname), readLength=10) | |
builder.addFrag(name="q1", bases="A"*10, quals="I"*10, unmapped=true).foreach { rec => addTags(rec=rec, b5=Some("A"), q5=Some("5"), b3=Some("C"), q3=Some("3")) } | |
builder.addFrag(name="q2", bases="A"*10, quals="I"*10, start=10, strand=Plus).foreach { rec => addTags(rec=rec, b5=Some("C"), q5=Some("6"), b3=Some("G"), q3=Some("4")) } | |
builder.addFrag(name="q3", bases="A"*10, quals="I"*5 + "J"*5, start=10, strand=Minus).foreach { rec => addTags(rec=rec, b5=Some("G"), q5=Some("7"), b3=Some("T"), q3=Some("5")) } | |
val output = toOutput | |
new SamToFastq(input=builder.toTempFile(), output=output).execute() | |
val results = FastqResults(output) | |
results.frags should have size 3 | |
results.r1s shouldBe 'empty | |
results.r2s shouldBe 'empty | |
checkFastqRecord(rec=results.frags(0), name="q1", bases="A"*11 + "C", quals="5" + "I"*10 + "3") | |
checkFastqRecord(rec=results.frags(1), name="q2", bases="C" + "A"*10 + "G", quals="6" + "I"*10 + "4") | |
checkFastqRecord(rec=results.frags(2), name="q3", bases="G" + "T"*10 + "T", quals="7" + "J"*5 + "I"*5 + "5") | |
} | |
it should "convert paired records" in { | |
val builder = new SamBuilder(sort=Some(SamOrder.Queryname), readLength=10) | |
// unmapped | |
builder.addPair(name="q1", bases1="A"*10, quals1="I"*10, unmapped1=true, bases2="A"*10, quals2="I"*10, unmapped2=true).foreach { rec => | |
addTags(rec=rec, b5=Some("A"), q5=Some("5"), b3=Some("C"), q3=Some("3")) | |
} | |
// FR | |
builder.addPair(name="q2", bases1="A"*10, quals1="I"*5 + "J"*5, start1=10, bases2="A"*10, quals2="I"*5 + "J"*5, start2=10).foreach { rec => | |
addTags(rec=rec, b5=Some("C"), q5=Some("6"), b3=Some("G"), q3=Some("4")) } | |
// RF | |
builder.addPair(name="q3", bases1="A"*10, quals1="I"*5 + "J"*5, start1=10, strand1=Minus, bases2="A"*10, quals2="I"*5 + "J"*5, start2=10, strand2=Plus).foreach { rec => | |
addTags(rec=rec, b5=Some("G"), q5=Some("7"), b3=Some("T"), q3=Some("5")) } | |
val output = toOutput | |
new SamToFastq(input=builder.toTempFile(), output=output).execute() | |
val results = FastqResults(output) | |
results.frags shouldBe 'empty | |
results.r1s should have size 3 | |
results.r2s should have size 3 | |
checkFastqRecord(rec=results.r1s(0), name="q1", bases="A"*11 + "C", quals="5" + "I"*10 + "3") | |
checkFastqRecord(rec=results.r1s(1), name="q2", bases="C" + "A"*10 + "G", quals="6" + "I"*5 + "J"*5 + "4") // forward | |
checkFastqRecord(rec=results.r1s(2), name="q3", bases="G" + "T"*10 + "T", quals="7" + "J"*5 + "I"*5 + "5") // reverse | |
checkFastqRecord(rec=results.r2s(0), name="q1", bases="A" + "T"*10 + "C", quals="5" + "I"*10 + "3") | |
checkFastqRecord(rec=results.r2s(1), name="q2", bases="C" + "T"*10 + "G", quals="6" + "J"*5 + "I"*5 + "4") // reverse | |
checkFastqRecord(rec=results.r2s(2), name="q3", bases="G" + "A"*10 + "T", quals="7" + "I"*5 + "J"*5 + "5") // forward | |
} | |
it should "not prepend or append if no tags are given" in { | |
val builder = new SamBuilder(sort=Some(SamOrder.Queryname), readLength=10) | |
builder.addFrag(name="q1", bases="A"*10, quals="I"*10, unmapped=true) | |
builder.addPair(name="q2", bases1="A"*10, quals1="I"*10, unmapped1=true, bases2="A"*10, quals2="I"*10, unmapped2=true) | |
val output = toOutput | |
new SamToFastq(input=builder.toTempFile(), output=output).execute() | |
val results = FastqResults(output) | |
results.frags should have size 1 | |
results.r1s should have size 1 | |
results.r2s should have size 1 | |
checkFastqRecord(rec=results.frags(0), name="q1", bases="A"*10, quals="I"*10) | |
checkFastqRecord(rec=results.r1s(0), name="q2", bases="A"*10, quals="I"*10) | |
checkFastqRecord(rec=results.r2s(0), name="q2", bases="T"*10, quals="I"*10) | |
} | |
it should "sort the input to queryname" in { | |
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10) | |
builder.addFrag(name="q1", bases="A"*10, quals="I"*10, unmapped=true) | |
builder.addFrag(name="q2", bases="A"*10, quals="I"*10, start=100) | |
builder.addFrag(name="q3", bases="A"*10, quals="I"*10, start=50) | |
val output = toOutput | |
new SamToFastq(input=builder.toTempFile(), output=output).execute() | |
builder.map(_.name).toSeq should contain theSameElementsInOrderAs Seq("q3", "q2", "q1") | |
val results = FastqResults(output) | |
results.frags should have size 3 | |
results.r1s shouldBe 'empty | |
results.r2s shouldBe 'empty | |
checkFastqRecord(rec=results.frags(0), name="q1", bases="A"*10, quals="I"*10) | |
checkFastqRecord(rec=results.frags(1), name="q2", bases="A"*10, quals="I"*10) | |
checkFastqRecord(rec=results.frags(2), name="q3", bases="A"*10, quals="I"*10) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment