Skip to content

Instantly share code, notes, and snippets.

@hohonuuli
Created March 19, 2019 05:26
Show Gist options
  • Save hohonuuli/c9bb2fc0535c5d30cb524bff81a8ac28 to your computer and use it in GitHub Desktop.
Save hohonuuli/c9bb2fc0535c5d30cb524bff81a8ac28 to your computer and use it in GitHub Desktop.
Trivial compression of DNA in Scala
import scala.collection.BitSet
import scala.collection.mutable
case class CompressedGene(nucleotideCount: Int, bits: BitSet) {
override def toString(): String = CompressedGene.decompress(this)
}
object CompressedGene {
def apply(gene: String): CompressedGene = {
val bitSet = new mutable.BitSet
for (i <- 0 until gene.size) {
val nucleotide = gene(i).toUpper
val a = i * 2
val b = a + 1
nucleotide match {
case 'A' => // Do nothing // 0b00
case 'C' => bitSet.add(b) // 0b01
case 'G' => bitSet.add(a) // 0b10
case 'T' => // 0b11
bitSet.add(a)
bitSet.add(b)
case s: Char =>
throw new IllegalArgumentException(s"Invalide Nucleotide: $s")
}
}
CompressedGene(gene.size, bitSet)
}
def decompress(compressedGene: CompressedGene): String = {
val gene: StringBuilder = new StringBuilder()
for (i <- 0 until compressedGene.nucleotideCount) {
val a = i * 2
val b = a + 1
val (b0, b1) = (compressedGene.bits(a), compressedGene.bits(b))
val nucleotide = if (!b0 && !b1) "A"
else if (!b0 && b1) "C"
else if (b0 && !b1) "G"
else "T"
gene.append(nucleotide)
}
gene.toString
}
}
@gryphis
Copy link

gryphis commented Oct 21, 2019

Thanks for this cute application, I read it at medium.com.
May I handover an alternative proposal for the Scala text?
It's still really close to your Python code but much more the way of Scala.

It's running with Ammonite (ammonite.io) or Scala as a script:

$ amm --help | head -n1
Ammonite REPL & Script-Runner, 1.7.4
$ amm ComprGene.scala 
Compiling /home/IGEL/kahnt/Scala/ComprGene.scala
size orig=40000, Bit2ByteCount=5000, orig==expd?:true
AGCTGACGCGTGGCATCAGGTGGTGGTAGATACGGCTGGGCCTTTCACGT...
$ scala --version
Scala code runner version 2.13.0 -- Copyright 2002-2019, LAMP/EPFL and Lightbend, Inc.
$ scala ComprGene.scala 
size orig=40000, Bit2ByteCount=5000, orig==expd?:true
GACGTTCATGGAATCCTCTTTAAGGCTCAGTCGACTCTCTGGTTTGTTAT...

ComprGene.scala:

import scala.collection.mutable.BitSet

case class CompressedGene(nucleotideCount: Int, bits: BitSet) {
  def decompress: String = {
    val gene = for (i <- 0 until this.nucleotideCount; bitpos <- Some(i*2)) yield {
      (this.bits(bitpos+1), this.bits(bitpos)) match {
        case (false,false) => "A"
        case (false, true) => "C"
        case ( true,false) => "G"
        case ( true, true) => "T"
      }
    }
    gene.mkString
  }
  override def toString(): String = this.decompress
}

object CompressedGene {
  def apply(gene: String): CompressedGene = {
    var bitSet = new BitSet
    for ((nucleotide, i) <- gene.zipWithIndex; bitpos <- Some(i*2)) {
      nucleotide.toUpper match {
        case 'A' => // Do nothing                  // 0b00
        case 'C' => bitSet +=               bitpos // 0b01
        case 'G' => bitSet += (bitpos+1)           // 0b10
        case 'T' => bitSet += (bitpos+1) += bitpos // 0b11
        case unknown: Char =>
          throw new IllegalArgumentException(s"Invalid Nucleotide: $unknown")
      }
    }
    CompressedGene(gene.size, bitSet)
  }
}

val orig = scala.util.Random.shuffle("ACGT"*10000).toString
val cmpr = CompressedGene(orig)
val expd = cmpr.toString
println(s"size orig=${orig.size}, Bit2ByteCount=${(orig.size+7)/8}, orig==expd?:${(orig == expd)}")
println(orig.slice(0,50)+{if (orig.size > 50) "..."; else ""})
if (orig != expd) {
  println(expd)
}

Cheers, gryphis

@gryphis
Copy link

gryphis commented Oct 21, 2019

Here the Python code for the same output like Scala:

import random
acgt = "ACGT"*10000
orig = ''.join(random.sample(list(acgt),len(acgt)))
cmpr = CompressedGene(orig)
expd = cmpr.decompress()
print("size orig={}, Bit2ByteCount={}, orig==expd?:{}".format(len(orig), int((len(orig)+7)/8), (orig == expd)))
print(orig[0:50]+("..." if len(orig) > 50 else ""))
if (orig != expd):
    print(expd)

@gryphis
Copy link

gryphis commented Oct 22, 2019

Ups, a 'typo' in both: (len(orig)+7)/8 ==> (len(orig)+3)/4

@hohonuuli
Copy link
Author

@gryphis Thanks for sharing your version!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment