Created
March 19, 2019 05:26
-
-
Save hohonuuli/c9bb2fc0535c5d30cb524bff81a8ac28 to your computer and use it in GitHub Desktop.
Trivial compression of DNA in Scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.collection.BitSet | |
import scala.collection.mutable | |
/**
 * A DNA sequence packed at two bits per nucleotide.
 *
 * Nucleotide `i` occupies bit positions `2 * i` (high bit) and `2 * i + 1`
 * (low bit) of `bits`, encoded as A = 00, C = 01, G = 10, T = 11.
 *
 * @param nucleotideCount number of nucleotides stored; needed because trailing
 *                        'A's set no bits and would otherwise be lost
 * @param bits            the set bit positions of the packed sequence
 */
final case class CompressedGene(nucleotideCount: Int, bits: BitSet) {

  /** Renders the decompressed, upper-case nucleotide string. */
  override def toString: String = CompressedGene.decompress(this)
}

object CompressedGene {

  /**
   * Compresses a nucleotide string into two bits per nucleotide.
   *
   * Input is case-insensitive; each character is upper-cased before encoding.
   *
   * @param gene sequence made up of the characters A, C, G and T (any case)
   * @return the packed representation of `gene`
   * @throws IllegalArgumentException if `gene` contains any other character
   */
  def apply(gene: String): CompressedGene = {
    val bitSet = new mutable.BitSet
    gene.iterator.zipWithIndex.foreach { case (raw, i) =>
      val high = i * 2
      val low = high + 1
      raw.toUpper match {
        case 'A' => // 0b00: nothing to set
        case 'C' => // 0b01
          bitSet.add(low)
        case 'G' => // 0b10
          bitSet.add(high)
        case 'T' => // 0b11
          bitSet.add(high)
          bitSet.add(low)
        case other =>
          throw new IllegalArgumentException(s"Invalid Nucleotide: $other")
      }
    }
    // Hand out an immutable snapshot so the case class does not hold a
    // mutable collection.
    CompressedGene(gene.length, bitSet.toImmutable)
  }

  /**
   * Expands a compressed gene back into its nucleotide string.
   *
   * @param compressedGene the packed sequence to expand
   * @return a string of `nucleotideCount` characters drawn from A, C, G, T
   */
  def decompress(compressedGene: CompressedGene): String = {
    val bits = compressedGene.bits
    val count = compressedGene.nucleotideCount
    val gene = new StringBuilder(math.max(count, 0))
    for (i <- 0 until count) {
      val nucleotide = (bits(i * 2), bits(i * 2 + 1)) match {
        case (false, false) => 'A'
        case (false, true)  => 'C'
        case (true, false)  => 'G'
        case (true, true)   => 'T'
      }
      gene.append(nucleotide)
    }
    gene.toString
  }
}
Here is the Python code that produces the same output as the Scala version:
# Round-trip check: shuffle a 40,000-character ACGT string, compress it,
# decompress it, and report whether the result matches the input.
# NOTE(review): CompressedGene here is presumably the Python class from the
# original article; it is not defined in this snippet - confirm before running.
import random

acgt = "ACGT" * 10000
# Sampling every element without replacement yields a random permutation.
orig = ''.join(random.sample(list(acgt), len(acgt)))
cmpr = CompressedGene(orig)
expd = cmpr.decompress()
# Two bits per nucleotide means four nucleotides per byte, rounded up.
byte_count = (len(orig) + 3) // 4
print("size orig={}, Bit2ByteCount={}, orig==expd?:{}".format(len(orig), byte_count, (orig == expd)))
print(orig[0:50] + ("..." if len(orig) > 50 else ""))
if orig != expd:
    print(expd)
Oops, a 'typo' in both versions: the byte count should be (len(orig)+3)/4, not (len(orig)+7)/8.
@gryphis Thanks for sharing your version!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this cute application; I read about it on medium.com.
May I offer an alternative proposal for the Scala code?
It's still really close to your Python code, but much more idiomatic Scala.
It's running with Ammonite (ammonite.io) or Scala as a script:
ComprGene.scala:
Cheers, gryphis