Skip to content

Instantly share code, notes, and snippets.

@dwijnand
Last active September 17, 2015 13:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dwijnand/ff36c425f17d4b9430af to your computer and use it in GitHub Desktop.
Save dwijnand/ff36c425f17d4b9430af to your computer and use it in GitHub Desktop.
// sbt.version = 0.13.5
lazy val `norvig-ngrams` = (project in file(".")
settings (
name := "norvig-ngrams",
scalaVersion := "2.11.2",
incOptions := incOptions.value.withNameHashing(true),
scalacOptions ++= Seq("-encoding", "utf8"),
scalacOptions ++= Seq("-deprecation", "-feature", "-unchecked", "-Xlint"),
scalacOptions += "-Xfatal-warnings",
scalacOptions += "-Yinline-warnings",
scalacOptions += "-Yno-adapted-args",
scalacOptions += "-Ywarn-value-discard",
libraryDependencies += "org.specs2" %% "specs2" % "2.4" % "test",
excludeFilter in Test := (excludeFilter in unmanagedSources).value, // unlink
excludeFilter in Compile ~= (_ || "*Spec.scala"),
includeFilter in Test ~= (_ && "*Spec.scala"))
settings (inConfig(Test)(Defaults.addBaseSources): _*)
settings (
parallelExecution in Test := false,
logBuffered in Test := false)
)
import scala.language.{higherKinds, implicitConversions}
import scala.collection.generic.CanBuildFrom
import scala.util.{Random, Try}
import scala.{PartialFunction => =?>}
import scala.collection.mutable
import scala.io.Source
import scala.math.log10
import java.util.NoSuchElementException
package ngrams {
object Ngrams {
//// Word Segmentation (p. 223) ////
/** Return a list of words that is the best segmentation of text. */
lazy val segment: String => Seq[String] = Memo1 { text =>
if (text.isEmpty) Seq.empty
else {
val candidates = splits(text) map { case (first, rem) => Seq(first) ++ segment(rem) }
candidates maxBy Pwords
}
}
/** Return a list of all possible (first, rem) pairs, len(first)<=L. */
def splits(text: String, L: Int = 20): Seq[(String, String)] =
0 until (text.length min L) map (i => text splitAt i + 1)
/** The Naive Bayes probability of a sequence of words. */
def Pwords(words: Seq[String]): Double = (words map Pw).product
/** A probability distribution estimated from counts in datafile. */
final case class Pdist(
data: Vector[Vector[String]],
_N: Option[Long] = None,
_missingFn: Option[(String, Double) => Double] = None)
extends (String => Double) {
val dict = {
val b = mutable.Map[String, Long]()
data foreach { entry =>
val Seq(key, count) = entry
b += key -> (b.getOrElse(key, 0L) + count.toLong)
}
b.toMap
}
val N = _N.getOrElse(dict.size.toLong).toDouble
val missingFn = _missingFn getOrElse ((key: String, N: Double) => 1.0 / N)
def get(key: String): Long = dict(key)
def apply(key: String): Double = {
if (dict contains key) dict(key) / N
else missingFn(key, N)
}
}
/** Read key,value pairs from file */
def datafile(name: String, sep: Char = '\t'): Vector[Vector[String]] =
(Source fromFile s"../ngrams/$name").getLines().toVector map (_ split sep) map (_.toVector)
/** Estimate the probability of an unknown word */
def avoidLongWords(word: String, N: Double) = 10.0 / (N * (10 pow word.length))
val N = 1024908267229L // Number of tokens in corpus
val Pw = Pdist(datafile("count_1w.txt"), Some(N), Some(avoidLongWords _))
//// segment2: second version, with bigram counts, (p. 226-227) ////
/** The conditional probability P(word | previous-word) */
def cPw(word: String, prev: String): Double = {
try {
(P2w get s"$prev $word") / (Pw get prev).toDouble
} catch {
case _: NoSuchElementException => Pw(word)
}
}
val P2w = Pdist(datafile("count_2w.txt"), Some(N))
/** Return (log P(words), words), where words is the best segmentation. */
lazy val segment2: (String, Option[String]) => (Double, Seq[String]) = Memo2 { (text, prevOpt) =>
if (text.isEmpty) (0.0, Seq.empty[String])
else {
val prev = prevOpt getOrElse "<S>"
val candidates = splits(text) map { case (first, rem) =>
combine(log10(cPw(first, prev)), first, segment2(rem, first))
}
candidates maxBy (_._1)
}
}
def segment2(text: String, prev: String = "<S>"): (Double, Seq[String]) = segment2(text, Some(prev))
/** Combine first and rem results into one (probability, words) pair. */
def combine(Pfirst: Double, first: String, PremRem: (Double, Seq[String])): (Double, Seq[String]) =
(Pfirst + PremRem._1, first +: PremRem._2)
//// Secret Codes (p. 228-230) ////
/** Encode a message with a substitution cipher. */
def encode(msg: String, key0: String) = {
val key = ul(key0)
val alphabet52 = ul(alphabet)
msg map (ch => alphabet52 index ch map key.apply getOrElse ch)
}
def ul(text: String) = text.toUpperCase ++ text.toLowerCase
val alphabet = "abcdefghijklmnopqrstuvwxyz"
/** Encode a message with a shift (Caesar) cipher. */
def shift(msg: String, n: Int = 13) = encode(msg, (alphabet substring n) ++ alphabet.substring(0, n))
/** The Naive Bayes probability of a sequence of words. */
def logPwords(text: String): Double = logPwords(allwords(text))
def logPwords(words: Seq[String]): Double = (words map (w => log10(Pw(w)))).sum
/** Return a list of alphabetic words in text, lowercase. */
def allwords(text: String) = ("[a-z]+".r findAllIn text.toLowerCase).toVector
/** Find the best decoding of a message encoded with a shift cipher. */
def decodeShift(msg: String) = {
val candidates = (0 until alphabet.length) map (n => shift(msg, n))
candidates maxBy logPwords
}
/** Encode with a shift (Caesar) cipher, yielding only letters [a-z]. */
def shift2(msg: String, n: Int = 13) = shift(justLetters(msg), n)
/** Lowercase text and remove all characters expect [a-z]. */
def justLetters(text: String) = "[^a-z]".r.replaceAllIn(text.toLowerCase, "")
/** Decode a message encoded with a shift ciper, with no spaces. */
def decodeShift2(msg: String) = {
val candidates = (0 until alphabet.length) map (n => segment2(shift(msg, n)))
val (_, words) = candidates maxBy (_._1)
words mkString " "
}
//// General substitution cipher (p. 231-233) ////
/** The log-probability of text using a letter 3-gram model. */
def logP3letters(text: String) = (ngrams(text, 3) map (g => log10(P3l(g)))).sum
/** List all the (overlapping) ngrams in a sequence. */
def ngrams(seq: String, n: Int) = (0 to (seq.length - n)) map (i => seq.substring(i, i + n))
val P3l = Pdist(datafile("count_3l.txt"))
val P2l = Pdist(datafile("count_2l.txt"))
/** Search for an x that miximizes f(x), considering neighbors(x). */
def hillclimb(
x0: String,
f: String => Double,
neighbors: String => TraversableOnce[String],
steps: Int = 10000)
= {
var x = x0
var fx = f(x)
var neighborhood = neighbors(x).toIterator
(0 until steps) foreach { i =>
val x2 = neighborhood.next()
val fx2 = f(x2)
if (fx2 > fx) {
x = x2
fx = fx2
neighborhood = neighbors(x).toIterator
}
}
x
}
/** Decode a substitution cipher with random restart hillclimbing. */
def decodeSubst(msg0: String, steps: Int = 4000, restarts: Int = 20) = {
val msg = allwords(msg0) mkString ""
val candidates = (0 until restarts) map { r =>
hillclimb(
encode(msg, alphabet.shuffle() mkString ""),
logP3letters,
neighboringMsgs,
steps)
}
val (_, words) = candidates map (x => segment2(x)) maxBy (_._1)
words mkString " "
}
/** Generate nearby keys, hopefully better ones. */
def neighboringMsgs(msg: String): TraversableOnce[String] = {
def swap(a: Char, b: Char) = (
msg.iterator
map (_.toInt)
map (i => if (i == a) -1 else i)
map (i => if (i == b) a else i)
map (i => if (i == -1) b else i)
map (_.toChar)
mkString ""
)
val smallest = ngrams(msg, 2) map (n => (P2l(n), n)) sortBy (_._1) take 20 map (_._2)
val bigramSwaps = smallest flatMap { bigram =>
val List(b1, b2) = bigram.toList
alphabet flatMap {
case c if b1 == b2 =>
if (P2l("" + c + c) > P2l(bigram)) Some(swap(c, b1)) else None
case c if P2l("" + c + b2) > P2l(bigram) => Some(swap(c, b1))
case c if P2l("" + b1 + c) > P2l(bigram) => Some(swap(c, b2))
case _ => None
}
}
bigramSwaps.toStream ++
(Stream continually swap(alphabet.randomChoice().get, alphabet.randomChoice().get))
}
}
case class Memo1[T1, R](f: T1 => R) extends (T1 => R) {
private val cache = mutable.Map.empty[T1, R]
def apply(v1: T1) = cache getOrElseUpdate(v1, f(v1))
}
case class Memo2[T1, T2, R](f: (T1, T2) => R) extends ((T1, T2) => R) {
private val cache = mutable.Map.empty[(T1, T2), R]
def apply(v1: T1, v2: T2) = cache getOrElseUpdate((v1, v2), f(v1, v2))
}
}
package object ngrams {
@inline implicit class StringW(private val s: String) extends AnyVal {
@inline def toLongOpt: Option[Long] = Try(s.toLong).toOption
@inline def getOrElse(idx: Int, default: =>Char) =
if (idx >= 0 && idx < s.length) s(idx) else default
@inline def index(ch: Char) = s indexOf ch match {
case -1 => None
case i => Some(i)
}
@inline def shuffle(rand: Random = Random): String = s.toTraversable shuffle rand mkString ""
@inline def randomChoice(rand: Random = Random) = s.toTraversable randomChoice rand
@inline def splitAtOpt(n: Int) =
s.toVector splitAtOpt n map { case (a, b) => (a mkString "", b mkString "")}
}
@inline implicit class IntW(private val i: Int) extends AnyVal {
@inline def pow(exp: Int) = math.pow(i.toDouble, exp.toDouble)
}
@inline implicit class RandomW(private val r: Random) extends AnyVal {
def withSeed(s: Long): Random = { r setSeed s; r }
}
@inline implicit class TraversableOnceW[A, CC[X] <: TraversableOnce[X]](private val xs: CC[A])
extends AnyVal
def shuffle(rand: Random = Random)(implicit bf: CanBuildFrom[CC[A], A, CC[A]]) =
rand.shuffle(xs)(bf)
def randomChoice[A1 >: A](rand: Random = Random) = (rand shuffle xs.toTraversable).headOption
}
@inline implicit class AnyW[A](private val x: A) extends AnyVal {
@inline def |>[B](f: A => B): B = f(x)
@inline def maybe[B](pf: A =?> B): Option[B] = pf lift x
}
@inline implicit class VectorW[A](private val xs: Vector[A]) extends AnyVal {
@inline def splitAtLast: (Vector[A], Option[A]) = {
val idx = xs.length - 1
(xs take idx, (xs drop idx).headOption)
}
@inline def add[A1 >: A](x: A1): Vector[A1] = xs :+ x
@inline def addOpt[A1 >: A](x: Option[A1]): Vector[A1] = xs ++ x.toVector
@inline def splitAtOpt(n: Int): Option[(Vector[A], Vector[A])] =
if (xs.length >= n) Some(xs splitAt n) else None
}
}
package ngrams
import org.specs2.matcher.MatchResult
import org.specs2.mutable.{Specification, Tables}
class NgramsSpec extends Specification with Tables {
import Ngrams._
def expect[A](matchResults: MatchResult[A]*): MatchResult[A] = matchResults reduce (_ and _)
val declOfIndep = "when in the course of human events it becomes necessary"
val taleOf2Cities = "it was the best of times it was the worst of times it was the age of wisdom " +
"it was the age of foolishness"
val metamorphosis = "as gregor samsa awoke one morning from uneasy dreams he found himself " +
"transformed in his bed into a gigantic insect"
def hobbit(sitdown: String) = s"in a hole in the ground there lived a hobbit not a nasty dirty " +
s"wet hole filled with the ends of worms and an oozy smell nor yet a dry bare sandy hole with " +
s"nothing in it to $sitdown on or to eat it was a hobbit hole and that means comfort"
val h2g2 = "far out in the uncharted backwaters of the unfashionable end of the western spiral " +
"arm of the galaxy lies a small un regarded yellow sun"
val starwars = "general kenobi years ago you served my father in the clone wars now he begs you " +
"to help him in his struggle against the empire"
"Pw" >> {
val text = declOfIndep filter (_ != ' ')
"first" | "P(first)" | "P(rem)" | "P(first) x P(rem)" |
"w" ! 2E-4 ! 2E-33 ! 4E-37 |
"wh" ! 5E-6 ! 6E-33 ! 3E-38 |
"whe" ! 3E-7 ! 3E-32 ! 9E-39 |
"when" ! 6E-4 ! 7E-29 ! 4E-32 |
"wheni" ! 1E-16 ! 3E-30 ! 3E-46 |
"whenin" ! 1E-17 ! 8E-27 ! 8E-44 |> {
(first, Pfirst, Prem, Pproduct) => {
val rem = text drop first.length
expect(
Pfirst * Prem must be ~(Pproduct, Pproduct),
Pw(first) must be ~(Pfirst, Pfirst),
Pw(rem) must be ~(Prem, Prem)
)
}
}
}
def segmentStrs(hobbit: String) =
"choose spain" ::
"this is a test" ::
declOfIndep ::
"who represents" ::
"experts exchange" ::
"speed of art" ::
"now is the time for all good" ::
"it is a truth universally acknowledged" ::
taleOf2Cities ::
metamorphosis ::
hobbit ::
h2g2 ::
Nil
def runSegment(segment: String => Seq[String]) =
(text: String) => segment(text filter (_ != ' ')) === (text split ' ').toSeq
"segment" >> foreach(segmentStrs(hobbit("sitdown" )))(runSegment(segment))
"segment2" >> foreach(segmentStrs(hobbit("sit down")))(runSegment((s: String) => segment2(s)._2))
"shift" >> {
"secret" >> {
shift("Listen, do you want to know a secret?") ==== "Yvfgra, qb lbh jnag gb xabj n frperg?"
}
"hal9000" >> (shift("HAL 9000 xyz", 1) ==== "IBM 9000 yza")
}
"decodeShift" >> {
decodeShift("Yvfgra, qb lbh jnag gb xabj n frperg?") ==== "Listen, do you want to know a secret?"
}
"decodeShift2" >> foreach (
"listen do you want to know a secret" ::
"rosebud" ::
"is it safe" ::
"whats the frequency kenneth" ::
starwars ::
Nil)(text => decodeShift2(shift2(text)) ==== text)
"decodeSubst" >> {
"secret code breaker" >> pending {
val msg = "DSDRO XFIJV DIYSB ANQAL TAIMX VBDMB GASSA QRTRT CGGXJ MMTQC IPJSB AQPDR SDIMS " +
"DUAMB CQCMS AQDRS DMRJN SBAGC IYTCY ASBCS MQXKS CICGX RSRCQ ACOGA SJPAS AQHDI ASBAK GCDIS " +
"AWSJN CMDKB AQHAR RCYAE"
val dec = "it is by knowing the frequency which letters usually occur and other distinctive " +
"characteristics of the language that crypt analysts are able to determine the plain text " +
"of a cipher message j"
dec ==== decodeSubst(msg)
}
"ww1 german spy" >> pending {
val msg = "NKDIF SERLJ MIBFK FKDLV NQIBR HLCJU KFTFL KSTEN YQNDQ NTTEB TTENM QLJFS NOSUM " +
"MLQTL CTENC QNKRE BTTBR HKLQT ELCBQ QBSFS KLTML SSFAI NLKBR RLUKT LCJUK FTFLK FKSUC CFRFN " +
"KRYXB"
val dec = "english complaining over lack of munitions they regret that the promised support " +
"of the french attack north of arras is not possible on account of munition insufficiency wa"
dec ==== decodeSubst(msg)
}
"1992 KGB to CIA" >> pending {
val msg = "CNLGV QVELH WTTAI LEHOT WEQVP CEBTQ FJNPP EDMFM LFCYF SQFSP NDHQF OEUTNPPTPP CTDQN" +
" IFSQD TWHTN HHLFJ OLFSD HQFED HEGNQ TWVNQ HTNHH LFJWE BBITS PTHDT XQQFO EUTYF SLFJE " +
"DEFDN IFSQG NLNGN PCTTQ EDOED FGQFI TLXNI"
val dec = "march third week bridge with smile to pass info from you to us and to give " +
"assessment about new dead drop ground to indicate what dead drop will be used next to give" +
" your opinion about caracas meeting in october xab"
dec ==== decodeSubst(msg)
}
"1943 german u-boat" >> pending {
val msg = "WLJIU JYBRK PWFPF IJQSK PWRSS WEPTM MJRBS BJIRA BASPP IHBGP RWMWQ SOPSV PPIMJ " +
"BISUF WIFOT HWBIS WBIQW FBJRB GPILP PXLPM SAJQQ PMJQS RJASW LSBLW GBHMJ QSWIL PXWOL"
val dec = "a cony ov is headed northeast take up positions fifteen miles apart between point " +
"yd and bu maintain radio silence except for reports of tactical importance x abc"
dec ==== decodeSubst(msg)
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment