Last active
September 17, 2015 13:28
-
-
Save dwijnand/ff36c425f17d4b9430af to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// sbt.version = 0.13.5 | |
lazy val `norvig-ngrams` = (project in file(".") | |
settings ( | |
name := "norvig-ngrams", | |
scalaVersion := "2.11.2", | |
incOptions := incOptions.value.withNameHashing(true), | |
scalacOptions ++= Seq("-encoding", "utf8"), | |
scalacOptions ++= Seq("-deprecation", "-feature", "-unchecked", "-Xlint"), | |
scalacOptions += "-Xfatal-warnings", | |
scalacOptions += "-Yinline-warnings", | |
scalacOptions += "-Yno-adapted-args", | |
scalacOptions += "-Ywarn-value-discard", | |
libraryDependencies += "org.specs2" %% "specs2" % "2.4" % "test", | |
excludeFilter in Test := (excludeFilter in unmanagedSources).value, // unlink | |
excludeFilter in Compile ~= (_ || "*Spec.scala"), | |
includeFilter in Test ~= (_ && "*Spec.scala")) | |
settings (inConfig(Test)(Defaults.addBaseSources): _*) | |
settings ( | |
parallelExecution in Test := false, | |
logBuffered in Test := false) | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.language.{higherKinds, implicitConversions} | |
import scala.collection.generic.CanBuildFrom | |
import scala.util.{Random, Try} | |
import scala.{PartialFunction => =?>} | |
import scala.collection.mutable | |
import scala.io.Source | |
import scala.math.log10 | |
import java.util.NoSuchElementException | |
package ngrams { | |
object Ngrams { | |
//// Word Segmentation (p. 223) //// | |
/** Return a list of words that is the best segmentation of text. */ | |
lazy val segment: String => Seq[String] = Memo1 { text => | |
if (text.isEmpty) Seq.empty | |
else { | |
val candidates = splits(text) map { case (first, rem) => Seq(first) ++ segment(rem) } | |
candidates maxBy Pwords | |
} | |
} | |
/** Return a list of all possible (first, rem) pairs, len(first)<=L. */ | |
def splits(text: String, L: Int = 20): Seq[(String, String)] = | |
0 until (text.length min L) map (i => text splitAt i + 1) | |
/** The Naive Bayes probability of a sequence of words. */ | |
def Pwords(words: Seq[String]): Double = (words map Pw).product | |
/** A probability distribution estimated from counts in datafile. */ | |
final case class Pdist( | |
data: Vector[Vector[String]], | |
_N: Option[Long] = None, | |
_missingFn: Option[(String, Double) => Double] = None) | |
extends (String => Double) { | |
val dict = { | |
val b = mutable.Map[String, Long]() | |
data foreach { entry => | |
val Seq(key, count) = entry | |
b += key -> (b.getOrElse(key, 0L) + count.toLong) | |
} | |
b.toMap | |
} | |
val N = _N.getOrElse(dict.size.toLong).toDouble | |
val missingFn = _missingFn getOrElse ((key: String, N: Double) => 1.0 / N) | |
def get(key: String): Long = dict(key) | |
def apply(key: String): Double = { | |
if (dict contains key) dict(key) / N | |
else missingFn(key, N) | |
} | |
} | |
/** Read key,value pairs from file */ | |
def datafile(name: String, sep: Char = '\t'): Vector[Vector[String]] = | |
(Source fromFile s"../ngrams/$name").getLines().toVector map (_ split sep) map (_.toVector) | |
/** Estimate the probability of an unknown word */ | |
def avoidLongWords(word: String, N: Double) = 10.0 / (N * (10 pow word.length)) | |
val N = 1024908267229L // Number of tokens in corpus | |
val Pw = Pdist(datafile("count_1w.txt"), Some(N), Some(avoidLongWords _)) | |
//// segment2: second version, with bigram counts, (p. 226-227) //// | |
/** The conditional probability P(word | previous-word) */ | |
def cPw(word: String, prev: String): Double = { | |
try { | |
(P2w get s"$prev $word") / (Pw get prev).toDouble | |
} catch { | |
case _: NoSuchElementException => Pw(word) | |
} | |
} | |
val P2w = Pdist(datafile("count_2w.txt"), Some(N)) | |
/** Return (log P(words), words), where words is the best segmentation. */ | |
lazy val segment2: (String, Option[String]) => (Double, Seq[String]) = Memo2 { (text, prevOpt) => | |
if (text.isEmpty) (0.0, Seq.empty[String]) | |
else { | |
val prev = prevOpt getOrElse "<S>" | |
val candidates = splits(text) map { case (first, rem) => | |
combine(log10(cPw(first, prev)), first, segment2(rem, first)) | |
} | |
candidates maxBy (_._1) | |
} | |
} | |
def segment2(text: String, prev: String = "<S>"): (Double, Seq[String]) = segment2(text, Some(prev)) | |
/** Combine first and rem results into one (probability, words) pair. */ | |
def combine(Pfirst: Double, first: String, PremRem: (Double, Seq[String])): (Double, Seq[String]) = | |
(Pfirst + PremRem._1, first +: PremRem._2) | |
//// Secret Codes (p. 228-230) //// | |
/** Encode a message with a substitution cipher. */ | |
def encode(msg: String, key0: String) = { | |
val key = ul(key0) | |
val alphabet52 = ul(alphabet) | |
msg map (ch => alphabet52 index ch map key.apply getOrElse ch) | |
} | |
def ul(text: String) = text.toUpperCase ++ text.toLowerCase | |
val alphabet = "abcdefghijklmnopqrstuvwxyz" | |
/** Encode a message with a shift (Caesar) cipher. */ | |
def shift(msg: String, n: Int = 13) = encode(msg, (alphabet substring n) ++ alphabet.substring(0, n)) | |
/** The Naive Bayes probability of a sequence of words. */ | |
def logPwords(text: String): Double = logPwords(allwords(text)) | |
def logPwords(words: Seq[String]): Double = (words map (w => log10(Pw(w)))).sum | |
/** Return a list of alphabetic words in text, lowercase. */ | |
def allwords(text: String) = ("[a-z]+".r findAllIn text.toLowerCase).toVector | |
/** Find the best decoding of a message encoded with a shift cipher. */ | |
def decodeShift(msg: String) = { | |
val candidates = (0 until alphabet.length) map (n => shift(msg, n)) | |
candidates maxBy logPwords | |
} | |
/** Encode with a shift (Caesar) cipher, yielding only letters [a-z]. */ | |
def shift2(msg: String, n: Int = 13) = shift(justLetters(msg), n) | |
/** Lowercase text and remove all characters expect [a-z]. */ | |
def justLetters(text: String) = "[^a-z]".r.replaceAllIn(text.toLowerCase, "") | |
/** Decode a message encoded with a shift ciper, with no spaces. */ | |
def decodeShift2(msg: String) = { | |
val candidates = (0 until alphabet.length) map (n => segment2(shift(msg, n))) | |
val (_, words) = candidates maxBy (_._1) | |
words mkString " " | |
} | |
//// General substitution cipher (p. 231-233) //// | |
/** The log-probability of text using a letter 3-gram model. */ | |
def logP3letters(text: String) = (ngrams(text, 3) map (g => log10(P3l(g)))).sum | |
/** List all the (overlapping) ngrams in a sequence. */ | |
def ngrams(seq: String, n: Int) = (0 to (seq.length - n)) map (i => seq.substring(i, i + n)) | |
val P3l = Pdist(datafile("count_3l.txt")) | |
val P2l = Pdist(datafile("count_2l.txt")) | |
/** Search for an x that miximizes f(x), considering neighbors(x). */ | |
def hillclimb( | |
x0: String, | |
f: String => Double, | |
neighbors: String => TraversableOnce[String], | |
steps: Int = 10000) | |
= { | |
var x = x0 | |
var fx = f(x) | |
var neighborhood = neighbors(x).toIterator | |
(0 until steps) foreach { i => | |
val x2 = neighborhood.next() | |
val fx2 = f(x2) | |
if (fx2 > fx) { | |
x = x2 | |
fx = fx2 | |
neighborhood = neighbors(x).toIterator | |
} | |
} | |
x | |
} | |
/** Decode a substitution cipher with random restart hillclimbing. */ | |
def decodeSubst(msg0: String, steps: Int = 4000, restarts: Int = 20) = { | |
val msg = allwords(msg0) mkString "" | |
val candidates = (0 until restarts) map { r => | |
hillclimb( | |
encode(msg, alphabet.shuffle() mkString ""), | |
logP3letters, | |
neighboringMsgs, | |
steps) | |
} | |
val (_, words) = candidates map (x => segment2(x)) maxBy (_._1) | |
words mkString " " | |
} | |
/** Generate nearby keys, hopefully better ones. */ | |
def neighboringMsgs(msg: String): TraversableOnce[String] = { | |
def swap(a: Char, b: Char) = ( | |
msg.iterator | |
map (_.toInt) | |
map (i => if (i == a) -1 else i) | |
map (i => if (i == b) a else i) | |
map (i => if (i == -1) b else i) | |
map (_.toChar) | |
mkString "" | |
) | |
val smallest = ngrams(msg, 2) map (n => (P2l(n), n)) sortBy (_._1) take 20 map (_._2) | |
val bigramSwaps = smallest flatMap { bigram => | |
val List(b1, b2) = bigram.toList | |
alphabet flatMap { | |
case c if b1 == b2 => | |
if (P2l("" + c + c) > P2l(bigram)) Some(swap(c, b1)) else None | |
case c if P2l("" + c + b2) > P2l(bigram) => Some(swap(c, b1)) | |
case c if P2l("" + b1 + c) > P2l(bigram) => Some(swap(c, b2)) | |
case _ => None | |
} | |
} | |
bigramSwaps.toStream ++ | |
(Stream continually swap(alphabet.randomChoice().get, alphabet.randomChoice().get)) | |
} | |
} | |
case class Memo1[T1, R](f: T1 => R) extends (T1 => R) { | |
private val cache = mutable.Map.empty[T1, R] | |
def apply(v1: T1) = cache getOrElseUpdate(v1, f(v1)) | |
} | |
case class Memo2[T1, T2, R](f: (T1, T2) => R) extends ((T1, T2) => R) { | |
private val cache = mutable.Map.empty[(T1, T2), R] | |
def apply(v1: T1, v2: T2) = cache getOrElseUpdate((v1, v2), f(v1, v2)) | |
} | |
} | |
package object ngrams { | |
@inline implicit class StringW(private val s: String) extends AnyVal { | |
@inline def toLongOpt: Option[Long] = Try(s.toLong).toOption | |
@inline def getOrElse(idx: Int, default: =>Char) = | |
if (idx >= 0 && idx < s.length) s(idx) else default | |
@inline def index(ch: Char) = s indexOf ch match { | |
case -1 => None | |
case i => Some(i) | |
} | |
@inline def shuffle(rand: Random = Random): String = s.toTraversable shuffle rand mkString "" | |
@inline def randomChoice(rand: Random = Random) = s.toTraversable randomChoice rand | |
@inline def splitAtOpt(n: Int) = | |
s.toVector splitAtOpt n map { case (a, b) => (a mkString "", b mkString "")} | |
} | |
@inline implicit class IntW(private val i: Int) extends AnyVal { | |
@inline def pow(exp: Int) = math.pow(i.toDouble, exp.toDouble) | |
} | |
@inline implicit class RandomW(private val r: Random) extends AnyVal { | |
def withSeed(s: Long): Random = { r setSeed s; r } | |
} | |
@inline implicit class TraversableOnceW[A, CC[X] <: TraversableOnce[X]](private val xs: CC[A]) | |
extends AnyVal | |
def shuffle(rand: Random = Random)(implicit bf: CanBuildFrom[CC[A], A, CC[A]]) = | |
rand.shuffle(xs)(bf) | |
def randomChoice[A1 >: A](rand: Random = Random) = (rand shuffle xs.toTraversable).headOption | |
} | |
@inline implicit class AnyW[A](private val x: A) extends AnyVal { | |
@inline def |>[B](f: A => B): B = f(x) | |
@inline def maybe[B](pf: A =?> B): Option[B] = pf lift x | |
} | |
@inline implicit class VectorW[A](private val xs: Vector[A]) extends AnyVal { | |
@inline def splitAtLast: (Vector[A], Option[A]) = { | |
val idx = xs.length - 1 | |
(xs take idx, (xs drop idx).headOption) | |
} | |
@inline def add[A1 >: A](x: A1): Vector[A1] = xs :+ x | |
@inline def addOpt[A1 >: A](x: Option[A1]): Vector[A1] = xs ++ x.toVector | |
@inline def splitAtOpt(n: Int): Option[(Vector[A], Vector[A])] = | |
if (xs.length >= n) Some(xs splitAt n) else None | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package ngrams | |
import org.specs2.matcher.MatchResult | |
import org.specs2.mutable.{Specification, Tables} | |
class NgramsSpec extends Specification with Tables { | |
import Ngrams._ | |
def expect[A](matchResults: MatchResult[A]*): MatchResult[A] = matchResults reduce (_ and _) | |
val declOfIndep = "when in the course of human events it becomes necessary" | |
val taleOf2Cities = "it was the best of times it was the worst of times it was the age of wisdom " + | |
"it was the age of foolishness" | |
val metamorphosis = "as gregor samsa awoke one morning from uneasy dreams he found himself " + | |
"transformed in his bed into a gigantic insect" | |
def hobbit(sitdown: String) = s"in a hole in the ground there lived a hobbit not a nasty dirty " + | |
s"wet hole filled with the ends of worms and an oozy smell nor yet a dry bare sandy hole with " + | |
s"nothing in it to $sitdown on or to eat it was a hobbit hole and that means comfort" | |
val h2g2 = "far out in the uncharted backwaters of the unfashionable end of the western spiral " + | |
"arm of the galaxy lies a small un regarded yellow sun" | |
val starwars = "general kenobi years ago you served my father in the clone wars now he begs you " + | |
"to help him in his struggle against the empire" | |
"Pw" >> { | |
val text = declOfIndep filter (_ != ' ') | |
"first" | "P(first)" | "P(rem)" | "P(first) x P(rem)" | | |
"w" ! 2E-4 ! 2E-33 ! 4E-37 | | |
"wh" ! 5E-6 ! 6E-33 ! 3E-38 | | |
"whe" ! 3E-7 ! 3E-32 ! 9E-39 | | |
"when" ! 6E-4 ! 7E-29 ! 4E-32 | | |
"wheni" ! 1E-16 ! 3E-30 ! 3E-46 | | |
"whenin" ! 1E-17 ! 8E-27 ! 8E-44 |> { | |
(first, Pfirst, Prem, Pproduct) => { | |
val rem = text drop first.length | |
expect( | |
Pfirst * Prem must be ~(Pproduct, Pproduct), | |
Pw(first) must be ~(Pfirst, Pfirst), | |
Pw(rem) must be ~(Prem, Prem) | |
) | |
} | |
} | |
} | |
def segmentStrs(hobbit: String) = | |
"choose spain" :: | |
"this is a test" :: | |
declOfIndep :: | |
"who represents" :: | |
"experts exchange" :: | |
"speed of art" :: | |
"now is the time for all good" :: | |
"it is a truth universally acknowledged" :: | |
taleOf2Cities :: | |
metamorphosis :: | |
hobbit :: | |
h2g2 :: | |
Nil | |
def runSegment(segment: String => Seq[String]) = | |
(text: String) => segment(text filter (_ != ' ')) === (text split ' ').toSeq | |
"segment" >> foreach(segmentStrs(hobbit("sitdown" )))(runSegment(segment)) | |
"segment2" >> foreach(segmentStrs(hobbit("sit down")))(runSegment((s: String) => segment2(s)._2)) | |
"shift" >> { | |
"secret" >> { | |
shift("Listen, do you want to know a secret?") ==== "Yvfgra, qb lbh jnag gb xabj n frperg?" | |
} | |
"hal9000" >> (shift("HAL 9000 xyz", 1) ==== "IBM 9000 yza") | |
} | |
"decodeShift" >> { | |
decodeShift("Yvfgra, qb lbh jnag gb xabj n frperg?") ==== "Listen, do you want to know a secret?" | |
} | |
"decodeShift2" >> foreach ( | |
"listen do you want to know a secret" :: | |
"rosebud" :: | |
"is it safe" :: | |
"whats the frequency kenneth" :: | |
starwars :: | |
Nil)(text => decodeShift2(shift2(text)) ==== text) | |
"decodeSubst" >> { | |
"secret code breaker" >> pending { | |
val msg = "DSDRO XFIJV DIYSB ANQAL TAIMX VBDMB GASSA QRTRT CGGXJ MMTQC IPJSB AQPDR SDIMS " + | |
"DUAMB CQCMS AQDRS DMRJN SBAGC IYTCY ASBCS MQXKS CICGX RSRCQ ACOGA SJPAS AQHDI ASBAK GCDIS " + | |
"AWSJN CMDKB AQHAR RCYAE" | |
val dec = "it is by knowing the frequency which letters usually occur and other distinctive " + | |
"characteristics of the language that crypt analysts are able to determine the plain text " + | |
"of a cipher message j" | |
dec ==== decodeSubst(msg) | |
} | |
"ww1 german spy" >> pending { | |
val msg = "NKDIF SERLJ MIBFK FKDLV NQIBR HLCJU KFTFL KSTEN YQNDQ NTTEB TTENM QLJFS NOSUM " + | |
"MLQTL CTENC QNKRE BTTBR HKLQT ELCBQ QBSFS KLTML SSFAI NLKBR RLUKT LCJUK FTFLK FKSUC CFRFN " + | |
"KRYXB" | |
val dec = "english complaining over lack of munitions they regret that the promised support " + | |
"of the french attack north of arras is not possible on account of munition insufficiency wa" | |
dec ==== decodeSubst(msg) | |
} | |
"1992 KGB to CIA" >> pending { | |
val msg = "CNLGV QVELH WTTAI LEHOT WEQVP CEBTQ FJNPP EDMFM LFCYF SQFSP NDHQF OEUTNPPTPP CTDQN" + | |
" IFSQD TWHTN HHLFJ OLFSD HQFED HEGNQ TWVNQ HTNHH LFJWE BBITS PTHDT XQQFO EUTYF SLFJE " + | |
"DEFDN IFSQG NLNGN PCTTQ EDOED FGQFI TLXNI" | |
val dec = "march third week bridge with smile to pass info from you to us and to give " + | |
"assessment about new dead drop ground to indicate what dead drop will be used next to give" + | |
" your opinion about caracas meeting in october xab" | |
dec ==== decodeSubst(msg) | |
} | |
"1943 german u-boat" >> pending { | |
val msg = "WLJIU JYBRK PWFPF IJQSK PWRSS WEPTM MJRBS BJIRA BASPP IHBGP RWMWQ SOPSV PPIMJ " + | |
"BISUF WIFOT HWBIS WBIQW FBJRB GPILP PXLPM SAJQQ PMJQS RJASW LSBLW GBHMJ QSWIL PXWOL" | |
val dec = "a cony ov is headed northeast take up positions fifteen miles apart between point " + | |
"yd and bu maintain radio silence except for reports of tactical importance x abc" | |
dec ==== decodeSubst(msg) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment