@tomtung
Created October 23, 2012 08:13
Forward-backward Algorithm For Part-of-speech Tagging
// Example: distinguishing between vowels and consonants
val corpus =
Seq( // I swear these are all English words
"antidisestablishmentarian", "ablutophobia", "automatonophobia", "autotonsorialist",
"arachibutyrophobia", "automysophobia", "batrachophagous", "ballistocardiograph",
"blandiloquence", "brachydactylous", "bathythermograph", "cacodemomania",
"caesaropapism", "catapedamania", "cheiloproclitic", "chronosynchronicity",
"dendrochronology", "deorsumversion", "dermatoglyphics", "dolichocephalic",
"dysmorphophobia", "eellogofusciouhipoppokunurious", "electroencephalograph", "epiphenomenalism",
"electrodynamometer", "ephemeromorph", "floccinaucinihilipilification", "forisfamiliate",
"flagelliferous", "fibriophobia", "frumentaceous", "flibbertigibbet",
"gynotikolobomassophile", "germanophilia", "gluconeogenesis", "graminivorous",
"grammaticaster", "haematogenesis", "haussmannize", "helioseismology",
"honorificabilitudinity", "hydrometeorology", "hypercatalectic", "iatromathematics",
"ichthyophagous", "immunopathology", "interramification", "incomprehensibleness",
"juglandaceous", "japanophobia", "juglandaceous", "japanophilia",
"jaculiferous", "kakorrhaphiophobia", "kephalonomancy", "katathermometer",
"kosmikophobia", "keraunophobia", "logizomechanophobia", "libanophorous",
"lepidopterology", "lautenclavicymbel", "latitudinarianism", "medomalacuphobia",
"macrocephalous", "margaritomancy", "maschalephidrosis", "micropalaeontology",
"nucleomituphobia", "neopharmaphobia", "nyctohylophobia", "nigroglobulate",
"neurophysiology", "oneirogmophobia", "ophthalmophobia", "otorhinolaryngology",
"orphanotrophism", "ophthalmoscope", "paraskavedekatriaphobia", "palaeoanthropology",
"penecontemporaneous", "pneumatophilosophy", "podobromhidrosis", "quadragesimarian",
"quomodocunquize", "quoddamodotative", "quinquagenarian", "quasquicentennial",
"rhabdophobia", "radiometeorograph", "rhinotillexomania", "representationalism",
"radappertization", "siderodromophobia", "spermatophobia", "sacramentarianism",
"spectroheliokinematograph", "sphygmomanometer", "telephonophobia", "theologicophobia",
"triskaidekaphobia", "thanatognomonic", "theophilanthropism", "triboluminescence",
"ultramicroscope", "umbraculiform", "ubiquitarianism", "unconsentaneous",
"utilitarianism", "venustraphobia", "verminophobia", "vitricophobic",
"volumenometer", "voicespondence", "walloonphobia", "weltanschauung",
"whigmaleerie", "weatherometer", "whippoorwill", "xenodocheionology",
"xanthophobia", "xenoglossophobia", "xeroradiography", "xanthocyanopsy",
"yogibogeybox", "yarborough", "yarnwindle", "yeomanette",
"yttriferous", "zemmiphobia", "zalambdodont", "zeusophobia", "zenzizenzizenzic"
).map(_.iterator.map(_.toString).toIndexedSeq)
val nTag = 2 // 2 tags: vowel or consonant
val (model, prob) = computeForwardBackward(corpus, nTag)
println("Best corpus probability = 2^" + prob)
model.b.zipWithIndex.drop(1).foreach({
case (words, i) => {
println("P(Letter|Tag" + i + "):")
words.zipWithIndex.map({
case (logP, wordIdx) => (logP, model.words(wordIdx))
}).sortBy(-_._1).foreach({
case (logP, word) =>
println(word + ":\t" + math.pow(2, logP))
})
println()
}
})
/*
Output:
P(Letter|Tag1):
n: 0.09837982985409412
r: 0.09747466863106026
h: 0.08963206738930135
p: 0.0811569848839846
t: 0.07857387790975762
l: 0.07634134515642844
...
P(Letter|Tag2):
o: 0.284764667819875
i: 0.21201862830500298
a: 0.17999825418893892
e: 0.16713940843565922
u: 0.05067696417951677
y: 0.04177186447424547
...
*/
corpus.foreach(x => {
println(x.mkString)
println(Model.computePosTags(x, model)._2.mkString)
})
/*
Output:
antidisestablishmentarian
2112121211211211121121221
ablutophobia
211212112121
automatonophobia
1212121212112121
autotonsorialist
1212121121221211
arachibutyrophobia
212112121212112121
...
*/
import collection.mutable
/**
* Compute the base-2 logarithm
*/
def log2(d: Double) = math.log(d) / math.log(2.0)
/**
* Given l1 = log2(x) and l2 = log2(y), compute (an approximation of) log2(x + y)
*/
def logPlus(l1: Double, l2: Double) = {
assert(!l1.isNaN && !l2.isNaN)
val l_larger = math.max(l1, l2)
val l_smaller = math.min(l1, l2)
if (l_smaller.isNegInfinity) l_larger
else if (l_larger - l_smaller > 32) l_larger
else l_larger + log2(1 + math.pow(2, l_smaller - l_larger))
}
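// A minimal sanity check for logPlus (illustrative only, not part of the original gist):
// in base-2 log space, adding two probabilities of 0.25 should give 0.5,
// and adding a zero probability (negative infinity) should leave the other value unchanged.
assert(math.abs(logPlus(log2(0.25), log2(0.25)) - log2(0.5)) < 1e-12)
assert(logPlus(Double.NegativeInfinity, log2(0.5)) == log2(0.5))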
/**
* Normalize the given log vector so that the probabilities 2^element sum up to 1.
*/
def normalizeLog(log_vec: IndexedSeq[Double]) = {
val log_sum = log_vec.reduce(logPlus)
log_vec.map(_ - log_sum)
}
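// Illustrative check (not from the original gist): normalizing the base-2 log vector
// (0, 0, 1), i.e. unnormalized weights (1, 1, 2), should yield probabilities (0.25, 0.25, 0.5).
assert(
normalizeLog(IndexedSeq(0.0, 0.0, 1.0)).map(math.pow(2, _))
.zip(Seq(0.25, 0.25, 0.5))
.forall({ case (p, expected) => math.abs(p - expected) < 1e-12 })
)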
/**
* Create a random log-probability vector: the probabilities 2^element are positive and sum up to 1
*/
def randomLogProbVector(size: Int) =
normalizeLog(IndexedSeq.fill(size)(util.Random.nextDouble()))
/**
* Create a random log-probability matrix: for each row, the probabilities 2^element sum up to 1
*/
def randomProbLogMatrix(nRows: Int, nColumns: Int) =
IndexedSeq.fill(nRows)(randomLogProbVector(nColumns))
/**
* Parameters for part-of-speech tagging
* @param words The vocabulary.
* The index of each word corresponds to the word index in b
* @param nTag Number of distinct tags
* @param t t[tag1, tag2] = log P(tag2|tag1), i.e. bi-gram model
* @param b b[tag, word] = log P(word|tag)
*/
case class Model(words: IndexedSeq[String], nTag: Int,
t: IndexedSeq[IndexedSeq[Double]],
b: IndexedSeq[IndexedSeq[Double]]) {
assert(words.distinct.size == words.size)
lazy val wordToIndex = words.zipWithIndex.toMap
/**
* Tags are indexed from 1 to nTag.
* 0 represents the start and the end of a tag sequence.
*/
val tags = 1 to nTag
def params = (words, nTag, t, b)
}
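// An illustrative toy model (hypothetical, not part of the original gist) showing the
// parameter layout: with nTag = 2 and a 2-word vocabulary, t is a 3x3 matrix whose
// row/column 0 stand for the start/end of a sequence, and b is a 3-row matrix whose
// row 0 is an unused placeholder. All probabilities below are made up.
val toyModel = Model(
words = IndexedSeq("a", "b"), nTag = 2,
t = IndexedSeq(
IndexedSeq(Double.NaN, log2(0.5), log2(0.5)), // log P(tag | start)
IndexedSeq(log2(0.2), log2(0.4), log2(0.4)), // log P(end | 1), P(1 | 1), P(2 | 1)
IndexedSeq(log2(0.2), log2(0.4), log2(0.4))), // log P(end | 2), P(1 | 2), P(2 | 2)
b = IndexedSeq(
IndexedSeq(Double.NaN, Double.NaN),
IndexedSeq(log2(0.9), log2(0.1)), // tag 1 mostly emits "a"
IndexedSeq(log2(0.1), log2(0.9)))) // tag 2 mostly emits "b"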
object Model {
/**
* Create a model with random parameters for random restarts.
*/
def newRandomModel(words: IndexedSeq[String], nTag: Int) =
Model(words, nTag,
(Double.NaN +: randomLogProbVector(nTag)) +: randomProbLogMatrix(nTag, nTag + 1),
mutable.IndexedSeq.fill(words.size)(Double.NaN) +: randomProbLogMatrix(nTag, words.size))
/**
* Construct a lattice and compute the best part-of-speech tagging using Viterbi decoding
* @return The lattice and the best path
*/
def computePosTags(example: IndexedSeq[Int], model: Model) = {
val (_, nTag, t, b) = model.params
def newLattice[T](v: T) = IndexedSeq.fill(example.size)(
mutable.IndexedSeq.fill(nTag + 1)(v)
)
val p = newLattice(Double.NaN)
val track = newLattice(0)
for (i <- 0 until example.size; tag <- model.tags) {
if (i == 0) {
p(i)(tag) = t(0)(tag) + b(tag)(example(i))
}
else {
val (prevP, prevTag) = model.tags.map(prev_tag =>
p(i - 1)(prev_tag) + t(prev_tag)(tag)
).zip(model.tags).maxBy(_._1)
p(i)(tag) = prevP + b(tag)(example(i))
track(i)(tag) = prevTag
}
}
val path = {
val (_, lastTag) = p(example.size - 1).zipWithIndex.drop(1).maxBy(_._1)
def buildPath(i: Int, tag: Int, accu: List[Int] = Nil): IndexedSeq[Int] =
if (i == 0) (tag :: accu).toIndexedSeq
else buildPath(i - 1, track(i)(tag), tag :: accu)
buildPath(example.size - 1, lastTag)
}
(p.map(_.toIndexedSeq), path)
}
/**
* Convenience overload that takes the example as a sequence of words (rather than word
* indices) and delegates to the index-based version. The otherwise-unused ClassManifest
* context bound only gives this overload a distinct signature after type erasure.
* @return The lattice and the best path
*/
def computePosTags[X: ClassManifest](example: IndexedSeq[String],
model: Model): (IndexedSeq[IndexedSeq[Double]], IndexedSeq[Int]) =
computePosTags(example.map(model.wordToIndex), model)
/**
* Construct a lattice and compute the α values (forward)
* @return α and the log probability of the example
*/
def computeΑ(example: IndexedSeq[Int], model: Model) = {
val (_, nTag, t, b) = model.params
val α = IndexedSeq.fill(example.size)(
mutable.IndexedSeq.fill(nTag + 1)(Double.NaN)
)
for (i <- 0 until example.size; tag <- model.tags) {
α(i)(tag) =
if (i == 0)
t(0)(tag) + b(tag)(example(i))
else
model.tags.map(prev_tag =>
α(i - 1)(prev_tag) + t(prev_tag)(tag)
).reduce(logPlus) + b(tag)(example(i))
}
val α_end = (model.tags).map(tag => α(example.size - 1)(tag) + t(tag)(0)).reduce(logPlus)
(α.map(_.toIndexedSeq), α_end)
}
/**
* Construct a lattice and compute the β values (backward)
* @return β and the log probability of the example
*/
def computeΒ(example: IndexedSeq[Int], model: Model) = {
val (_, nTag, t, b) = model.params
val β = IndexedSeq.fill(example.size)(
mutable.IndexedSeq.fill(nTag + 1)(Double.NaN)
)
for (i <- (example.size - 1) to 0 by -1; tag <- model.tags) {
β(i)(tag) =
if (i == example.size - 1)
t(tag)(0)
else model.tags.map(next_tag =>
t(tag)(next_tag) + b(next_tag)(example(i + 1)) + β(i + 1)(next_tag)
).reduce(logPlus)
}
val β_start = (model.tags).map(tag => t(0)(tag) + b(tag)(example(0)) + β(0)(tag)).reduce(logPlus)
(β.map(_.toIndexedSeq), β_start)
}
}
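// Illustrative sanity checks against the toy model above (hypothetical, not part of the
// original gist): the forward and backward passes must assign an example the same total
// probability, and Viterbi decoding should simply follow the toy emission preferences
// (tag 1 for "a", tag 2 for "b").
{
val ex = IndexedSeq(0, 1, 0) // word indices of "a", "b", "a"
val (_, logPForward) = Model.computeΑ(ex, toyModel)
val (_, logPBackward) = Model.computeΒ(ex, toyModel)
assert(math.abs(logPForward - logPBackward) < 1e-9)
assert(Model.computePosTags(IndexedSeq("a", "b", "a"), toyModel)._2 == IndexedSeq(1, 2, 1))
}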
/**
* The tail-recursive EM loop: repeatedly re-estimates the model until isConverged holds. Called once per random restart.
*/
@annotation.tailrec
def computeForwardBackwardImpl(indexedCorpus: Seq[IndexedSeq[Int]],
model: Model, nIter: Int,
prevLogJointProb: Double,
isConverged: (Int, Double, Double) => Boolean): (Model, Double) = {
import Model._
val (words, nTag, t, b) = model.params
val ex_α_β_logProb = indexedCorpus.map(ex => {
val (α, α_end) = computeΑ(ex, model)
val (β, β_start) = computeΒ(ex, model)
assert(math.abs(α_end - β_start) < 1e-6) // forward and backward total log probabilities must agree
(ex, α, β, α_end)
})
val logJointProb = ex_α_β_logProb.map(_._4).reduce(_ + _)
if (nIter != 0 && isConverged(nIter, prevLogJointProb, logJointProb))
(model, logJointProb)
else {
val new_model = {
// Container for collecting partial counts for t
val c_t = IndexedSeq.fill(nTag + 1)(mutable.IndexedSeq.fill(nTag + 1)(Double.NegativeInfinity))
// Container for collecting partial counts for b
val c_b =
mutable.IndexedSeq.fill(words.size)(Double.NaN) +:
IndexedSeq.fill(nTag)(mutable.IndexedSeq.fill(words.size)(Double.NegativeInfinity))
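// The partial counts below are the standard Baum-Welch (EM) expectations, accumulated
// directly in log space via logPlus. For an example with total log probability logProb:
// expected #(tag1 -> tag2) sums 2^(α(i)(tag1) + t(tag1)(tag2) + b(tag2)(ex(i + 1)) + β(i + 1)(tag2) - logProb),
// expected #(tag emits word) sums 2^(α(i)(tag) + β(i)(tag) - logProb).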
ex_α_β_logProb.foreach({
case (ex, α, β, logProb) =>
// For each example, collect partial counts for t
for (tag1 <- model.tags) {
c_t(0)(tag1) =
logPlus(c_t(0)(tag1),
t(0)(tag1) + b(tag1)(ex(0)) + β(0)(tag1) - logProb)
c_t(tag1)(0) =
logPlus(c_t(tag1)(0),
α(ex.size - 1)(tag1) + t(tag1)(0) - logProb)
for (i <- 0 to ex.size - 2; tag2 <- model.tags) {
c_t(tag1)(tag2) =
logPlus(c_t(tag1)(tag2),
α(i)(tag1) + t(tag1)(tag2) + b(tag2)(ex(i + 1)) + β(i + 1)(tag2) - logProb)
}
}
// For each example, collect partial counts for b
for (tag <- model.tags; i <- 0 until ex.size) {
c_b(tag)(ex(i)) =
logPlus(c_b(tag)(ex(i)),
α(i)(tag) + β(i)(tag) - logProb)
}
})
// Compute the revised t and b by normalizing the partial counts
val new_t = (0.0 +: normalizeLog(c_t.head.drop(1))) +: c_t.tail.map(normalizeLog)
val new_b = mutable.IndexedSeq.fill(words.size)(Double.NaN) +: c_b.tail.map(normalizeLog)
Model(words, nTag, new_t, new_b)
}
computeForwardBackwardImpl(indexedCorpus, new_model, nIter + 1, logJointProb, isConverged)
}
}
/**
* Compute parameters for unsupervised POS tagging
* @param corpus A collection of training examples
* @param nTag Number of tags
* @param isConverged A function for testing convergence.
* Its three parameters are: the index of the current iteration,
* the log training probability from the previous iteration, and
* the log training probability of the current iteration
* @param nRandomRestart Number of random restarts
* @param onEachRestartCompleted A function called asynchronously after each random restart completes.
* Its three parameters are: the current round of random restarts,
* the resulting model, and the log training probability
* @return The trained model and the log probability it assigns to corpus
*/
def computeForwardBackward(corpus: Seq[IndexedSeq[String]],
nTag: Int,
isConverged: (Int, Double, Double) => Boolean = (i, l0, l1) => l1 < l0 && (l1 / l0) > 0.99999,
nRandomRestart: Int = 10,
onEachRestartCompleted: (Int, Model, Double) => Unit = null) = {
import scala.actors.Futures._
val words = corpus.flatten.distinct.toIndexedSeq
val wordToIndex = words.zipWithIndex.toMap
val indexedCorpus = corpus.map(_.map(wordToIndex).toIndexedSeq)
val (bestModel, bestLogJointProb) = (1 to nRandomRestart).map(currRound => {
val initModel = Model.newRandomModel(words, nTag)
val (model, logJointProb) =
computeForwardBackwardImpl(
indexedCorpus, initModel, 0, Double.NegativeInfinity, isConverged)
if (onEachRestartCompleted != null) {
future {
onEachRestartCompleted(currRound, model, logJointProb)
}
}
(model, logJointProb)
}).maxBy(_._2)
(bestModel, bestLogJointProb)
}
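// A hypothetical alternative invocation (not part of the original gist): more random
// restarts, an explicit iteration cap, and a progress callback. The convergence test and
// callback below are made-up examples of the expected signatures.
// val (m, p) = computeForwardBackward(corpus, nTag,
//   isConverged = (iter, prevLogP, currLogP) => iter >= 200 || math.abs(currLogP - prevLogP) < 1e-4,
//   nRandomRestart = 20,
//   onEachRestartCompleted = (round, restartModel, logP) => println("restart " + round + ": log prob = " + logP))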