sorami/julia-advent-calendar-2015.ipynb

## julia-advent-calendar-2015.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic Modeling with Julia and Masashi Sada\n",
    "\n",
    "[さだとJuliaでLDA - Qiita](http://qiita.com/sorami/items/6c9eef251f82b424bf68)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO: Updating METADATA...\n",
      "INFO: Computing changes...\n",
      "INFO: No packages to install, update or remove\n",
      "INFO: Nothing to be done\n"
     ]
    }
   ],
   "source": [
    "Pkg.update()\n",
    "Pkg.add(\"MeCab\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "using MeCab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "LoadError: failed to create tagger\nwhile loading In[3], in expression starting on line 1",
     "output_type": "error",
     "traceback": [
      "LoadError: failed to create tagger\nwhile loading In[3], in expression starting on line 1",
      "",
      " in call at /Users/sorami/.julia/v0.4/MeCab/src/MeCab.jl:32",
      " in call at /Users/sorami/.julia/v0.4/MeCab/src/MeCab.jl:22"
     ]
    }
   ],
   "source": [
    "mecab = Mecab()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "→ For now, use *TinySegmenter.jl* instead."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Update (2015-12-27):** MeCab.jl was updated upstream — see [Merge pull request #11 from r9y9/library-ignore-path · chezou/MeCab.jl@0698ba9](https://github.com/chezou/MeCab.jl/commit/0698ba9653afe97a00d1af07f80494f430460fec). After running `Pkg.update()`, the tagger can be created:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Pkg.update()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "using MeCab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MeCab.Mecab(Ptr{Void} @0x00007f8a8bf24780)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mecab = Mecab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8-element Array{MeCab.MecabNode,1}:\n",
       " MeCab.MecabNode(\"すもも\",\"名詞,一般,*,*,*,*,すもも,スモモ,スモモ\")\n",
       " MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\")       \n",
       " MeCab.MecabNode(\"もも\",\"名詞,一般,*,*,*,*,もも,モモ,モモ\")    \n",
       " MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\")       \n",
       " MeCab.MecabNode(\"もも\",\"名詞,一般,*,*,*,*,もも,モモ,モモ\")    \n",
       " MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\")       \n",
       " MeCab.MecabNode(\"の\",\"助詞,連体化,*,*,*,*,の,ノ,ノ\")       \n",
       " MeCab.MecabNode(\"うち\",\"名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ\")"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results = parse(mecab, \"すももももももももものうち\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO: Updating METADATA...\n",
      "INFO: Computing changes...\n",
      "INFO: No packages to install, update or remove\n",
      "INFO: Nothing to be done\n",
      "INFO: Nothing to be done\n",
      "INFO: Nothing to be done\n",
      "INFO: Nothing to be done\n"
     ]
    }
   ],
   "source": [
    "Pkg.update()\n",
    "Pkg.add(\"DataStructures\")\n",
    "Pkg.add(\"TinySegmenter\")\n",
    "Pkg.add(\"StatsBase\")\n",
    "Pkg.add(\"Formatting\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "using DataStructures\n",
    "using TinySegmenter\n",
    "using StatsBase\n",
    "using Formatting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "srand(2015);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ntopics = 5;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "datadir = \"./data/lyrics/\";"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One input file (song lyrics) together with its current topic assignment.\n",
    "type Document\n",
    "    id::Int                                 # index of the file in readdir(datadir)\n",
    "    length::Int                             # token count; decremented/incremented while sampling\n",
    "    words::Vector{UTF8String}               # tokens remaining after stop-word removal\n",
    "    topicids::Vector{Int}                   # topic of each token, parallel to `words`\n",
    "    topicidcount::DefaultDict{Int,Int,Int}  # topic id => number of tokens with that topic\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Set(UTF8String[\"「\",\"だっ\",\"よ\",\"う\",\"ね\",\"い\",\"、\",\"も\",\"一\",\")\"  …  \"」\",\"さ\",\"か\",\"なら\",\"な\",\"から\",\"だ\",\"なた\",\"　\",\"の\"])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Arbitrary, hand-picked tokens that are filtered out of every document\n",
    "# before topics are assigned (the duplicated \"い\" entry has been removed).\n",
    "stopwords = Set{UTF8String}([\"　\", \"の\", \"に\", \"を\", \"は\", \"て\", \"た\", \"が\",\n",
    "                                \" \", \"と\", \"も\", \"で\", \"ない\", \"あ\", \"な\",\n",
    "                                \")\", \"(\", \"、\", \"か\", \"し\", \"その\", \"い\", \"から\", \"だ\", \"ば\",\n",
    "                                \"よ\", \"だっ\", \"う\", \"よう\", \"ます\",\n",
    "                                \"へ\", \"なた\", \"一\", \"「\", \"」\", \"・\", \"お\", \"ね\", \"なっ\", \"…\",\n",
    "                                \"なら\", \"たら\", \"だけ\", \"てる\", \"たい\", \"や\", \"さ\", \"0\", \"。\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "corpus = Document[]\n",
    "\n",
    "# Build one Document per file found in `datadir`.\n",
    "let\n",
    "    for (docid, fname) in enumerate(readdir(datadir))\n",
    "        # joinpath avoids the doubled separator that \"$(datadir)/$(fname)\"\n",
    "        # produced when datadir already ends in \"/\".\n",
    "        open(joinpath(datadir, fname), \"r\") do fin\n",
    "            tokens = UTF8String[]\n",
    "            for line in eachline(fin)\n",
    "                append!(tokens, tokenize(strip(line)))\n",
    "            end\n",
    "\n",
    "            words = filter(w -> w ∉ stopwords, tokens)\n",
    "\n",
    "            doclength = length(words)\n",
    "            topicids = rand(1:ntopics, doclength) # initialize with random topics\n",
    "            topicidcount = DefaultDict(0, Dict{Int,Int}(counter(topicids)))\n",
    "            push!(corpus, Document(docid, doclength, words, topicids, topicidcount))\n",
    "        end\n",
    "    end\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "449"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "length(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "98"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus[99].length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{UTF8String,1}:\n",
       " \"笑っ\"\n",
       " \"君\" \n",
       " \"ため\"\n",
       " \"笑っ\"\n",
       " \"僕\" "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus[99].words[1:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Int64,1}:\n",
       " 5\n",
       " 1\n",
       " 2\n",
       " 1\n",
       " 1"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus[99].topicids[1:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataStructures.DefaultDict{Int64,Int64,Int64} with 5 entries:\n",
       "  4 => 20\n",
       "  2 => 13\n",
       "  3 => 24\n",
       "  5 => 18\n",
       "  1 => 23"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus[99].topicidcount"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Corpus-wide statistics for a single topic.\n",
    "# The dictionary uses `Int` (not a hard-coded `Int64`) so that it matches\n",
    "# `DefaultDict(UTF8String, Int, 0)` used to construct it and the `Int` fields of `Document`.\n",
    "type Topic\n",
    "    id::Int\n",
    "    count::Int                                  # total number of tokens assigned to this topic\n",
    "    wordcount::DefaultDict{UTF8String,Int,Int}  # word => number of its tokens assigned to this topic\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Start every topic with zero tokens and an empty (default 0) word tally.\n",
    "topics = Dict{Int64,Topic}()\n",
    "for k in 1:ntopics\n",
    "    emptytally = DefaultDict(UTF8String, Int, 0)\n",
    "    topics[k] = Topic(k, 0, emptytally)\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Seed each topic's totals from the random initial assignment.\n",
    "# NOTE: this adds to whatever is already in `topics`; rebuild `topics`\n",
    "# before running this cell a second time.\n",
    "for doc in corpus\n",
    "    for (w, k) in zip(doc.words, doc.topicids)\n",
    "        topic = topics[k]\n",
    "        topic.count += 1\n",
    "        topic.wordcount[w] += 1\n",
    "    end\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10202"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topics[1].count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Estimation by Gibbs Sampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11950"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recursively flatten nested arrays into one vector.\n",
    "# c.f. http://rosettacode.org/wiki/Flatten_a_list#Julia\n",
    "# Kept for any other code that calls it; no longer used for `vocabsize`.\n",
    "flat(A) = mapreduce(x -> isa(x, Array) ? flat(x) : x, vcat, [], A)\n",
    "\n",
    "# Vocabulary size = number of distinct tokens in the corpus. Filling a typed\n",
    "# Set directly avoids the repeated vcat copies made by flat().\n",
    "vocabulary = Set{UTF8String}()\n",
    "for doc in corpus\n",
    "    union!(vocabulary, doc.words)\n",
    "end\n",
    "vocabsize = length(vocabulary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.01"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "α = 0.01\n",
    "β = 0.01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Gibbs sampling: repeatedly resample the topic of every token given all\n",
    "# other assignments.\n",
    "# NOTE: `corpus` and `topics` are updated in place, so re-running this cell\n",
    "# continues from the current state instead of starting over.\n",
    "let\n",
    "    niterations = 1000      # arbitrary number of iterations\n",
    "    probs = zeros(ntopics)  # unnormalized topic weights, reused for every token\n",
    "\n",
    "    for _ in 1:niterations\n",
    "        for document in corpus\n",
    "            for (i, word) in enumerate(document.words)\n",
    "                # take the token's current assignment out of every count\n",
    "                topicid_current = document.topicids[i]\n",
    "                topics[topicid_current].count -= 1\n",
    "                topics[topicid_current].wordcount[word] -= 1\n",
    "                document.length -= 1\n",
    "                document.topicidcount[topicid_current] -= 1\n",
    "\n",
    "                # calc. weights for the topics, sample one.\n",
    "                # α smooths the document-topic counts and β the topic-word counts\n",
    "                # (the usual LDA notation; identical results while α == β).\n",
    "                for k in 1:ntopics\n",
    "                    topicprob = (document.topicidcount[k] + α) / (document.length + α*ntopics)\n",
    "                    wordprob = (topics[k].wordcount[word] + β) / (topics[k].count + β*vocabsize)\n",
    "                    probs[k] = topicprob * wordprob\n",
    "                end\n",
    "                topicid_new = sample(1:ntopics, WeightVec(probs))\n",
    "\n",
    "                # put the token back under the newly sampled topic\n",
    "                document.topicids[i] = topicid_new\n",
    "                topics[topicid_new].count += 1\n",
    "                topics[topicid_new].wordcount[word] += 1\n",
    "                document.length += 1\n",
    "                document.topicidcount[topicid_new] += 1\n",
    "            end\n",
    "        end\n",
    "    end\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 1\n",
      "0.0109\t彼\n",
      "0.0090\t娘\n",
      "0.0074\t1\n",
      "0.0066\t恋\n",
      "0.0054\t子供\n",
      "0.0054\t5\n",
      "0.0053\tたち\n",
      "0.0051\t4\n",
      "0.0049\t帰っ\n",
      "0.0049\t出来\n",
      "\n",
      "Topic 2\n",
      "0.0199\tこと\n",
      "0.0146\t生き\n",
      "0.0116\tそれ\n",
      "0.0103\tこの\n",
      "0.0094\tだろ\n",
      "0.0092\t人\n",
      "0.0085\t自分\n",
      "0.0083\t心\n",
      "0.0074\t忘れ\n",
      "0.0074\t何\n",
      "\n",
      "Topic 3\n",
      "0.0525\t君\n",
      "0.0320\t僕\n",
      "0.0092\tいる\n",
      "0.0090\t時\n",
      "0.0075\t愛\n",
      "0.0073\t町\n",
      "0.0069\t忘れ\n",
      "0.0065\t風\n",
      "0.0063\t人\n",
      "0.0059\t手\n",
      "\n",
      "Topic 4\n",
      "0.0159\t人\n",
      "0.0136\t私\n",
      "0.0134\tあなた\n",
      "0.0117\tこと\n",
      "0.0100\tある\n",
      "0.0091\t日\n",
      "0.0084\t夢\n",
      "0.0079\t花\n",
      "0.0064\t愛\n",
      "0.0062\t時\n",
      "\n",
      "Topic 5\n",
      "0.0133\tがんばらんば\n",
      "0.0079\tがんばれ\n",
      "0.0065\t明日\n",
      "0.0065\tBye\n",
      "0.0061\t何\n",
      "0.0056\t来\n",
      "0.0056\t日\n",
      "0.0051\tせ\n",
      "0.0049\t今\n",
      "0.0049\tある\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print, for each topic, its most frequent words (at most ten) together with\n",
    "# each word's share of the tokens assigned to that topic.\n",
    "for k in 1:ntopics\n",
    "    println(\"Topic $(k)\")\n",
    "    t = topics[k]\n",
    "    ranked = sort([(count / t.count, word) for (word, count) in t.wordcount], rev=true)\n",
    "    # a topic can hold fewer than ten distinct words; a fixed [1:10] slice\n",
    "    # would then raise a BoundsError\n",
    "    for (prob, word) in ranked[1:min(10, length(ranked))]\n",
    "        printfmtln(\"{:.4f}\\t{}\", prob, word)\n",
    "    end\n",
    "    println()\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\\- End"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.4.2",
   "language": "julia",
   "name": "julia-0.4"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.4.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Topic Modeling with Julia and Masashi Sada\n",
	"\n",
	"[さだとJuliaでLDA - Qiita](http://qiita.com/sorami/items/6c9eef251f82b424bf68)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"INFO: Updating METADATA...\n",
	"INFO: Computing changes...\n",
	"INFO: No packages to install, update or remove\n",
	"INFO: Nothing to be done\n"
	]
	}
	],
	"source": [
	"Pkg.update()\n",
	"Pkg.add(\"MeCab\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"using MeCab"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"ename": "LoadError",
	"evalue": "LoadError: failed to create tagger\nwhile loading In[3], in expression starting on line 1",
	"output_type": "error",
	"traceback": [
	"LoadError: failed to create tagger\nwhile loading In[3], in expression starting on line 1",
	"",
	" in call at /Users/sorami/.julia/v0.4/MeCab/src/MeCab.jl:32",
	" in call at /Users/sorami/.julia/v0.4/MeCab/src/MeCab.jl:22"
	]
	}
	],
	"source": [
	"mecab = Mecab()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"→ For now, use TinySegmenter.jl instead."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"---"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"**Update (2015-12-27):** MeCab.jl was updated upstream — see [Merge pull request #11 from r9y9/library-ignore-path · chezou/MeCab.jl@0698ba9](https://github.com/chezou/MeCab.jl/commit/0698ba9653afe97a00d1af07f80494f430460fec). After running `Pkg.update()`, the tagger can be created:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"Pkg.update()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"using MeCab"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"MeCab.Mecab(Ptr{Void} @0x00007f8a8bf24780)"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"mecab = Mecab()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"8-element Array{MeCab.MecabNode,1}:\n",
	" MeCab.MecabNode(\"すもも\",\"名詞,一般,*,*,*,*,すもも,スモモ,スモモ\")\n",
	" MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\")       \n",
	" MeCab.MecabNode(\"もも\",\"名詞,一般,*,*,*,*,もも,モモ,モモ\")    \n",
	" MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\")       \n",
	" MeCab.MecabNode(\"もも\",\"名詞,一般,*,*,*,*,もも,モモ,モモ\")    \n",
	" MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\")       \n",
	" MeCab.MecabNode(\"の\",\"助詞,連体化,*,*,*,*,の,ノ,ノ\")       \n",
	" MeCab.MecabNode(\"うち\",\"名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ\")"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"results = parse(mecab, \"すももももももももものうち\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"---"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"INFO: Updating METADATA...\n",
	"INFO: Computing changes...\n",
	"INFO: No packages to install, update or remove\n",
	"INFO: Nothing to be done\n",
	"INFO: Nothing to be done\n",
	"INFO: Nothing to be done\n",
	"INFO: Nothing to be done\n"
	]
	}
	],
	"source": [
	"Pkg.update()\n",
	"Pkg.add(\"DataStructures\")\n",
	"Pkg.add(\"TinySegmenter\")\n",
	"Pkg.add(\"StatsBase\")\n",
	"Pkg.add(\"Formatting\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"using DataStructures\n",
	"using TinySegmenter\n",
	"using StatsBase\n",
	"using Formatting"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"srand(2015);"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"ntopics = 5;"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"datadir = \"./data/lyrics/\";"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Documents"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# One input file (song lyrics) together with its current topic assignment.\n",
	"type Document\n",
	"    id::Int                                 # index of the file in readdir(datadir)\n",
	"    length::Int                             # token count; decremented/incremented while sampling\n",
	"    words::Vector{UTF8String}               # tokens remaining after stop-word removal\n",
	"    topicids::Vector{Int}                   # topic of each token, parallel to `words`\n",
	"    topicidcount::DefaultDict{Int,Int,Int}  # topic id => number of tokens with that topic\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Set(UTF8String[\"「\",\"だっ\",\"よ\",\"う\",\"ね\",\"い\",\"、\",\"も\",\"一\",\")\" … \"」\",\"さ\",\"か\",\"なら\",\"な\",\"から\",\"だ\",\"なた\",\"　\",\"の\"])"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Arbitrary, hand-picked tokens that are filtered out of every document\n",
	"# before topics are assigned (the duplicated \"い\" entry has been removed).\n",
	"stopwords = Set{UTF8String}([\"　\", \"の\", \"に\", \"を\", \"は\", \"て\", \"た\", \"が\",\n",
	"                                \" \", \"と\", \"も\", \"で\", \"ない\", \"あ\", \"な\",\n",
	"                                \")\", \"(\", \"、\", \"か\", \"し\", \"その\", \"い\", \"から\", \"だ\", \"ば\",\n",
	"                                \"よ\", \"だっ\", \"う\", \"よう\", \"ます\",\n",
	"                                \"へ\", \"なた\", \"一\", \"「\", \"」\", \"・\", \"お\", \"ね\", \"なっ\", \"…\",\n",
	"                                \"なら\", \"たら\", \"だけ\", \"てる\", \"たい\", \"や\", \"さ\", \"0\", \"。\"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"corpus = Document[]\n",
	"\n",
	"# Build one Document per file found in `datadir`.\n",
	"let\n",
	"    for (docid, fname) in enumerate(readdir(datadir))\n",
	"        # joinpath avoids the doubled separator that \"$(datadir)/$(fname)\"\n",
	"        # produced when datadir already ends in \"/\".\n",
	"        open(joinpath(datadir, fname), \"r\") do fin\n",
	"            tokens = UTF8String[]\n",
	"            for line in eachline(fin)\n",
	"                append!(tokens, tokenize(strip(line)))\n",
	"            end\n",
	"\n",
	"            words = filter(w -> w ∉ stopwords, tokens)\n",
	"\n",
	"            doclength = length(words)\n",
	"            topicids = rand(1:ntopics, doclength) # initialize with random topics\n",
	"            topicidcount = DefaultDict(0, Dict{Int,Int}(counter(topicids)))\n",
	"            push!(corpus, Document(docid, doclength, words, topicids, topicidcount))\n",
	"        end\n",
	"    end\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"449"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"length(corpus)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"98"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"corpus[99].length"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5-element Array{UTF8String,1}:\n",
	" \"笑っ\"\n",
	" \"君\" \n",
	" \"ため\"\n",
	" \"笑っ\"\n",
	" \"僕\" "
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"corpus[99].words[1:5]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5-element Array{Int64,1}:\n",
	" 5\n",
	" 1\n",
	" 2\n",
	" 1\n",
	" 1"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"corpus[99].topicids[1:5]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"DataStructures.DefaultDict{Int64,Int64,Int64} with 5 entries:\n",
	" 4 => 20\n",
	" 2 => 13\n",
	" 3 => 24\n",
	" 5 => 18\n",
	" 1 => 23"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"corpus[99].topicidcount"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Topics"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Corpus-wide statistics for a single topic.\n",
	"# The dictionary uses `Int` (not a hard-coded `Int64`) so that it matches\n",
	"# `DefaultDict(UTF8String, Int, 0)` used to construct it and the `Int` fields of `Document`.\n",
	"type Topic\n",
	"    id::Int\n",
	"    count::Int                                  # total number of tokens assigned to this topic\n",
	"    wordcount::DefaultDict{UTF8String,Int,Int}  # word => number of its tokens assigned to this topic\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Start every topic with zero tokens and an empty (default 0) word tally.\n",
	"topics = Dict{Int64,Topic}()\n",
	"for k in 1:ntopics\n",
	"    emptytally = DefaultDict(UTF8String, Int, 0)\n",
	"    topics[k] = Topic(k, 0, emptytally)\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Seed each topic's totals from the random initial assignment.\n",
	"# NOTE: this adds to whatever is already in `topics`; rebuild `topics`\n",
	"# before running this cell a second time.\n",
	"for doc in corpus\n",
	"    for (w, k) in zip(doc.words, doc.topicids)\n",
	"        topic = topics[k]\n",
	"        topic.count += 1\n",
	"        topic.wordcount[w] += 1\n",
	"    end\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10202"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"topics[1].count"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"## Estimation by Gibbs Sampling"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"11950"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Recursively flatten nested arrays into one vector.\n",
	"# c.f. http://rosettacode.org/wiki/Flatten_a_list#Julia\n",
	"# Kept for any other code that calls it; no longer used for `vocabsize`.\n",
	"flat(A) = mapreduce(x -> isa(x, Array) ? flat(x) : x, vcat, [], A)\n",
	"\n",
	"# Vocabulary size = number of distinct tokens in the corpus. Filling a typed\n",
	"# Set directly avoids the repeated vcat copies made by flat().\n",
	"vocabulary = Set{UTF8String}()\n",
	"for doc in corpus\n",
	"    union!(vocabulary, doc.words)\n",
	"end\n",
	"vocabsize = length(vocabulary)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.01"
	]
	},
	"execution_count": 22,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"α = 0.01\n",
	"β = 0.01"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false,
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"# Gibbs sampling: repeatedly resample the topic of every token given all\n",
	"# other assignments.\n",
	"# NOTE: `corpus` and `topics` are updated in place, so re-running this cell\n",
	"# continues from the current state instead of starting over.\n",
	"let\n",
	"    niterations = 1000      # arbitrary number of iterations\n",
	"    probs = zeros(ntopics)  # unnormalized topic weights, reused for every token\n",
	"\n",
	"    for _ in 1:niterations\n",
	"        for document in corpus\n",
	"            for (i, word) in enumerate(document.words)\n",
	"                # take the token's current assignment out of every count\n",
	"                topicid_current = document.topicids[i]\n",
	"                topics[topicid_current].count -= 1\n",
	"                topics[topicid_current].wordcount[word] -= 1\n",
	"                document.length -= 1\n",
	"                document.topicidcount[topicid_current] -= 1\n",
	"\n",
	"                # calc. weights for the topics, sample one.\n",
	"                # α smooths the document-topic counts and β the topic-word counts\n",
	"                # (the usual LDA notation; identical results while α == β).\n",
	"                for k in 1:ntopics\n",
	"                    topicprob = (document.topicidcount[k] + α) / (document.length + α*ntopics)\n",
	"                    wordprob = (topics[k].wordcount[word] + β) / (topics[k].count + β*vocabsize)\n",
	"                    probs[k] = topicprob * wordprob\n",
	"                end\n",
	"                topicid_new = sample(1:ntopics, WeightVec(probs))\n",
	"\n",
	"                # put the token back under the newly sampled topic\n",
	"                document.topicids[i] = topicid_new\n",
	"                topics[topicid_new].count += 1\n",
	"                topics[topicid_new].wordcount[word] += 1\n",
	"                document.length += 1\n",
	"                document.topicidcount[topicid_new] += 1\n",
	"            end\n",
	"        end\n",
	"    end\n",
	"end"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"## Results"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": false,
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Topic 1\n",
	"0.0109\t彼\n",
	"0.0090\t娘\n",
	"0.0074\t1\n",
	"0.0066\t恋\n",
	"0.0054\t子供\n",
	"0.0054\t5\n",
	"0.0053\tたち\n",
	"0.0051\t4\n",
	"0.0049\t帰っ\n",
	"0.0049\t出来\n",
	"\n",
	"Topic 2\n",
	"0.0199\tこと\n",
	"0.0146\t生き\n",
	"0.0116\tそれ\n",
	"0.0103\tこの\n",
	"0.0094\tだろ\n",
	"0.0092\t人\n",
	"0.0085\t自分\n",
	"0.0083\t心\n",
	"0.0074\t忘れ\n",
	"0.0074\t何\n",
	"\n",
	"Topic 3\n",
	"0.0525\t君\n",
	"0.0320\t僕\n",
	"0.0092\tいる\n",
	"0.0090\t時\n",
	"0.0075\t愛\n",
	"0.0073\t町\n",
	"0.0069\t忘れ\n",
	"0.0065\t風\n",
	"0.0063\t人\n",
	"0.0059\t手\n",
	"\n",
	"Topic 4\n",
	"0.0159\t人\n",
	"0.0136\t私\n",
	"0.0134\tあなた\n",
	"0.0117\tこと\n",
	"0.0100\tある\n",
	"0.0091\t日\n",
	"0.0084\t夢\n",
	"0.0079\t花\n",
	"0.0064\t愛\n",
	"0.0062\t時\n",
	"\n",
	"Topic 5\n",
	"0.0133\tがんばらんば\n",
	"0.0079\tがんばれ\n",
	"0.0065\t明日\n",
	"0.0065\tBye\n",
	"0.0061\t何\n",
	"0.0056\t来\n",
	"0.0056\t日\n",
	"0.0051\tせ\n",
	"0.0049\t今\n",
	"0.0049\tある\n",
	"\n"
	]
	}
	],
	"source": [
	"# Print, for each topic, its most frequent words (at most ten) together with\n",
	"# each word's share of the tokens assigned to that topic.\n",
	"for k in 1:ntopics\n",
	"    println(\"Topic $(k)\")\n",
	"    t = topics[k]\n",
	"    ranked = sort([(count / t.count, word) for (word, count) in t.wordcount], rev=true)\n",
	"    # a topic can hold fewer than ten distinct words; a fixed [1:10] slice\n",
	"    # would then raise a BoundsError\n",
	"    for (prob, word) in ranked[1:min(10, length(ranked))]\n",
	"        printfmtln(\"{:.4f}\\t{}\", prob, word)\n",
	"    end\n",
	"    println()\n",
	"end"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"\\- End"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Julia 0.4.2",
	"language": "julia",
	"name": "julia-0.4"
	},
	"language_info": {
	"file_extension": ".jl",
	"mimetype": "application/julia",
	"name": "julia",
	"version": "0.4.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}