Last active
December 27, 2015 23:48
-
-
Save sorami/60b1aa2e358f98454625 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Topic Modeling with Julia and Masashi Sada\n", | |
"\n", | |
"[さだとJuliaでLDA - Qiita](http://qiita.com/sorami/items/6c9eef251f82b424bf68)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO: Updating METADATA...\n", | |
"INFO: Computing changes...\n", | |
"INFO: No packages to install, update or remove\n", | |
"INFO: Nothing to be done\n" | |
] | |
} | |
], | |
"source": [ | |
"Pkg.update()\n", | |
"Pkg.add(\"MeCab\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"using MeCab" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"ename": "LoadError", | |
"evalue": "LoadError: failed to create tagger\nwhile loading In[3], in expression starting on line 1", | |
"output_type": "error", | |
"traceback": [ | |
"LoadError: failed to create tagger\nwhile loading In[3], in expression starting on line 1", | |
"", | |
" in call at /Users/sorami/.julia/v0.4/MeCab/src/MeCab.jl:32", | |
" in call at /Users/sorami/.julia/v0.4/MeCab/src/MeCab.jl:22" | |
] | |
} | |
], | |
"source": [ | |
"mecab = Mecab()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"→ For now, use *TinySegmenter.jl* instead." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Update (2015-12-27):** MeCab.jl was updated upstream, and after `Pkg.update()` the `Mecab()` call that failed above succeeds (see the cells below). Relevant commit: [Merge pull request #11 from r9y9/library-ignore-path · chezou/MeCab.jl@0698ba9](https://github.com/chezou/MeCab.jl/commit/0698ba9653afe97a00d1af07f80494f430460fec)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"Pkg.update()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"using MeCab" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"MeCab.Mecab(Ptr{Void} @0x00007f8a8bf24780)" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"mecab = Mecab()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"8-element Array{MeCab.MecabNode,1}:\n", | |
" MeCab.MecabNode(\"すもも\",\"名詞,一般,*,*,*,*,すもも,スモモ,スモモ\")\n", | |
" MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\") \n", | |
" MeCab.MecabNode(\"もも\",\"名詞,一般,*,*,*,*,もも,モモ,モモ\") \n", | |
" MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\") \n", | |
" MeCab.MecabNode(\"もも\",\"名詞,一般,*,*,*,*,もも,モモ,モモ\") \n", | |
" MeCab.MecabNode(\"も\",\"助詞,係助詞,*,*,*,*,も,モ,モ\") \n", | |
" MeCab.MecabNode(\"の\",\"助詞,連体化,*,*,*,*,の,ノ,ノ\") \n", | |
" MeCab.MecabNode(\"うち\",\"名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ\")" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"results = parse(mecab, \"すももももももももものうち\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO: Updating METADATA...\n", | |
"INFO: Computing changes...\n", | |
"INFO: No packages to install, update or remove\n", | |
"INFO: Nothing to be done\n", | |
"INFO: Nothing to be done\n", | |
"INFO: Nothing to be done\n", | |
"INFO: Nothing to be done\n" | |
] | |
} | |
], | |
"source": [ | |
"Pkg.update()\n", | |
"Pkg.add(\"DataStructures\")\n", | |
"Pkg.add(\"TinySegmenter\")\n", | |
"Pkg.add(\"StatsBase\")\n", | |
"Pkg.add(\"Formatting\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"using DataStructures\n", | |
"using TinySegmenter\n", | |
"using StatsBase\n", | |
"using Formatting" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"srand(2015);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"ntopics = 5;" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"datadir = \"./data/lyrics/\";" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Documents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# One tokenized document together with its current per-word topic assignments.\n", | |
"type Document\n", | |
"    id::Int                                 # 1-based index given to the document at load time\n", | |
"    length::Int                             # number of words kept; adjusted temporarily while sampling\n", | |
"    words::Vector{UTF8String}               # tokens that survived stopword filtering\n", | |
"    topicids::Vector{Int}                   # topicids[i] is the topic assigned to words[i]\n", | |
"    topicidcount::DefaultDict{Int,Int,Int}  # topic id => number of words assigned to that topic\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Set(UTF8String[\"「\",\"だっ\",\"よ\",\"う\",\"ね\",\"い\",\"、\",\"も\",\"一\",\")\" … \"」\",\"さ\",\"か\",\"なら\",\"な\",\"から\",\"だ\",\"なた\",\" \",\"の\"])" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Hand-picked tokens (particles, auxiliaries, punctuation, fragments) that are\n", | |
"# removed from every document before topics are assigned.\n", | |
"# The list is arbitrary; a duplicated \"い\" entry has been dropped (no effect on the Set).\n", | |
"# NOTE(review): \" \" appears on the first and on the second line. One of them may\n", | |
"# originally have been a full-width space (U+3000) -- confirm before deduplicating.\n", | |
"stopwords = Set{UTF8String}([\" \", \"の\", \"に\", \"を\", \"は\", \"て\", \"た\", \"が\",\n", | |
"                             \" \", \"と\", \"も\", \"で\", \"ない\", \"あ\", \"な\",\n", | |
"                             \")\", \"(\", \"、\", \"か\", \"し\", \"その\", \"い\", \"から\", \"だ\", \"ば\",\n", | |
"                             \"よ\", \"だっ\", \"う\", \"よう\", \"ます\",\n", | |
"                             \"へ\", \"なた\", \"一\", \"「\", \"」\", \"・\", \"お\", \"ね\", \"なっ\", \"…\",\n", | |
"                             \"なら\", \"たら\", \"だけ\", \"てる\", \"たい\", \"や\", \"さ\", \"0\", \"。\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Build the corpus: one Document per regular file found directly in `datadir`.\n", | |
"# Every file is tokenized line by line, stopwords are removed, and each remaining\n", | |
"# word starts with a uniformly random topic in 1:ntopics.\n", | |
"corpus = Document[]\n", | |
"\n", | |
"let\n", | |
"    # joinpath avoids the doubled separator that \"$(datadir)/$(fname)\" produced when\n", | |
"    # datadir already ends in \"/\"; isfile skips sub-directories, which open() cannot read.\n", | |
"    docpaths = filter(isfile, map(fname -> joinpath(datadir, fname), readdir(datadir)))\n", | |
"\n", | |
"    for (docid, docpath) in enumerate(docpaths)\n", | |
"        open(docpath, \"r\") do fin\n", | |
"            words = UTF8String[]\n", | |
"            for line in eachline(fin)\n", | |
"                append!(words, tokenize(strip(line)))\n", | |
"            end\n", | |
"\n", | |
"            words = filter(x -> x ∉ stopwords, words)\n", | |
"\n", | |
"            doclength = length(words)\n", | |
"            topicids = rand(1:ntopics, doclength) # initialize with random topics\n", | |
"            # topic id => number of words of this document assigned to it (0 when absent)\n", | |
"            topicidcount = DefaultDict(0, Dict{Int,Int}(counter(topicids)))\n", | |
"            push!(corpus, Document(docid, doclength, words, topicids, topicidcount))\n", | |
"        end\n", | |
"    end\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"449" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"length(corpus)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"98" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"corpus[99].length" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5-element Array{UTF8String,1}:\n", | |
" \"笑っ\"\n", | |
" \"君\" \n", | |
" \"ため\"\n", | |
" \"笑っ\"\n", | |
" \"僕\" " | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"corpus[99].words[1:5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5-element Array{Int64,1}:\n", | |
" 5\n", | |
" 1\n", | |
" 2\n", | |
" 1\n", | |
" 1" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"corpus[99].topicids[1:5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"DataStructures.DefaultDict{Int64,Int64,Int64} with 5 entries:\n", | |
" 4 => 20\n", | |
" 2 => 13\n", | |
" 3 => 24\n", | |
" 5 => 18\n", | |
" 1 => 23" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"corpus[99].topicidcount" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Topics" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Corpus-wide statistics of one topic.\n", | |
"# `wordcount` is typed with `Int` (not `Int64`) so that it matches the\n", | |
"# `DefaultDict(UTF8String, Int, 0)` values it is constructed with, also on 32-bit builds.\n", | |
"type Topic\n", | |
"    id::Int                                    # topic id, 1:ntopics\n", | |
"    count::Int                                 # total number of words currently assigned to this topic\n", | |
"    wordcount::DefaultDict{UTF8String,Int,Int} # word => number of times it is assigned to this topic\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create one empty Topic (zero total, zero-defaulting word counts) per topic id.\n", | |
"topics = Dict{Int64,Topic}()\n", | |
"for k in 1:ntopics\n", | |
"    emptycounts = DefaultDict(UTF8String, Int, 0)\n", | |
"    topics[k] = Topic(k, 0, emptycounts)\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Derive the per-topic counts from the topic assignments currently stored in `corpus`.\n", | |
"# The counts are reset first, so re-running this cell rebuilds them instead of\n", | |
"# adding every word a second time.\n", | |
"for topic in values(topics)\n", | |
"    topic.count = 0\n", | |
"    topic.wordcount = DefaultDict(UTF8String, Int, 0)\n", | |
"end\n", | |
"\n", | |
"for document in corpus\n", | |
"    # words[i] is assigned to topicids[i]\n", | |
"    for (word, topicid) in zip(document.words, document.topicids)\n", | |
"        topics[topicid].count += 1\n", | |
"        topics[topicid].wordcount[word] += 1\n", | |
"    end\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"10202" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"topics[1].count" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"## Estimation by Gibbs Sampling" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"11950" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Recursively flatten nested arrays into one untyped array\n", | |
"# (c.f. http://rosettacode.org/wiki/Flatten_a_list#Julia).\n", | |
"# Still defined for compatibility, but no longer used to build the vocabulary.\n", | |
"flat(A) = mapreduce(x -> isa(x, Array) ? flat(x) : x, vcat, [], A)\n", | |
"\n", | |
"# Vocabulary = set of distinct words over all documents. Adding each document's\n", | |
"# words straight into a typed Set avoids re-copying a growing array for every document.\n", | |
"vocabulary = Set{UTF8String}()\n", | |
"for doc in corpus\n", | |
"    union!(vocabulary, doc.words)\n", | |
"end\n", | |
"vocabsize = length(vocabulary)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.01" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Smoothing constants added to the counts in the Gibbs sampler; both are set to 0.01.\n", | |
"α = β = 0.01" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Gibbs sampling: for every word, remove its current topic assignment from all\n", | |
"# counts, sample a new topic from the resulting weights, then add the word back\n", | |
"# under the sampled topic.\n", | |
"#\n", | |
"# Reads the globals `corpus`, `topics`, `ntopics`, `vocabsize`, `α` and `β`;\n", | |
"# mutates `corpus` (topicids, topicidcount) and `topics` (count, wordcount) in place.\n", | |
"let\n", | |
"    niterations = 1000               # arbitrary number of sweeps over the whole corpus\n", | |
"    probs = zeros(Float64, ntopics)  # unnormalized topic weights, overwritten for every word\n", | |
"\n", | |
"    for _ in 1:niterations\n", | |
"        for document in corpus\n", | |
"            for (i, word) in enumerate(document.words)\n", | |
"                # decrement each count the word currently contributes to\n", | |
"                topicid_current = document.topicids[i]\n", | |
"                topics[topicid_current].count -= 1\n", | |
"                topics[topicid_current].wordcount[word] -= 1\n", | |
"                document.length -= 1\n", | |
"                document.topicidcount[topicid_current] -= 1\n", | |
"\n", | |
"                # calc. weights for the topics, sample one.\n", | |
"                # α smooths the document-topic counts and β the topic-word counts, following\n", | |
"                # the usual LDA notation. The two symbols used to be swapped here; the weights\n", | |
"                # are unchanged as long as α == β.\n", | |
"                for k in 1:ntopics\n", | |
"                    topicprob = (document.topicidcount[k] + α) / (document.length + α*ntopics)\n", | |
"                    wordprob = (topics[k].wordcount[word] + β) / (topics[k].count + β*vocabsize)\n", | |
"                    probs[k] = topicprob * wordprob\n", | |
"                end\n", | |
"                topicid_new = sample(1:ntopics, WeightVec(probs))\n", | |
"\n", | |
"                # increment each count, according to the new sampled topic\n", | |
"                document.topicids[i] = topicid_new\n", | |
"                topics[topicid_new].count += 1\n", | |
"                topics[topicid_new].wordcount[word] += 1\n", | |
"                document.length += 1\n", | |
"                document.topicidcount[topicid_new] += 1\n", | |
"            end\n", | |
"        end\n", | |
"    end\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"## Results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Topic 1\n", | |
"0.0109\t彼\n", | |
"0.0090\t娘\n", | |
"0.0074\t1\n", | |
"0.0066\t恋\n", | |
"0.0054\t子供\n", | |
"0.0054\t5\n", | |
"0.0053\tたち\n", | |
"0.0051\t4\n", | |
"0.0049\t帰っ\n", | |
"0.0049\t出来\n", | |
"\n", | |
"Topic 2\n", | |
"0.0199\tこと\n", | |
"0.0146\t生き\n", | |
"0.0116\tそれ\n", | |
"0.0103\tこの\n", | |
"0.0094\tだろ\n", | |
"0.0092\t人\n", | |
"0.0085\t自分\n", | |
"0.0083\t心\n", | |
"0.0074\t忘れ\n", | |
"0.0074\t何\n", | |
"\n", | |
"Topic 3\n", | |
"0.0525\t君\n", | |
"0.0320\t僕\n", | |
"0.0092\tいる\n", | |
"0.0090\t時\n", | |
"0.0075\t愛\n", | |
"0.0073\t町\n", | |
"0.0069\t忘れ\n", | |
"0.0065\t風\n", | |
"0.0063\t人\n", | |
"0.0059\t手\n", | |
"\n", | |
"Topic 4\n", | |
"0.0159\t人\n", | |
"0.0136\t私\n", | |
"0.0134\tあなた\n", | |
"0.0117\tこと\n", | |
"0.0100\tある\n", | |
"0.0091\t日\n", | |
"0.0084\t夢\n", | |
"0.0079\t花\n", | |
"0.0064\t愛\n", | |
"0.0062\t時\n", | |
"\n", | |
"Topic 5\n", | |
"0.0133\tがんばらんば\n", | |
"0.0079\tがんばれ\n", | |
"0.0065\t明日\n", | |
"0.0065\tBye\n", | |
"0.0061\t何\n", | |
"0.0056\t来\n", | |
"0.0056\t日\n", | |
"0.0051\tせ\n", | |
"0.0049\t今\n", | |
"0.0049\tある\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print, for every topic, its highest-probability words (at most ten), where a\n", | |
"# word's probability is its count within the topic divided by the topic's total count.\n", | |
"for k in 1:ntopics\n", | |
"    println(\"Topic $(k)\")\n", | |
"    t = topics[k]\n", | |
"    ranked = sort([(count / t.count, word) for (word, count) in t.wordcount], rev=true)\n", | |
"    # a topic may hold fewer than ten distinct words; clamp the slice so that\n", | |
"    # a short list is printed instead of raising a BoundsError\n", | |
"    for (prob, word) in ranked[1:min(10, length(ranked))]\n", | |
"        printfmtln(\"{:.4f}\\t{}\", prob, word)\n", | |
"    end\n", | |
"    println()\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\\- End" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Julia 0.4.2", | |
"language": "julia", | |
"name": "julia-0.4" | |
}, | |
"language_info": { | |
"file_extension": ".jl", | |
"mimetype": "application/julia", | |
"name": "julia", | |
"version": "0.4.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment