Skip to content

Instantly share code, notes, and snippets.

@k0f1sh
Last active May 14, 2020 16:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k0f1sh/cf32020aae18c0fda0e43f135bdf119d to your computer and use it in GitHub Desktop.
Save k0f1sh/cf32020aae18c0fda0e43f135bdf119d to your computer and use it in GitHub Desktop.
(ns gengo04
  (:require [clojure.java.io :as io]
            [clojure.string]
            [incanter.charts :as incanter-charts]
            [incanter.core :as incanter-core]))
;; https://nlp100.github.io/ja/ch04.html
;; $ mecab neko.txt -o neko.txt.mecab
;; 30
;; Token maps parsed from the MeCab output resource "neko.txt.mecab".
;;
;; Structure of a MeCab output line:
;;   surface\tpos,pos-detail1,pos-detail2,pos-detail3,conj-type,conj-form,base,reading,pronunciation
;;
;; "EOS" markers and lines that carry no tab-separated feature column (for
;; example blank lines) are skipped. Every other line becomes a map with the
;; keys :surface, :base, :pos and :pos1. The result is a fully realized seq.
(def maps
  (letfn [(parse-line [line]
            (let [[surface details-str] (clojure.string/split line #"\t")]
              ;; details-str is nil when the line contains no tab ("EOS",
              ;; blank lines); return nil for those instead of letting
              ;; clojure.string/split fail on a nil argument.
              (when (and (not= line "EOS") details-str)
                (let [[pos pos1 _ _ _ _ base] (clojure.string/split details-str #",")]
                  {:surface surface
                   :base base
                   :pos pos
                   :pos1 pos1}))))]
    ;; with-open closes the reader; doall forces every line to be read
    ;; before the reader goes out of scope.
    (with-open [rdr (io/reader (io/resource "neko.txt.mecab"))]
      (doall (keep parse-line (line-seq rdr))))))
;; 31
;; Set of the distinct :surface values of all tokens in `maps`.
(def surfaces
  (set (map :surface maps)))
;; 32
;; Set of the distinct :base values of all tokens in `maps`.
(def bases
  (set (map :base maps)))
;; 33
;; Lazy seq of strings built from every window of three consecutive tokens
;; whose middle token has the surface "の" and whose outer tokens both have
;; :pos "名詞"; the three surfaces are concatenated in order.
(def no
  (for [[left particle right] (partition 3 1 maps)
        :when (and (= "の" (:surface particle))
                   (= "名詞" (:pos left))
                   (= "名詞" (:pos right)))]
    (str (:surface left) (:surface particle) (:surface right))))
;; 34
;; Vector of maximal runs of consecutive tokens whose :pos is "名詞".
;; Each element is a seq of token maps, in the order the runs occur in `maps`.
;;
;; partition-by splits `maps` into alternating noun / non-noun runs, so a run
;; that ends the input is kept as well (a manual loop that stops as soon as
;; the remainder is empty would lose that final run).
(def nouns-list
  (let [noun? #(= (:pos %) "名詞")]
    (->> maps
         (partition-by noun?)
         ;; every partition is homogeneous, so checking its first token is enough
         (filterv (comp noun? first)))))
;; 35
;; [surface count] entries for every surface form in `maps`, ordered from the
;; highest count to the lowest (ascending sort by count, then reversed).
(def sorted-surface-freq
  (let [surface-counts (frequencies (map :surface maps))
        ascending      (sort-by second (seq surface-counts))]
    (reverse ascending)))
;; 36
;; Display (via incanter-core/view) a bar chart of the first ten entries of
;; `sorted-surface-freq`, i.e. the ten surface forms with the highest counts.
;; The title string means "top 10 words by frequency".
;; NOTE(review): bar-chart appears to be a macro that uses the argument forms
;; as default axis labels -- confirm before renaming or extracting the arguments.
(let [top10 (take 10 sorted-surface-freq)]
(incanter-core/view
(incanter-charts/bar-chart
;; categories: the surface strings
(map first top10)
;; values: the matching occurrence counts
(map second top10)
:title "頻度上位10語")))
;; 37
;; Token runs obtained by splitting `maps` wherever :pos1 is "句点".
;; The runs that consist of the "句点" tokens themselves are discarded.
(def sentences
  (let [period? #(= (:pos1 %) "句点")]
    (->> maps
         (partition-by period?)
         (remove (comp period? first)))))
;; [surface count] entries counting how often each surface form appears in a
;; sentence that also contains a token with surface "猫" (the "猫" tokens
;; themselves are excluded), ordered from the highest count to the lowest.
(def cooccurrence
  (let [cat?       #(= (:surface %) "猫")
        neighbours (->> sentences
                        (filter #(some cat? %))
                        (mapcat #(remove cat? %))
                        (map :surface))
        ascending  (sort-by second (frequencies neighbours))]
    (reverse ascending)))
;; Display (via incanter-core/view) a bar chart of the first ten entries of
;; `cooccurrence`, i.e. the ten surface forms with the highest co-occurrence
;; counts. The title string means "top 10 words co-occurring with 猫".
;; NOTE(review): bar-chart appears to be a macro that uses the argument forms
;; as default axis labels -- confirm before renaming or extracting the arguments.
(let [top10 (take 10 cooccurrence)]
(incanter-core/view
(incanter-charts/bar-chart
;; categories: the surface strings
(map first top10)
;; values: the matching co-occurrence counts
(map second top10)
:title "猫との共起頻度上位10語")))
;; 38
;; [occurrence-count number-of-surface-forms] entries: for every occurrence
;; count, how many distinct surface forms occur exactly that often. Ordered
;; from the largest number of surface forms to the smallest.
(def histgram-data
  (let [count-per-surface  (vals (frequencies (map :surface maps)))
        surfaces-per-count (frequencies count-per-surface)]
    (reverse (sort-by second surfaces-per-count))))
;; Display (via incanter-core/view) a bar chart of `histgram-data`.
;; The title string means "histogram of word occurrence frequency".
;; NOTE(review): the categories follow the order of `histgram-data` (sorted by
;; number of surface forms, descending), not ascending occurrence count --
;; confirm this is the intended x-axis order for a histogram.
;; NOTE(review): bar-chart appears to be a macro that uses the argument forms
;; as default axis labels -- confirm before renaming or extracting the arguments.
(incanter-core/view
(incanter-charts/bar-chart
;; categories: the occurrence counts
(map first histgram-data)
;; values: how many distinct surface forms have that occurrence count
(map second histgram-data)
:title "単語の出現頻度のヒストグラム"))
;; 39
;; Task 39 (Zipf's law): log-log scatter plot with the frequency rank of each
;; surface form on the x axis and its occurrence count on the y axis.
;;
;; The counts are sorted in descending order so that the most frequent surface
;; form gets rank 1; the n-th element of `freqs` therefore has rank n.
(let [freqs (->> maps
                 (map :surface)
                 (frequencies)
                 (vals)
                 (sort >))
      ranks (range 1 (inc (count freqs)))]
  (incanter-core/view
   (-> (incanter-charts/scatter-plot ranks freqs
                                     :x-label "出現頻度順位"
                                     :y-label "出現頻度")
       (incanter-charts/set-axis :x (incanter-charts/log-axis))
       (incanter-charts/set-axis :y (incanter-charts/log-axis)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment