Skip to content

Instantly share code, notes, and snippets.

@k0f1sh
Last active May 11, 2020 14:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k0f1sh/efcf85fcc07b3f334299cc33e8540e23 to your computer and use it in GitHub Desktop.
Save k0f1sh/efcf85fcc07b3f334299cc33e8540e23 to your computer and use it in GitHub Desktop.
(ns gengo03
  (:require [cheshire.core :as cheshire]
            [clojure.java.io :as io]
            [clojure.string]))
;; https://nlp100.github.io/ja/ch03.html
;; 20
;; Open "jawiki-country.json" from the classpath and decode each line as one
;; JSON document.
;; `r` stays a top-level var, but the reader it holds is closed as soon as
;; `jsons` has been fully realised, so the underlying stream is not leaked.
(def r (io/reader (io/resource "jawiki-country.json")))
(def jsons
  (with-open [^java.io.BufferedReader rdr r]
    ;; doall forces every line to be read and decoded before with-open
    ;; closes the reader; a lazy seq escaping this scope would fail later.
    (doall (map cheshire/decode (line-seq rdr)))))
;; The first decoded document whose "title" is "イギリス", and its "text" field.
(def uk
  (->> jsons
       (filter #(= (get % "title") "イギリス"))
       first))
(def uk-text
  (-> uk (get "text")))
;; 21
;; Print the full text of every category declaration found in the article.
;; Each re-seq result is a vector [whole-match group]; only the whole match
;; is printed.
(run! println
      (map first
           (re-seq #"\[\[Category\:(.*?)\|*.*\]\]" uk-text)))
;; 22
;; Print only the category names: take what sits between "[[Category:" and
;; "]]", then cut everything from the first "|" onwards.
(->> (re-seq #"\[\[Category\:(.*)\]\]" uk-text)
     (map second)
     (map #(clojure.string/replace % #"\|.*" ""))
     (run! println))
;; 23
;; Print each section heading together with its level. The level is the
;; length of the leading run of "=" minus one, so "==Title==" prints as 1.
(doseq [[heading title] (re-seq #"==+(.*?)==+" uk-text)
        :let [level (-> (re-find #"=+" heading) count dec)]]
  (println (str (clojure.string/trim title) " " level)))
;; 24
;; Print the name of every media file referenced through a file link.
;; The name is the text after the namespace prefix up to the first "|" or
;; "]", so two links on the same line are reported separately instead of
;; being merged by a greedy match. Both the "ファイル:" and "File:" prefixes
;; are accepted.
(doseq [[_ file-name] (re-seq #"\[\[(?:ファイル|File):([^|\]]+)" uk-text)]
  (println file-name))
;; 25
(import '(java.io BufferedReader StringReader))
;; NOTE: the reduce-based version below is kept commented out for reference; it was replaced by the drop-while/take-while pipeline that follows.
;; (def raw-lines (-> (reduce (fn [{:keys [lines inner-section?]} line]
;; (if inner-section?
;; (if (= line "}}")
;; {:lines (conj lines line) :inner-section? false}
;; {:lines (conj lines line) :inner-section? inner-section?})
;; (if (= line "{{基礎情報 国")
;; {:lines lines :inner-section? true}
;; {:lines lines :inner-section? inner-section?})))
;; {:lines []
;; :inner-section? false}
;; (line-seq (BufferedReader. (StringReader. uk-text))))
;; :lines))
;; Lines of the "基礎情報 国" template: everything strictly between the line
;; that is exactly "{{基礎情報 国" and the first following line that is
;; exactly "}}".
(def raw-lines
  (let [opening? #{"{{基礎情報 国"}
        closing? #{"}}"}]
    (->> (StringReader. uk-text)
         (BufferedReader.)
         line-seq
         (drop-while (complement opening?))
         rest ;; skip the opening "{{基礎情報 国" line itself
         (take-while (complement closing?)))))
;; Parse the template lines into a map of field name -> raw value.
;; A field line starts with "|" and is split at its FIRST "=". Whitespace
;; around the name and before the value is optional, so "|key = v",
;; "|key  =v" and "|key=v" all parse. Lines that do not start with "|"
;; (for example continuation lines of a multi-line value) are skipped.
(def templates
  (->> raw-lines
       (keep (fn [raw-line]
               ;; The lazy (.+?) stops the name at the first "=", so an
               ;; "=" inside the value can no longer be absorbed into the name.
               (when-let [[_ k v] (re-find #"^\|\s*(.+?)\s*=\s*(.*)$" raw-line)]
                 [k v])))
       (into {})))
;; 26
;; Strip emphasis markup from every value. The quote runs are handled from
;; longest to shortest (five, then three, then two apostrophes), keeping the
;; enclosed text each time.
(def templates2
  (into {}
        (map (fn [[field text]]
               [field (-> text
                          (clojure.string/replace #"'''''(.*?)'''''" "$1")
                          (clojure.string/replace #"'''(.*?)'''" "$1")
                          (clojure.string/replace #"''(.*?)''" "$1"))]))
        templates))
;; 27
;; Replace internal links with their visible text:
;;   [[target]]       -> target
;;   [[target|label]] -> label (everything after the first "|")
;; The pattern never crosses a "[" or "]", so a plain link followed by a
;; piped link in the same value is handled link by link instead of being
;; swallowed into one match. Because only innermost links match, the
;; replacement is repeated until the string stops changing, which also
;; unwraps links nested inside another link.
(def templates3
  (letfn [(strip-links [s]
            (let [stripped (clojure.string/replace
                            s
                            #"\[\[(?:[^|\[\]]*\|)?([^\[\]]*)\]\]"
                            "$1")]
              ;; Every replacement removes at least the four bracket
              ;; characters, so this loop always terminates.
              (if (= stripped s)
                s
                (recur stripped))))]
    (into {}
          (map (fn [[k v]] [k (strip-links v)]))
          templates2)))
;; 28
;; Remove the remaining markup from every value, then print all fields.
;; (Original author's note: the result is still not great.)
;; Order matters:
;;   1. self-closing refs (<ref ... />), so they are not paired with a later </ref>
;;   2. complete <ref ...>...</ref> pairs, matched lazily so text between two
;;      separate refs is kept
;;   3. any dangling opening or closing ref tag left on the line
;;   4. <br>, <br/> and <br /> line breaks
;;   5. {{...}} templates, removed together with their content
;; "\b" after "ref" stops the patterns from matching longer tag names such
;; as <references />.
;; NOTE(review): step 5 also discards any text carried inside a template —
;; confirm that is acceptable for the fields of interest.
(def templates4
  (into {}
        (map (fn [[k v]]
               [k (-> v
                      (clojure.string/replace #"<ref\b[^>]*/>" "")
                      (clojure.string/replace #"<ref\b[^>]*>.*?</ref>" "")
                      (clojure.string/replace #"</?ref\b[^>]*>" "")
                      (clojure.string/replace #"<br\s*/?>" "")
                      (clojure.string/replace #"\{\{.*?\}\}" ""))]))
        templates3))
(doseq [[k v] templates4]
  (println (str k " -> " v)))
;; 29
;; Query the MediaWiki API for the URL of the file named in the "国旗画像"
;; field and decode the JSON response.
;; The file name is percent-encoded with URLEncoder, which turns spaces into
;; "+" and also escapes any other character that is not URL-safe.
(def wikidata
  (let [filename (get templates4 "国旗画像")
        url (str "https://en.wikipedia.org/w/api.php?action=query&prop=imageinfo&iiprop=url&titles=File%3A"
                 (java.net.URLEncoder/encode filename "UTF-8")
                 "&format=json")]
    (cheshire/decode (slurp url))))
;; The response keys its pages by page id. Take the first (only) page rather
;; than hard-coding an id, so the lookup keeps working for other files.
(-> wikidata
    (get-in ["query" "pages"])
    vals
    first
    (get-in ["imageinfo" 0 "url"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment