Skip to content

Instantly share code, notes, and snippets.

@tuhlmann
Created October 4, 2015 18:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tuhlmann/d9f1e3237eb8f692eb71 to your computer and use it in GitHub Desktop.
Save tuhlmann/d9f1e3237eb8f692eb71 to your computer and use it in GitHub Desktop.
Quick & dirty conversion of exported WordPress posts into Markdown
(ns word-parsos.core
(:gen-class)
(:require [clojure.xml]
[clojure.zip :as zip]
[clojure.data.zip.xml :as zip-xml]
[clj-time.core :as t]
[clj-time.format :as f]
[cuerdas.core :as str]
[clojure.pprint :as pprint]
[clj-commons-exec :as exec])
(:import (java.io.ByteArrayInputStream)
(java.util Locale)
(java.io StringWriter)))
;; Path of the WordPress export that -main hands to parse-xml.
(def xml-file "dev-resources/wordpress.2015-09-23_posts.xml")
;; Directory prefix (note the trailing slash) that write-html prepends to each
;; post's file name when writing the intermediate ".html" file.
(def html-out "dev-resources/out/")
;; Directory prefix (note the trailing slash) that -main prepends to each
;; post's file name when naming the generated ".md" file.
(def asc-out "dev-resources/md-out/posts/")
(defn parse-xml
  "Reads the XML document at `s` (anything `slurp` accepts, e.g. a file path)
  and returns the element tree produced by `clojure.xml/parse`.

  `slurp` decodes the source as UTF-8 by default, so the text is turned back
  into bytes with the same charset. Using the platform default charset here
  would corrupt non-ASCII characters on JVMs whose default is not UTF-8.
  NOTE(review): this assumes the document is UTF-8 encoded and declares
  itself as such - confirm for the export being converted."
  [s]
  (let [xml-text (slurp s)
        xml-bytes (.getBytes ^String xml-text "UTF-8")]
    (clojure.xml/parse (java.io.ByteArrayInputStream. xml-bytes))))
;; clj-time's built-in :rfc822 formatter with an "en" locale attached; used by
;; parse-date to parse the text of an item's pubDate element.
(def date-formatter (f/with-locale (f/formatters :rfc822) (Locale. "en")))
;; Renders a date as "yyyy-MM-dd"; create-filename uses it for the date prefix
;; of each output file name.
(def fname-formatter (f/formatter "yyyy-MM-dd"))
(defn parse-date
  "Parses `date-str` with `date-formatter` and returns the resulting
  clj-time value."
  [date-str]
  (->> date-str
       (f/parse date-formatter)))
(defn title-as-fname
  "Turns a post title into a file-name fragment by running it through
  cuerdas' `slugify`."
  [title]
  (str/slugify title))
(defn create-filename
  "Builds the extension-less base file name for a post: the date formatted
  with `fname-formatter`, a dash, then the slugified title."
  [date title]
  (let [day-part (f/unparse fname-formatter date)
        slug-part (title-as-fname title)]
    (str day-part "-" slug-part)))
(defn write-html
  "Writes the item's :content to `<html-out><:filename>.html` and returns the
  path of the file that was written."
  [item]
  ;; doto spits the content into the path and then yields the path itself.
  (doto (str html-out (get item :filename) ".html")
    (spit (get item :content))))
(defn convert-to-md
  "Converts `html-file` to GitHub-flavoured Markdown in `out-file` by piping
  it through sed (to rewrite the two old wp-content URL prefixes to /img) and
  pandoc. Prints the result map of the shell call and returns `out-file`.

  The two paths are handed to the shell as positional parameters ($1 and $2)
  instead of being spliced into the script text, so file names containing
  spaces or shell metacharacters can neither break nor inject commands."
  [html-file out-file]
  (println "convert " html-file " to " out-file)
  (let [script (str "sed"
                    ;; '|' as the sed delimiter avoids escaping every slash;
                    ;; the dots are escaped so they only match a literal '.'.
                    " -e 's|http://v2\\.agynamix\\.de/wp-content|/img|g'"
                    " -e 's|http://www\\.simidude\\.com/wp-content|/img|g'"
                    " \"$1\""
                    " | pandoc -f html -t markdown_github > \"$2\"")
        ;; The argument after the script becomes $0; the paths follow as $1, $2.
        re @(exec/sh ["sh" "-c" script "sh" html-file out-file])]
    (println "Result " (pr-str re)))
  out-file)
(defn add-header-info
  "Prepends a metadata map (title, layout, banner, disqus flag, tags) to the
  file at `filename`, rewriting the file in place. Returns nil.

  `item` is a map with :title (string) and :category (collection of tag
  strings). The title is emitted with `pr-str` so embedded double quotes and
  backslashes are escaped and the header stays a readable map; titles without
  such characters produce exactly the same output as before."
  [filename item]
  (let [title (pr-str (str (:title item)))
        ;; vec + pr-str prints any collection (or nil) as a readable vector.
        tags (pr-str (vec (:category item)))
        pre (str "{\n"
                 ":title " title "\n"
                 ":layout :post\n"
                 ":banner \"/img/home-bg.jpg\"\n"
                 ":hide-disqus? true\n"
                 ":tags " tags "\n"
                 "}\n\n")
        content (slurp filename)]
    (spit filename (str pre content))))
(defn item->map
  "Converts one `item` zipper location of the export into a map with the keys
  :title, :date (parsed pubDate), :category (vector of the nicename
  attributes of the category children), :content (text of content:encoded)
  and :filename (derived from date and title)."
  [item]
  (let [text-of (fn [tag] (zip-xml/xml1-> item tag zip-xml/text))
        title (text-of :title)
        date (parse-date (text-of :pubDate))]
    {:title title
     :date date
     :category (vec (zip-xml/xml-> item :category (zip-xml/attr :nicename)))
     ;; The tag contains a colon, so the keyword is built from a string.
     :content (text-of (keyword "content:encoded"))
     :filename (create-filename date title)}))
(defn -main
  "Entry point: parses `xml-file`, converts every channel item to a map,
  writes each post as HTML, converts it to Markdown and prepends the metadata
  header to the Markdown file.

  Accepts (and ignores) any command-line arguments so that launching the
  program with arguments does not fail with an arity error."
  [& _args]
  ;; Neither spit nor the shell redirection creates missing directories, so
  ;; make sure both output locations exist before any file is written.
  (doseq [dir [html-out asc-out]]
    (.mkdirs (java.io.File. ^String dir)))
  (let [xml (parse-xml xml-file)
        root (zip/xml-zip xml)
        items (mapv item->map (zip-xml/xml-> root :channel :item))]
    (println "Converting " (count items) " posts.")
    (doseq [item items]
      (let [html-file (write-html item)
            md-file (convert-to-md html-file (str asc-out (:filename item) ".md"))]
        (add-header-info md-file item)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment