Quick & dirty conversion of exported WordPress posts into Markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns word-parsos.core | |
(:gen-class) | |
(:require [clojure.xml] | |
[clojure.zip :as zip] | |
[clojure.data.zip.xml :as zip-xml] | |
[clj-time.core :as t] | |
[clj-time.format :as f] | |
[cuerdas.core :as str] | |
[clojure.pprint :as pprint] | |
[clj-commons-exec :as exec]) | |
(:import (java.io.ByteArrayInputStream) | |
(java.util Locale) | |
(java.io StringWriter))) | |
;; File-system locations used by the conversion. The two output values are
;; plain string prefixes that get concatenated with a file name, which is why
;; they end with a slash.

(def xml-file
  "Path of the exported posts XML file that `-main` parses."
  "dev-resources/wordpress.2015-09-23_posts.xml")

(def html-out
  "Prefix (directory) under which `write-html` stores the intermediate .html files."
  "dev-resources/out/")

(def asc-out
  "Prefix (directory) under which `-main` places the converted .md files."
  "dev-resources/md-out/posts/")
(defn parse-xml
  "Reads `s` (anything `slurp` accepts, e.g. a file path) completely into
  memory and parses it with `clojure.xml/parse`.

  Returns the parsed element tree (nested maps with :tag, :attrs, :content).

  The text is decoded and re-encoded explicitly as UTF-8. Relying on the
  platform default charset for `.getBytes` corrupts non-ASCII characters on
  JVMs whose default is not UTF-8, because `slurp` itself decodes as UTF-8."
  [s]
  (let [xml-text  (slurp s :encoding "UTF-8")
        xml-bytes (.getBytes ^String xml-text "UTF-8")]
    (clojure.xml/parse (java.io.ByteArrayInputStream. xml-bytes))))
(def date-formatter
  "clj-time's built-in :rfc822 formatter bound to the \"en\" locale; used by
  `parse-date`. NOTE(review): presumably the locale is pinned so that day and
  month names parse as English regardless of the JVM default - confirm."
  (-> (f/formatters :rfc822)
      (f/with-locale (Locale. "en"))))

(def fname-formatter
  "Formatter for the yyyy-MM-dd date prefix of generated file names; used by
  `create-filename`."
  (f/formatter "yyyy-MM-dd"))
(defn parse-date
  "Parses `date-str` with `date-formatter` and returns whatever clj-time's
  `f/parse` yields for it."
  [date-str]
  (->> date-str
       (f/parse date-formatter)))
(defn title-as-fname
  "Turns a post `title` into the title part of a file name by delegating to
  cuerdas' `slugify` (`str` is the cuerdas alias in this namespace)."
  [title]
  (str/slugify title))
(defn create-filename
  "Builds the extension-less base file name for a post:
  `date` rendered with `fname-formatter`, a dash, then the slugified `title`."
  [date title]
  (let [day-part   (f/unparse fname-formatter date)
        title-part (title-as-fname title)]
    (str day-part "-" title-part)))
(defn write-html
  "Writes the :content of `item` to `<html-out><:filename>.html` and returns
  the path of the written file as a string.

  Missing parent directories are created first; `spit` alone throws a
  FileNotFoundException when the output directory does not exist yet."
  [item]
  (let [fname  (str html-out (:filename item) ".html")
        parent (.getParentFile (java.io.File. ^String fname))]
    ;; parent is nil for a bare file name without a directory component
    (when parent
      (.mkdirs ^java.io.File parent))
    (spit fname (:content item))
    fname))
(defn- sh-quote
  "Wraps `s` in single quotes for use inside a POSIX `sh -c` command line,
  escaping any single quote contained in it."
  [s]
  (str "'" (.replace ^String (str s) "'" "'\\''") "'"))

(defn convert-to-md
  "Converts `html-file` to Markdown at `out-file` by running a shell pipeline:
  two `sed` passes rewrite the absolute wp-content URLs of the old sites to
  /img, then `pandoc` converts HTML to markdown_github.

  Prints the command result map and returns `out-file`.

  Both paths are shell-quoted before being spliced into the command string,
  so names containing spaces or shell metacharacters are neither split nor
  interpreted. The parent directory of `out-file` is created if missing,
  because the shell redirect would otherwise fail."
  [html-file out-file]
  (println "convert " html-file " to " out-file)
  (when-let [parent (.getParentFile (java.io.File. ^String (str out-file)))]
    (.mkdirs ^java.io.File parent))
  (let [command (str "cat " (sh-quote html-file) " | "
                     "sed -e 's/http:\\/\\/v2.agynamix.de\\/wp-content/\\/img/g' | "
                     "sed -e 's/http:\\/\\/www.simidude.com\\/wp-content/\\/img/g' | "
                     "pandoc -f html -t markdown_github" " > " (sh-quote out-file))
        ;; exec/sh is dereferenced here, so this blocks until the pipeline finishes
        re      @(exec/sh ["sh" "-c" command])]
    (println "Result " (pr-str re)))
  out-file)
(defn add-header-info
  "Prepends an EDN metadata map (title, layout, banner, hide-disqus? and
  tags) followed by a blank line to the file `filename`, rewriting the file
  in place.

  `item` supplies :title and :category. Both are rendered with `pr-str`
  instead of raw string concatenation, so a title containing double quotes or
  backslashes still yields a readable EDN string, and a missing or lazy
  :category is emitted as a vector literal."
  [filename item]
  (let [title   (pr-str (str (:title item)))      ; nil title -> \"\" as before
        tags    (pr-str (vec (:category item)))   ; nil -> [], seq -> vector
        pre     (str "{\n"
                     ":title " title "\n"
                     ":layout :post\n"
                     ":banner \"/img/home-bg.jpg\"\n"
                     ":hide-disqus? true\n"
                     ":tags " tags "\n"
                     "}\n\n")
        content (slurp filename)]
    (spit filename (str pre content))))
(defn item->map
  "Extracts the fields needed for conversion from one <item> zipper location.

  Returns a map with:
    :title    - text of the <title> child
    :date     - <pubDate> text run through `parse-date`
    :category - vector of the `nicename` attributes of all <category> children
    :content  - text of the <content:encoded> child
    :filename - base file name derived from :date and :title"
  [item]
  (let [title      (zip-xml/xml1-> item :title zip-xml/text)
        date       (parse-date (zip-xml/xml1-> item :pubDate zip-xml/text))
        categories (vec (zip-xml/xml-> item :category (zip-xml/attr :nicename)))
        ;; the tag name contains a colon, hence the explicit `keyword` call
        content    (zip-xml/xml1-> item (keyword "content:encoded") zip-xml/text)]
    {:title    title
     :date     date
     :category categories
     :content  content
     :filename (create-filename date title)}))
(defn -main
  "Entry point: parses the export file, then for every <item> under <channel>
  writes the HTML, converts it to Markdown and prepends the metadata header.

  Accepts command-line arguments so that launching the program with
  arguments no longer fails with an ArityException. The optional first
  argument is the path of the export file; without it `xml-file` is used,
  which is the previous behaviour."
  [& args]
  (let [input (or (first args) xml-file)
        xml   (parse-xml input)
        root  (zip/xml-zip xml)
        items (mapv item->map (zip-xml/xml-> root :channel :item))]
    (println "Converting " (count items) " posts.")
    (doseq [item items]
      (let [html-file (write-html item)
            md-file   (convert-to-md html-file (str asc-out (:filename item) ".md"))]
        (add-header-info md-file item)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment