Skip to content

Instantly share code, notes, and snippets.

@Castux
Last active May 15, 2021 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Castux/29fedb348e08bb495587e4433c95e6d2 to your computer and use it in GitHub Desktop.
Save Castux/29fedb348e08bb495587e4433c95e6d2 to your computer and use it in GitHub Desktop.
local hp = require "gumbo"
local b64 = require "base64"
-- Class names used to locate elements in the parsed page.
-- Element whose style attribute references the banner image (read by getBanner).
local banner = "_30q-"
-- Element holding the main document content; treatDocument passes it to treat.
local docBody = "_39k5"
-- Element whose text content is used as the page title.
local title = "_4lmk"
-- SPAN class rendered by treat as **...**.
local bold = "_4yxo"
-- SPAN class rendered by treat as //...//.
local italics = "_4yxp"
-- SPAN class whose children treat emits without extra markup.
local emoji = "_47e3"
-- SPAN class that treat skips entirely.
local hidden = "_7oe"
-- Element whose text content is parsed as "Month D, YYYY" by treatDocument.
local date = "_39g5"
-- Prefix placed before image paths in the emitted {{ namespace:path }} links.
local namespace = "fbdoc"
-- Base name of the page being converted; assigned by treatDocument and read
-- by treatImage and getBanner when naming image files.
local pageUrl
-- Heading markers keyed by HTML heading tag name.
local headingMarkup = {
    H2 = "=====",
    H3 = "====",
    H4 = "===",
}

-- Prefix emitted before every child of a list element.
local listPrefix = {
    OL = " - ",
    UL = " * ",
}

-- Elements whose converted children are followed by a blank line.
local blockTags = {
    DIV = true,
    LI = true,
    FIGURE = true,
}

-- Convert every child of `node` in order, appending to `output`.
local function treatChildren(node, output)
    for _, child in ipairs(node.childNodes) do
        treat(child, output)
    end
end

-- Emit `opening`, the converted children of `node`, then `closing`.
local function wrapChildren(node, output, opening, closing)
    table.insert(output, opening)
    treatChildren(node, output)
    table.insert(output, closing)
end

-- True when the node's class string contains `class` as a literal substring.
-- A node that reports no className is treated as having an empty one, so a
-- class-less SPAN is reported as unhandled instead of raising an error.
local function hasClass(node, class)
    local classes = node.className or ""
    return classes:find(class, 1, true) ~= nil
end

-- Log a node that no branch of `treat` knows how to convert.
local function reportUnhandled(node)
    print("Unhandled node", node.nodeName)
    print(node)
end

--- Recursively convert a DOM node to wiki markup.
-- Text fragments are appended to `output`; nothing is returned. Nodes that
-- are not recognised are printed and otherwise ignored.
-- @param node    DOM node exposing nodeName, and depending on its kind
--                data, childNodes, className and getAttribute
-- @param output  array of strings that receives the generated markup
function treat(node, output)
    local name = node.nodeName

    if name == "#text" then
        table.insert(output, node.data or "")
    elseif blockTags[name] then
        wrapChildren(node, output, "", "\n\n")
    elseif name == "SPAN" then
        -- The order of these checks decides which class wins when a span
        -- carries several of them.
        if hasClass(node, bold) then
            wrapChildren(node, output, "**", "**")
        elseif hasClass(node, italics) then
            wrapChildren(node, output, "//", "//")
        elseif hasClass(node, emoji) then
            treatChildren(node, output)
        elseif hasClass(node, hidden) then
            -- nothing!
        else
            reportUnhandled(node)
        end
    elseif name == "A" then
        local href = node:getAttribute("href")
        if href then
            wrapChildren(node, output, "[[" .. href .. "|", "]]")
        else
            -- An anchor without a target cannot become a link; keep its text.
            treatChildren(node, output)
        end
    elseif listPrefix[name] then
        local prefix = listPrefix[name]
        for _, child in ipairs(node.childNodes) do
            table.insert(output, prefix)
            treat(child, output)
        end
    elseif name == "BR" then
        table.insert(output, "\\\\\n")
    elseif headingMarkup[name] then
        local marks = headingMarkup[name]
        wrapChildren(node, output, marks .. " ", " " .. marks .. "\n\n")
    elseif name == "IMG" then
        treatImage(node, output)
    else
        reportUnhandled(node)
    end
end
--- Decode a base64 image data URI and write it below the img/ directory.
-- The file extension is taken from the image subtype of the URI; any
-- "+suffix" on the subtype (as in "svg+xml") is dropped.
-- @param data  string containing "data:image/<subtype>;base64,<payload>"
-- @param path  target file name without extension, relative to img/
-- @return the file name with its extension appended (relative to img/)
-- Raises an error when `data` is not such a URI or the file cannot be opened.
function saveImage(data, path)
    if type(data) ~= "string" then
        error("saveImage: expected an image data URI for '" .. tostring(path)
            .. "', got " .. type(data))
    end

    local format, payload = data:match("data:image/([%w%-%.]+)[^;,]*;base64,(.*)")
    if not format then
        error("saveImage: '" .. tostring(path) .. "' is not a base64 image data URI")
    end

    local blob = b64.decode(payload)
    local finalPath = path .. "." .. format

    -- assert turns a failed open into an error carrying the system message.
    local fp = assert(io.open("img/" .. finalPath, "wb"))
    fp:write(blob)
    fp:close()

    return finalPath
end
-- Number of inline images saved so far; used to give each image file of a
-- page a distinct numeric suffix.
local imgCount = 0

--- Save the image referenced by an IMG node and append a media link to it.
-- @param node  IMG node whose src attribute is handed to saveImage
-- @param out   array of strings that receives the generated markup
function treatImage(node, out)
    imgCount = imgCount + 1

    local source = node:getAttribute("src")
    local baseName = pageUrl .. "-" .. imgCount
    local savedAs = saveImage(source, baseName)

    out[#out + 1] = "{{ " .. namespace .. ":" .. savedAs .. " }}"
end
--- Save the image stored in a "--savepage-url-<number>: url(...)" definition.
-- @param number  identifier of the definition to look up in `src`
-- @param src     text searched for the definition
-- @param path    target file name without extension, passed to saveImage
-- @return the value returned by saveImage, or nil plus a message when
--         `number` is missing or no matching definition exists in `src`
function extractImage(number, src, path)
    if number == nil or number == "" then
        return nil, "extractImage: no resource number given"
    end

    -- Capture only what lies between "url(" and the closing parenthesis, so
    -- the parenthesis itself is not passed on as part of the image data.
    local data = src:match("%-%-savepage%-url%-" .. number .. ": url%((.-)%)")
    if not data then
        return nil, "extractImage: no --savepage-url-" .. number .. " definition found"
    end

    return saveImage(data, path)
end
--- Save the banner image of the page, if it has one.
-- Looks up the first element with the banner class, reads the
-- "--savepage-url-<ref>" reference from its style attribute and hands that
-- reference to extractImage.
-- @param root  parsed document exposing getElementsByClassName
-- @param src   text in which extractImage looks up the referenced image
-- @return whatever extractImage returns, or nothing when there is no banner
--         element, no style attribute, or no reference in the style
function getBanner(root, src)
    local elem = root:getElementsByClassName(banner)[1]
    if not elem then
        return
    end

    local style = elem:getAttribute("style")
    -- %w+ rather than %w*: an empty reference cannot identify an image.
    local ref = style and style:match("%-%-savepage%-url%-(%w+)")
    if not ref then
        return
    end

    return extractImage(ref, src, pageUrl .. "-banner")
end
-- English month names in calendar order. The loop below adds the reverse
-- mapping to the same table, so months[3] == "March" and months.March == 3.
local months = {
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
}
for index = 1, #months do
    months[months[index]] = index
end
-- Named character references decoded from the embedded document.
local savedPageEntities = {
    quot = '"',
    amp = "&",
    lt = "<",
    gt = ">",
    apos = "'",
}

--- Convert one saved page to a wiki text file under pages/.
-- Reads `path`, extracts the section between the savepage-srcdoc markers,
-- parses its body, derives the page name from the title and date elements,
-- saves the banner and inline images, and writes the generated markup to
-- pages/<date>-<title-slug>.txt. Sets the file-level `pageUrl` and resets
-- `imgCount` as a side effect.
-- @param path  path of the saved HTML file to convert
-- Raises an error when a file cannot be opened, the document cannot be
-- parsed, or an expected element is missing.
function treatDocument(path)
    imgCount = 0

    -- Read the whole file and release the handle straight away.
    local input = assert(io.open(path, "r"))
    local src = input:read "a"
    input:close()

    src = src:match('<!%-%-savepage%-srcdoc%-begin%-%->(.*)<!%-%-savepage%-srcdoc%-end%-%->')
    if not src then
        error(path .. ": no savepage srcdoc section found")
    end

    -- Decode in a single pass so that text produced by one replacement is
    -- never decoded a second time (e.g. "&amp;lt;" must become "&lt;").
    src = (src:gsub("&(%a+);", savedPageEntities))

    local body = src:match('<body.*</body>')
    if not body then
        error(path .. ": no <body> element found in the embedded document")
    end

    local root, err = hp.parse(body)
    if not root then
        error(err)
    end

    local out = {}

    -- Get title
    local titleElem = root:getElementsByClassName(title)[1]
    if not titleElem then
        error(path .. ": title element not found")
    end
    local pageTitle = titleElem.textContent
    table.insert(out, "====== " .. pageTitle .. " ======\n\n")

    -- Double dashes and final dashes break dokuwiki for some reason
    local url = pageTitle:lower():gsub("[^%w ]", ""):gsub(" ", "-")
    url = url:gsub("%-+", "-"):gsub("%-$", "")

    -- Get date
    local dateElem = root:getElementsByClassName(date)[1]
    if not dateElem then
        error(path .. ": date element not found")
    end
    local dateText = dateElem.textContent
    local month, day, year = dateText:match("(%w+) (%d+), (%d+)")
    local monthNumber = months[month]
    if not monthNumber then
        error(path .. ": unrecognised date '" .. tostring(dateText) .. "'")
    end
    local isoDate = string.format("%04d-%02d-%02d", tonumber(year), monthNumber, tonumber(day))

    -- Final URL
    pageUrl = isoDate .. "-" .. url
    print(" " .. pageUrl)

    -- Banner
    local bannerPath = getBanner(root, src)
    if bannerPath then
        table.insert(out, "{{ " .. namespace .. ":" .. bannerPath .. " }}\n\n")
    end

    -- Body
    local mainDoc = root:getElementsByClassName(docBody)[1]
    if not mainDoc then
        error(path .. ": main document element not found")
    end
    treat(mainDoc, out)

    local fp = assert(io.open("pages/" .. pageUrl .. ".txt", "w"))
    fp:write(table.concat(out))
    fp:close()
end
-- Script entry point: convert each file named on the command line, in the
-- order given, echoing its position and path first.
do
    for position, file in ipairs({...}) do
        print(position, file)
        treatDocument(file)
    end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment