Last active
May 15, 2021 19:48
-
-
Save Castux/29fedb348e08bb495587e4433c95e6d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
local hp = require "gumbo"
local b64 = require "base64"

-- CSS class names used to locate parts of the saved page.
local banner = "_30q-"   -- element whose style attribute references the banner image
local docBody = "_39k5"  -- root element of the document body that gets converted
local title = "_4lmk"    -- element holding the document title
local bold = "_4yxo"     -- <span> rendered as **bold**
local italics = "_4yxp"  -- <span> rendered as //italics//
local emoji = "_47e3"    -- <span> whose children are rendered unchanged
local hidden = "_7oe"    -- <span> that is dropped from the output
local date = "_39g5"     -- element holding the "Month D, YYYY" date

-- DokuWiki namespace prefixed to every image reference ({{ namespace:file }}).
local namespace = "fbdoc"

-- Page identifier ("<iso-date>-<slug>") of the document currently being
-- converted; assigned by treatDocument and read when naming image files.
local pageUrl
-- DokuWiki heading markers, keyed by the HTML heading tag they replace.
local headingMarks = { H2 = "=====", H3 = "====", H4 = "===" }

-- Render every child of `node` into `output`, in document order.
local function treatChildren(node, output)
    for _, child in ipairs(node.childNodes) do
        treat(child, output)
    end
end

-- Render the children of `node` between the literal strings `open` and `close`.
local function wrapChildren(node, output, open, close)
    table.insert(output, open)
    treatChildren(node, output)
    table.insert(output, close)
end

-- Render each child of a list node, preceded by the DokuWiki item `marker`.
local function treatList(node, output, marker)
    for _, child in ipairs(node.childNodes) do
        table.insert(output, marker)
        treat(child, output)
    end
end

-- True when the node's class attribute contains `class` as a plain substring.
-- A node without a class attribute never matches (instead of raising).
local function hasClass(node, class)
    local classes = node.className
    return classes ~= nil and classes:find(class, 1, true) ~= nil
end

--- Convert one DOM node (and its subtree) to DokuWiki markup.
-- The generated fragments are appended to `output`; nothing is returned.
-- Nodes that are not recognised are reported on stdout and skipped.
-- @param node    DOM node exposing nodeName, childNodes, className, data
--                and getAttribute
-- @param output  array of string fragments, concatenated by the caller
function treat(node, output)
    local name = node.nodeName
    local heading = headingMarks[name]

    if name == "#text" then
        table.insert(output, node.data or "")
    elseif name == "DIV" or name == "LI" or name == "FIGURE" then
        -- Block-level containers end with a blank line.
        treatChildren(node, output)
        table.insert(output, "\n\n")
    elseif name == "SPAN" and hasClass(node, bold) then
        wrapChildren(node, output, "**", "**")
    elseif name == "SPAN" and hasClass(node, italics) then
        wrapChildren(node, output, "//", "//")
    elseif name == "A" then
        local href = node:getAttribute "href"
        if href then
            wrapChildren(node, output, "[[" .. href .. "|", "]]")
        else
            -- An anchor without a target cannot become a link: keep its text.
            treatChildren(node, output)
        end
    elseif name == "OL" then
        treatList(node, output, " - ")
    elseif name == "UL" then
        treatList(node, output, " * ")
    elseif name == "BR" then
        -- Two backslashes followed by a newline: DokuWiki forced line break.
        table.insert(output, "\\\\\n")
    elseif heading then
        wrapChildren(node, output, heading .. " ", " " .. heading .. "\n\n")
    elseif name == "IMG" then
        treatImage(node, output)
    elseif name == "SPAN" and hasClass(node, emoji) then
        treatChildren(node, output)
    elseif name == "SPAN" and hasClass(node, hidden) then
        -- Intentionally dropped from the output.
    else
        print("Unhandled node", name)
        print(node)
    end
end
--- Decode a base64 image data URI and write it under the "img/" directory.
-- The file extension is taken from the media subtype in the URI
-- ("data:image/png;base64,..." gives ".png").
-- @param data  string containing a "data:image/<format>;base64,<payload>" URI
-- @param path  file name without directory and without extension
-- @return the file name actually written (path plus "." plus format)
-- Raises an error when `data` is not such a URI or the file cannot be opened.
function saveImage(data, path)
    if type(data) ~= "string" then
        error("saveImage: expected a data URI string, got " .. type(data))
    end

    local format, payload = data:match("data:image/(%w+);base64,(.*)")
    if not format then
        error("saveImage: not a base64 image data URI: " .. data:sub(1, 40))
    end

    local blob = b64.decode(payload)
    local finalPath = path .. "." .. format

    -- assert surfaces the io.open error message instead of a nil-index failure.
    local fp = assert(io.open("img/" .. finalPath, "wb"))
    fp:write(blob)
    fp:close()

    return finalPath
end
-- Number of images saved so far for the current document; used to build
-- unique image file names.
local imgCount = 0

--- Save the image embedded in an <img> node and emit a DokuWiki reference.
-- The image is stored through saveImage under "<pageUrl>-<n>" and
-- "{{ <namespace>:<file> }}" is appended to `out`.
-- An <img> without a src attribute is reported on stdout and skipped, so it
-- neither aborts the conversion nor consumes an image number.
-- @param node  <img> DOM node
-- @param out   array of output string fragments
function treatImage(node, out)
    local data = node:getAttribute "src"
    if not data then
        print("Image without src attribute, skipped")
        return
    end

    imgCount = imgCount + 1
    local path = saveImage(data, pageUrl .. "-" .. imgCount)
    table.insert(out, "{{ " .. namespace .. ":" .. path .. " }}")
end
--- Save the image stored in the "--savepage-url-<number>" CSS variable.
-- Looks for "--savepage-url-<number>: url(<data>)" in `src` and hands the
-- text between the parentheses to saveImage.
-- @param number  identifier of the variable (inserted into a Lua pattern
--                verbatim, so it must not contain pattern magic characters)
-- @param src     page source to search
-- @param path    destination file name without extension
-- @return the saved file name, or nil when the variable is not present
function extractImage(number, src, path)
    -- The closing parenthesis is matched outside the capture so that it does
    -- not end up in the base64 payload.
    local data = src:match("%-%-savepage%-url%-" .. number .. ": url%((.-)%)")
    if not data then
        return nil
    end

    -- CSS allows the url() argument to be quoted; drop surrounding quotes.
    data = data:match("^[\"']?(.-)[\"']?$")

    return saveImage(data, path)
end
--- Save the page banner image, if the page has one.
-- The banner element's style attribute is expected to reference a
-- "--savepage-url-<ref>" variable whose value is looked up in `src`.
-- @param root  parsed document
-- @param src   page source containing the "--savepage-url-*" definitions
-- @return the saved banner file name, or nil when there is no banner
--         element, no style attribute, or no savepage reference in it
function getBanner(root, src)
    local elem = root:getElementsByClassName(banner)[1]
    if not elem then
        return
    end

    local style = elem:getAttribute "style"
    -- %w+ (not %w*) so that an empty reference is treated as "no banner".
    local ref = style and style:match("%-%-savepage%-url%-(%w+)")
    if not ref then
        return
    end

    return extractImage(ref, src, pageUrl .. "-banner")
end
-- English month names at indices 1..12. The loop below adds the reverse
-- mapping, so months["March"] == 3 as well as months[3] == "March".
local months = {
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
}
for index = 1, #months do
    months[months[index]] = index
end
-- Character entities undone before parsing.
-- NOTE(review): the apostrophe may have been written as either "&#39;" or
-- "&apos;" in the saved pages; both are accepted here - confirm.
local htmlEntities = {
    ["&quot;"] = '"',
    ["&amp;"] = "&",
    ["&lt;"] = "<",
    ["&gt;"] = ">",
    ["&#39;"] = "'",
    ["&apos;"] = "'",
}

-- Undo exactly one level of entity escaping. Everything is replaced in a
-- single pass, so "&amp;lt;" becomes "&lt;" and is not decoded a second time.
-- Entities missing from the table are left untouched.
local function unescapeHtml(text)
    return (text:gsub("&#?%w+;", htmlEntities))
end

-- Build the page-name slug from a title: lower case, only alphanumerics,
-- spaces turned into dashes. Double dashes and final dashes break dokuwiki,
-- so dash runs are collapsed and a trailing dash is removed.
local function slugify(text)
    local slug = text:lower():gsub("[^%w ]", ""):gsub(" ", "-")
    slug = slug:gsub("%-+", "-"):gsub("%-$", "")
    return slug
end

--- Convert one saved page to a DokuWiki text file.
-- Reads `path`, extracts the document between the savepage srcdoc markers,
-- and writes "pages/<YYYY-MM-DD>-<slug>.txt". Images are written through
-- saveImage while the body is rendered. Sets the module-level `pageUrl` and
-- resets `imgCount`.
-- @param path  path of the saved HTML file
-- Raises an error when the file cannot be read, an expected element is
-- missing, the date cannot be parsed, or the output cannot be written.
function treatDocument(path)
    imgCount = 0

    -- Read the whole file and release the handle immediately.
    local input = assert(io.open(path, "r"))
    local src = input:read "a"
    input:close()

    src = src:match('<!%-%-savepage%-srcdoc%-begin%-%->(.*)<!%-%-savepage%-srcdoc%-end%-%->')
    if not src then
        error(path .. ": savepage srcdoc markers not found")
    end
    src = unescapeHtml(src)

    local body = src:match('<body.*</body>')
    if not body then
        error(path .. ": no <body> element in the embedded document")
    end

    local root, err = hp.parse(body)
    if not root then
        error(err)
    end

    local out = {}

    -- Get title
    local titleElem = root:getElementsByClassName(title)[1]
    if not titleElem then
        error(path .. ": title element not found")
    end
    local titleText = titleElem.textContent
    table.insert(out, "====== " .. titleText .. " ======\n\n")

    local url = slugify(titleText)

    -- Get date
    local dateElem = root:getElementsByClassName(date)[1]
    if not dateElem then
        error(path .. ": date element not found")
    end
    local dateText = dateElem.textContent
    local month, day, year = dateText:match("(%w+) (%d+), (%d+)")
    local monthNumber = month and months[month]
    if not monthNumber then
        error(path .. ": unrecognised date '" .. tostring(dateText) .. "'")
    end
    local isoDate = string.format("%04d-%02d-%02d", tonumber(year), monthNumber, tonumber(day))

    -- Final URL
    pageUrl = isoDate .. "-" .. url
    print(" " .. pageUrl)

    -- Banner
    local bannerPath = getBanner(root, src)
    if bannerPath then
        table.insert(out, "{{ " .. namespace .. ":" .. bannerPath .. " }}\n\n")
    end

    -- Body
    local mainDoc = root:getElementsByClassName(docBody)[1]
    if not mainDoc then
        error(path .. ": document body element not found")
    end
    treat(mainDoc, out)

    local fp = assert(io.open("pages/" .. pageUrl .. ".txt", "w"))
    fp:write(table.concat(out))
    fp:close()
end
-- Entry point: convert every file named on the command line, in order,
-- echoing its position and path before processing it.
do
    local paths = {...}
    for index, path in ipairs(paths) do
        print(index, path)
        treatDocument(path)
    end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment