Skip to content

Instantly share code, notes, and snippets.

@Markismus
Created February 13, 2020 12:29
Show Gist options
  • Save Markismus/6f5b72d896073d81103a410dd2d2f7bd to your computer and use it in GitHub Desktop.
Save Markismus/6f5b72d896073d81103a410dd2d2f7bd to your computer and use it in GitHub Desktop.
-- PetitRobert2007 HTML cleanup
local Screen = require("device").screen
local logger=require("logger")
local util=require("util")
-- From http://lua-users.org/wiki/LuaXml
-- Pure lua XML parsing
--- Extract the quoted attributes of a tag into a name -> value map.
-- Only attributes written as name="value" or name='value' are picked up;
-- the value must be closed by the same quote character that opened it.
-- @param s the raw attribute portion of a tag
-- @return a table mapping each attribute name to its (unquoted) value
local function parseargs(s)
    local attrs = {}
    -- Captures: attribute name, opening quote (unused), value.
    for name, _, value in string.gmatch(s, "([%-%w]+)=([\"'])(.-)%2") do
        attrs[name] = value
    end
    return attrs
end
--- Parse a markup string into a tree of nested Lua tables.
-- Each node is a table whose array part holds its children in document
-- order (plain strings for text, tables for elements). Element nodes carry
-- `label` (tag name, original case), `xarg` (attribute map from parseargs)
-- and, for empty elements, `empty = 1`. The root node has no `label`.
-- Text is kept verbatim, including whitespace-only runs.
-- @param s the markup to parse
-- @return the root node; elements still open at the end of input are closed
--         implicitly and attached to their parents
local function collect(s)
    local root = {}
    local stack = { root }
    local top = root -- invariant: top == stack[#stack]
    local i = 1
    while true do
        -- Captures: "/" for an end tag, tag name, raw attributes,
        -- "/" for a self-closing tag.
        local ni, j, c, label, xarg, empty = string.find(s, "<(%/?)([%w:]+)(.-)(%/?)>", i)
        if not ni then break end
        -- Text between the previous tag and this one.
        local text = string.sub(s, i, ni - 1)
        if text ~= "" then
            table.insert(top, text)
        end
        -- <hr> is always treated as an empty element, however it is written.
        if label:lower() == "hr" then empty = "/" end
        if empty == "/" then -- empty element tag
            table.insert(top, { label = label, xarg = parseargs(xarg), empty = 1 })
        elseif c == "" then -- start tag: open a new level
            top = { label = label, xarg = parseargs(xarg) }
            table.insert(stack, top)
        else -- end tag
            -- Close open elements from the innermost outwards until the one
            -- matching this end tag has been closed. Non-matching elements
            -- met on the way are closed implicitly. The root is never popped.
            while #stack > 1 do
                local toclose = table.remove(stack)
                top = stack[#stack]
                table.insert(top, toclose)
                if toclose.label == label then break end
            end
        end
        i = j + 1
    end
    -- Trailing text after the last tag.
    local text = string.sub(s, i)
    if text ~= "" then
        table.insert(top, text)
    end
    if #stack > 1 then
        logger.warn("PetitRobert2017: stack at end has multiple elements:", #stack)
        -- Attach every still-open element to its parent so the nesting is
        -- preserved and the caller always receives a single root node.
        while #stack > 1 do
            local unclosed = table.remove(stack)
            table.insert(stack[#stack], unclosed)
        end
    end
    return stack[1]
end
-- MuPDF expects XHTML. In XHTML, all tags and attributes should be lowercase
--- Serialize a node tree back into a markup string with lowercase tag and
-- attribute names.
-- @param elem    a string (returned verbatim) or a node table: its array part
--                holds the children; optional fields are `label` (tag name),
--                `xarg` (attribute name -> value map) and `empty` (truthy
--                for a self-closing element)
-- @param colored when falsy, colours in `style` attributes are rewritten to
--                black and anchors get an explicit black colour
-- @return the serialized markup ("" when elem is neither string nor table)
local function serialize(elem, colored)
    if type(elem) == "string" then
        return elem
    end
    if type(elem) ~= "table" then
        return ""
    end
    local s = {}
    local label = elem.label
    local tag = label and label:lower()
    if label then
        if label == "DIV" then s[#s + 1] = "<div></div>" end -- fix A/I/1 div
        s[#s + 1] = "<"
        s[#s + 1] = tag
        if tag == "a" and not colored then
            -- NOTE(review): if the anchor also has its own style attribute,
            -- two style attributes are emitted -- confirm how the renderer
            -- resolves that before merging them.
            s[#s + 1] = ' style="color: black"'
        end
        if type(elem.xarg) == "table" then
            -- pairs() order is unspecified; sort the names so the same input
            -- always produces the same output.
            local names = {}
            for attr in pairs(elem.xarg) do
                names[#names + 1] = attr
            end
            table.sort(names)
            for _, attr in ipairs(names) do
                local value = elem.xarg[attr]
                if label == "DIV" and attr == "style" then -- fix A/I/1 div
                    if not value:find("border: solid") then
                        value = value .. "; display: inline"
                    end
                end
                if attr == "style" and not colored then
                    -- colored stuff is displayed at less readable light grey: get black
                    value = value:gsub('background%-color:', 'zbgcol:')
                    value = value:gsub('color: navy', 'color: black')
                    value = value:gsub('color: red', 'color: black')
                    -- main meaning in blue => bold black
                    value = value:gsub('color: rgb%(51,51,153%); font%-style: normal; font%-weight: normal', 'color: black; font-style: normal; font-weight: bold')
                    value = value:gsub(" #%w%w%w%w%w%w", " black")
                    value = value:gsub(" rgb%(.-%)", " black")
                end
                -- The value is wrapped in double quotes below, so any double
                -- quote it contains (possible when the source attribute was
                -- single-quoted) must be escaped.
                value = value:gsub('"', "&quot;")
                -- Attribute names are always lowercased; this also yields the
                -- lowercase href the original comment says MuPDF needs for links.
                s[#s + 1] = string.format(' %s="%s"', attr:lower(), value)
            end
        end
        if elem.empty then
            s[#s + 1] = "/>"
        else
            s[#s + 1] = ">"
        end
    end
    for _, subelem in ipairs(elem) do
        s[#s + 1] = serialize(subelem, colored)
    end
    if label and not elem.empty then
        s[#s + 1] = string.format('</%s>', tag)
    end
    return table.concat(s)
end
--- Module entry point: clean up one dictionary HTML fragment.
-- Rewrites the <c> colour tags to <font>, parses the result with collect()
-- and re-serializes it with serialize(), passing Screen:isColorEnabled() as
-- the `colored` flag. A copy of the output is also written to
-- /tmp/outdict.html.
-- @param html the HTML string to clean up
-- @return the cleaned-up HTML string
return function(html)
    html = html:gsub('<c c=\"', '<font color=\"')
    html = html:gsub('</c>', '</font>')
    html = html:gsub('<c>', '<font>')
    local colored = Screen:isColorEnabled()
    local stack = collect(html)
    html = serialize(stack, colored)
    -- logger.warn("SERIALIZED:", html)
    -- The dump is best-effort: failing to open the file (for example when
    -- /tmp is missing or read-only) must not break the lookup. The handle is
    -- kept local so it does not leak into the global environment.
    local f, err = io.open("/tmp/outdict.html", "w")
    if f then
        f:write(html)
        f:close()
    else
        logger.warn("PetitRobert2017: cannot write /tmp/outdict.html:", err)
    end
    return html
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment