Created
February 13, 2020 12:29
-
-
Save Markismus/6f5b72d896073d81103a410dd2d2f7bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- PetitRobert2007 HTML cleanup | |
local Screen = require("device").screen | |
local logger=require("logger") | |
local util=require("util") | |
-- From http://lua-users.org/wiki/LuaXml | |
-- Pure lua XML parsing | |
--- Extract the attributes of a tag into a table.
-- Recognises `name="value"` and `name='value'` pairs, where a name is made of
-- alphanumerics and dashes and the value runs up to the matching quote.
-- @param s string: the raw text found between the tag name and the closing `>`
-- @return table mapping attribute name -> attribute value (empty if none found)
local function parseargs(s)
    local attributes = {}
    -- The second capture is the opening quote; `%2` requires the same quote to close.
    for name, _, value in string.gmatch(s, "([%-%w]+)=([\"'])(.-)%2") do
        attributes[name] = value
    end
    return attributes
end
--- Parse an HTML/XML-like string into a tree of element tables.
-- Each element is a table `{ label = <tag name>, xarg = <attribute table> }`
-- (plus `empty = 1` for self-closing tags) whose array part holds its children
-- in document order: plain strings for text, tables for nested elements.
-- The parser is lenient: a closing tag closes every open element up to and
-- including the nearest one with the same label, and elements still open at
-- the end of the input are closed implicitly.
-- @param s string: the markup to parse
-- @return table: the label-less root node whose array part is the top-level content
local function collect(s)
    local root = {}
    local stack = { root } -- currently open elements, innermost last
    local top = root       -- element that receives new children
    local pos = 1
    while true do
        -- s="toto</b toto='tutu'/>titi"
        -- print( string.find(s, "<(%/?)([%w:]+)(.-)(%/?)>", 1))
        -- 5 21 / b toto='tutu' /
        local tag_start, tag_end, closing, label, xarg, empty =
            string.find(s, "<(%/?)([%w:]+)(.-)(%/?)>", pos)
        if not tag_start then break end
        -- Text between the previous tag and this one; whitespace-only runs
        -- are kept on purpose.
        local text = string.sub(s, pos, tag_start - 1)
        if text ~= "" then
            table.insert(top, text)
        end
        -- <hr> never has content: treat it as self-closing however it is written.
        if label:lower() == "hr" then empty = "/" end
        if empty == "/" then -- empty element tag
            table.insert(top, { label = label, xarg = parseargs(xarg), empty = 1 })
        elseif closing == "" then -- start tag
            top = { label = label, xarg = parseargs(xarg) }
            table.insert(stack, top) -- new level
        else -- end tag
            -- Pop open elements until the matching one has been closed. The
            -- root (stack[1]) is never popped, so a stray end tag with
            -- nothing open is simply ignored.
            while #stack > 1 do
                local toclose = table.remove(stack)
                top = stack[#stack]
                table.insert(top, toclose)
                if toclose.label == label then break end
            end
        end
        pos = tag_end + 1
    end
    -- Trailing text after the last tag (whitespace kept as well).
    local tail = string.sub(s, pos)
    if tail ~= "" then
        table.insert(stack[#stack], tail)
    end
    if #stack > 1 then
        logger.warn("PetitRobert2017: stack at end has multiple elements:", #stack)
        -- Close the leftovers into their parents so the nesting survives,
        -- instead of handing back the raw stack where they would be siblings.
        while #stack > 1 do
            local unclosed = table.remove(stack)
            table.insert(stack[#stack], unclosed)
        end
    end
    return stack[1]
end
-- MuPDF expects XHTML. In XHTML, all tags and attributes should be lowercase

--- Rewrite an inline `style` value so that coloured text comes out black.
-- colored stuff is displayed at less readable light grey: get black
-- @param value string: the original style attribute value
-- @return string: the rewritten style value
local function blacken_style(value)
    -- Rename the background property first, so the `color: ...` rules below
    -- cannot match inside `background-color: ...`.
    value = value:gsub('background%-color:', 'zbgcol:')
    value = value:gsub('color: navy', 'color: black')
    value = value:gsub('color: red', 'color: black')
    -- main meaning in blue => bold black
    value = value:gsub('color: rgb%(51,51,153%); font%-style: normal; font%-weight: normal', 'color: black; font-style: normal; font-weight: bold')
    -- Any remaining hex or rgb() colour becomes black.
    value = value:gsub(" #%w%w%w%w%w%w", " black")
    value = value:gsub(" rgb%(.-%)", " black")
    return value
end

--- Turn a parsed element tree back into markup with lowercase tag and attribute names.
-- @param elem string|table: a text node, an element table (`label`, `xarg`,
--   optional `empty`, children in the array part) or a label-less container
-- @param colored boolean: when falsy, links and inline styles are forced to black
-- @return string: the serialized markup ("" for any other kind of value)
local function serialize(elem, colored)
    if type(elem) == "string" then
        return elem
    end
    if type(elem) ~= "table" then
        return ""
    end
    local s = {}
    local tag = elem.label and elem.label:lower()
    if tag then
        if elem.label == "DIV" then s[#s + 1] = "<div></div>" end -- fix A/I/1 div
        s[#s + 1] = "<"
        s[#s + 1] = tag
        -- NOTE(review): if the link also carries its own style attribute this
        -- yields two style attributes on the same tag - kept as in the original.
        if tag == "a" and not colored then
            s[#s + 1] = ' style="color: black"'
        end
        if type(elem.xarg) == "table" then
            for attr, value in pairs(elem.xarg) do
                local out = value
                if attr == "style" then
                    -- fix A/I/1 div
                    if elem.label == "DIV" and not out:find("border: solid") then
                        out = out .. "; display: inline"
                    end
                    if not colored then
                        out = blacken_style(out)
                    end
                end
                -- The value is re-emitted inside double quotes, so a literal
                -- double quote (possible when the source used single quotes)
                -- must be escaped to keep the markup well-formed.
                out = out:gsub('"', "&quot;")
                -- Lowercasing every name also gives MuPDF the lowercase
                -- `href` it needs to recognise a link.
                s[#s + 1] = string.format(' %s="%s"', attr:lower(), out)
            end
        end
        s[#s + 1] = elem.empty and "/>" or ">"
    end
    for _, subelem in ipairs(elem) do
        s[#s + 1] = serialize(subelem, colored)
    end
    if tag and not elem.empty then
        s[#s + 1] = string.format('</%s>', tag)
    end
    return table.concat(s)
end
--- Module entry point: clean up a dictionary entry's HTML for rendering.
-- Rewrites the `<c c="...">` / `<c>` / `</c>` colour tags as `<font>` tags,
-- parses the markup and serializes it again; colours are forced to black
-- unless the screen reports colour support.
-- @param html string: the raw entry markup
-- @return string: the cleaned-up markup
return function(html)
    html = html:gsub('<c c=\"', '<font color=\"')
    html = html:gsub('</c>', '</font>')
    html = html:gsub('<c>', '<font>')
    local colored = Screen:isColorEnabled()
    local tree = collect(html)
    html = serialize(tree, colored)
    -- logger.warn("SERIALIZED:", html)
    -- Debug dump of the last result. Best effort only: io.open returns nil
    -- when the path is not writable, and that must not break the lookup.
    local f = io.open("/tmp/outdict.html", "w")
    if f then
        f:write(html)
        f:close()
    end
    return html
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment