Skip to content

Instantly share code, notes, and snippets.

@exebetche
Created July 31, 2013 22:00
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save exebetche/6126573 to your computer and use it in GitHub Desktop.
Save exebetche/6126573 to your computer and use it in GitHub Desktop.
LUA HTML parser
-- LUA HTML parser
local empty_tags = {
br = true,
hr = true,
img = true,
embed = true,
param = true,
area = true,
col = true,
input = true,
meta = true,
link = true,
base = true,
basefont = true,
frame = true,
isindex = true
}
-- omittable tags siblings
-- if an open tag from the primary entry follow
-- an unclosed tag of the secondary,
-- the secondary is automatically closed
-- See http://www.w3.org/TR/html5/syntax.html#optional-tags
local omittable_tags = {
tbody = {
thead = true,
tbody = true,
tfoot = true
},
thead = {
thead = true,
tbody = true,
tfoot = true
},
tfoot = {
thead = true,
tbody = true,
tfoot = true
},
td = {
td = true,
th = true
},
th = {
td = true,
th = true
},
tr = {
tr = true
},
dd = {
dd = true,
dt = true
},
dt = {
dd = true,
dt = true
},
optgroup = {
optgroup = true,
option = true
},
optgroup = {
optgroup = true,
option = true
},
address = { p = true},
article = { p = true},
aside = { p = true},
blockquote = { p = true},
dir = { p = true},
div = { p = true},
dl = { p = true},
fieldset = { p = true},
footer = { p = true},
form = { p = true},
h1 = { p = true},
h2 = { p = true},
h3 = { p = true},
h4 = { p = true},
h5 = { p = true},
h6 = { p = true},
header = { p = true},
hgroup = { p = true},
hr = { p = true},
menu = { p = true},
nav = { p = true},
ol = { p = true},
p = { p = true},
pre = { p = true},
section = { p = true},
table = { p = true},
ul= { p = true}
}
-- omittable tags children
local omittable_tags2 = {
table = {
tr = true,
td = true,
p = true,
},
tr = {
td = true,
p = true
},
td = {
p = true
}
}
function parse_html(data, lazy)
local tree = {}
local stack = {}
local level = 0
local new_level = 0
table.insert(stack, tree)
local node
local lower_tag
local script_open = false
local script_val = ""
local script_node = nil
local tag_match = ""
lazy = lazy or false
for b, op, tag, attr, op2, bl1, val, bl2 in string.gmatch(
data,
"(<)(%/?!?)([%w:_-'\\\"%[]+)(.-)(%/?%-?)>"..
"([%s\r\n\t]*)([^<]*)([%s\r\n\t]*)"
) do
lower_tag = string.lower(tag)
if script_open then
if lower_tag == "script" and op == "/" then
node.childNodes[1].value = string.gsub(script_val, "^<!%[CDATA%[", "<!--//%1")
if val ~= "" then
table.insert(stack[level], {
tagName = "textNode",
value = val
})
end
level = level - 1
script_open = false
else
script_val = script_val..b..op..tag..attr..op2..bl1..val..bl2
end
elseif op == "!" then
elseif op == "/" then
-- Check if the previous children elements end tag have been omitted
-- and should be close automatically
while not lazy
and omittable_tags2[lower_tag]
and #stack[level] > 0
and omittable_tags2[lower_tag][stack[level][#stack[level]].tagName]
do
print("Auto closing "..
stack[level][#stack[level]].tagName..
" followed by ending "..lower_tag)
level = level - 1
table.remove(stack)
end
if level==0 then return tree end
if lower_tag ~= stack[level][#stack[level]].tagName
then
print("Mismatch: "..lower_tag..
", (has "..stack[level][#stack[level]].tagName..")")
end
level = level - 1
table.remove(stack)
else
level = level + 1
node = nil
node = {}
node.tagName = lower_tag
node.childNodes = {}
if attr ~= "" then
node.attr = {}
for n, v in string.gmatch(
attr,
"%s([^%s=]+)=\"([^\"]+)\""
) do
node.attr[n] = string.gsub(v, '"', '[^\\]\\"')
end
for n, v in string.gmatch(
attr,
"%s([^%s=]+)='([^']+)'"
) do
node.attr[n] = string.gsub(v, '"', '[^\\]\\"')
end
end
if lower_tag == "script"
and node.attr
and not node.attr["src"]
then
script_val = bl1..val..bl2
table.insert(node.childNodes, {
tagName = "textNode",
value = ""
})
table.insert(stack[level], node)
script_open = true
else
-- Check if the previous sibling element end tag has been omitted
-- and should be close automatically
if not lazy
and omittable_tags[lower_tag]
and level > 1
and stack[level-1]
and #stack[level-1] > 0
and omittable_tags[lower_tag][stack[level-1][#stack[level-1]].tagName] == true
then
print("Auto closing "..
stack[level-1][#stack[level-1]].tagName..
" followed by "..lower_tag)
level = level - 1
table.remove(stack)
if level==0 then return tree end
end
table.insert(stack[level], node)
if empty_tags[lower_tag] then
if val ~= "" then
table.insert(stack[level], {
tagName = "textNode",
value = val
})
end
node.childNodes = nil
level = level - 1
else
if val ~= "" then
table.insert(node.childNodes, {
tagName = "textNode",
value = val
})
end
table.insert(stack, node.childNodes)
end
end
end
end
if level~=0 then
vlc.msg.dbg("Parse error: "..level)
end
collectgarbage()
return tree
end
function dump_html(data)
local stack = {data}
local d = ""
local node = nil
while #stack ~= 0 do
node = nil
node = stack[#stack][1]
if not node then break end
if node.tagName == "textNode" then
d = d..trim(node.value)
else
d = d.."\n"..string.rep (" ", #stack-1)
d = d.."<"..node.tagName
if node.attr then
for a, v in pairs(node.attr) do
d = d.." "..a..'="'..v..'"'
end
end
if empty_tags[node.tagName] then
d = d.."/>"
else
d = d..">"
end
end
if node.childNodes and #node.childNodes > 0 then
node.l = #node.childNodes
table.insert(stack, node.childNodes)
else
table.remove(stack[#stack], 1)
if node.childNodes and #node.childNodes == 0 and not empty_tags[node.tagName] then
d = d.."</"..node.tagName..">"
end
while #stack > 0 and #stack[#stack] == 0 do
table.remove(stack)
if #stack > 0 then
if stack[#stack][1].l > 1 then
d = d.."\n"..string.rep(" ", #stack-1).."</"..stack[#stack][1].tagName..">"
else
d = d.."</"..stack[#stack][1].tagName..">"
end
table.remove(stack[#stack], 1)
end
end
end
end
return d
end
@denisdemaisbr
Copy link

did you have some example of usage
?

@geehalel
Copy link

geehalel commented May 5, 2020

Hi,
This is an old piece of code, but it is the only lua html parser which has been able to correctly parse my html files (old microsoft stuff).
I think other lua html parsers are mainly for producing output, not reading malformed (but real) html files.
By the way I just think that you forgot a simple case at line 175 when closing a tag: as the for loop parses almost between '<' and the next '<' there may be a text node between the closing '>' of the tag and the next '<'. Thus I just added that small piece of code:

table.remove(stack) -- line 175
if val ~= nil and val ~= '' and not(string.gsub(val, '[%s\r\t\n]+', ' ') == ' ') then
     --logMsg('inserting ' .. val .. ' in stack level ' .. level)
      if stack[level][#stack[level]].childNodes then
            table.insert(stack[level][#stack[level]].childNodes, {
            tagName = "textNode",
            value = val
      else
                    
      end
end         

Now I am not sure what to put in the else part: if the closing tag is not the very first child, that's ok, but is there always a childNodes table if the
closing tag node is the first node. That works well up to now so Il leave it as is.
Anyway thanks for sharing, and hopefully this may be useful to some people here.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment