Skip to content

Instantly share code, notes, and snippets.

@tsudoko
Created October 4, 2018 00:55
Show Gist options
  • Save tsudoko/54e9367b8cf15132edc4e2e60a603632 to your computer and use it in GitHub Desktop.
Save tsudoko/54e9367b8cf15132edc4e2e60a603632 to your computer and use it in GitHub Desktop.
_SPACE = "\u0020\u0009\u000a\u000c\u000d"
_POS_OUTSIDE = 0
_POS_URL = 1
_POS_DESCRIPTOR = 2
def urls(srcset):
# URLs may contain commas, so we can't just .split(',')
pos = _POS_OUTSIDE
url = ""
for c in srcset:
if pos == _POS_OUTSIDE and c not in _SPACE:
if c == ",": # stray comma
continue
url += c
pos = _POS_URL
elif pos == _POS_URL:
if c in _SPACE:
if len(url) >= 1 and url[-1] == ",": # no descriptor
pos = _POS_OUTSIDE
else:
pos = _POS_DESCRIPTOR
yield url.rstrip(",")
url = ""
else:
url += c
elif pos == _POS_DESCRIPTOR:
if c == ",":
pos = _POS_OUTSIDE
if url:
yield url.rstrip(",")
# Test fixtures for test(): HTML fragments in which each <img> tag pairs a
# `srcset` attribute with the single URL expected from it in `data-expect`
# (an empty `data-expect` means no URL is expected). Only those two
# attributes are read by test(); `data-resolve` is carried along unused.
#
# The cases appear to be taken from the web-platform-tests srcset parser test:
# https://raw.githubusercontent.com/web-platform-tests/wpt/704d739c5a54658e6ea09efaa76824b134b36504/html/semantics/embedded-content/the-img-element/srcset/parse-a-srcset-attribute.html
#
# Both fixtures are raw strings so that backslashes in the markup stay literal.

# Cases about separating candidates: whitespace variants written as character
# references, stray commas, and trailing commas after the URL.
_splitting_loop = r"""
<img srcset='' data-expect=''>
<img srcset=',' data-expect=''>
<img srcset=',,,' data-expect=''>
<img srcset=' data:,a 1x ' data-expect='data:,a'>
<img srcset='&#x9;&#x9;data:,a&#x9;&#x9;1x&#x9;&#x9;' data-expect='data:,a'>
<img srcset='&#xA;&#xA;data:,a&#xA;&#xA;1x&#xA;&#xA;' data-expect='data:,a'>
<img srcset='&#xB;&#xB;data:,a&#xB;&#xB;1x&#xB;&#xB;' data-expect='&#xB;&#xB;data:,a&#xB;&#xB;1x&#xB;&#xB;' data-resolve>
<!-- <img srcset='&#xC;&#xC;data:,a&#xC;&#xC;1x&#xC;&#xC;' data-expect='data:,a'> <!-- this one fails on lxml; srcset in the parsed tag is empty -->
<img srcset='&#xD;&#xD;data:,a&#xD;&#xD;1x&#xD;&#xD;' data-expect='data:,a'>
<img srcset='&#xE;&#xE;data:,a&#xE;&#xE;1x&#xE;&#xE;' data-expect='&#xE;&#xE;data:,a&#xE;&#xE;1x&#xE;&#xE;' data-resolve>
<img srcset='&#xF;&#xF;data:,a&#xF;&#xF;1x&#xF;&#xF;' data-expect='&#xF;&#xF;data:,a&#xF;&#xF;1x&#xF;&#xF;' data-resolve>
<img srcset='&#x10;&#x10;data:,a&#x10;&#x10;1x&#x10;&#x10;' data-expect='&#x10;&#x10;data:,a&#x10;&#x10;1x&#x10;&#x10;' data-resolve>
<img srcset='data:,a' data-expect='data:,a'>
<img srcset='data:,a ' data-expect='data:,a'>
<img srcset='data:,a ,' data-expect='data:,a'>
<img srcset='data:,a,' data-expect='data:,a'>
<img srcset='data:,a, ' data-expect='data:,a'>
<img srcset='data:,a,,,' data-expect='data:,a'>
<img srcset='data:,a,, , ' data-expect='data:,a'>
<img srcset=' data:,a' data-expect='data:,a'>
<img srcset=',,,data:,a' data-expect='data:,a'>
<img srcset=' , ,,data:,a' data-expect='data:,a'>
<img srcset='&nbsp;data:,a' data-expect='&nbsp;data:,a' data-resolve>
<img srcset='data:,a&nbsp;' data-expect='data:,a&nbsp;' data-resolve>
"""
# Cases about the descriptor part: plain descriptors plus descriptors that
# contain parentheses, brackets, braces, quotes, backslashes or comment-like
# text around embedded commas.
_descriptor_tokenizer = r"""
<img srcset='data:,a 1x' data-expect='data:,a'>
<img srcset='data:,a 1x ' data-expect='data:,a'>
<img srcset='data:,a 1x,' data-expect='data:,a'>
<img srcset='data:,a ( , data:,b 1x, ), data:,c' data-expect='data:,c'>
<img srcset='data:,a ((( , data:,b 1x, ), data:,c' data-expect='data:,c'>
<img srcset='data:,a [ , data:,b 1x, ], data:,c' data-expect='data:,b'>
<img srcset='data:,a { , data:,b 1x, }, data:,c' data-expect='data:,b'>
<img srcset='data:,a " , data:,b 1x, ", data:,c' data-expect='data:,b'>
<img srcset='data:,a \,data:;\,b, data:,c' data-expect='data:;\,b'>
<img srcset='data:,a, data:,b (' data-expect='data:,a'>
<img srcset='data:,a, data:,b ( ' data-expect='data:,a'>
<img srcset='data:,a, data:,b (,' data-expect='data:,a'>
<img srcset='data:,a, data:,b (x' data-expect='data:,a'>
<img srcset='data:,a, data:,b ()' data-expect='data:,a'>
<img srcset='data:,a (, data:,b' data-expect=''>
<img srcset='data:,a /*, data:,b, data:,c */' data-expect='data:,b'>
<img srcset='data:,a //, data:,b' data-expect='data:,b'>
"""
def test():
    """Check urls() against the embedded HTML fixtures and print the results.

    Each fixture (``_splitting_loop``, ``_descriptor_tokenizer``) is parsed
    with Beautiful Soup. For every ``<img>`` tag the URLs extracted from its
    ``srcset`` attribute are compared, as a set, with the tag's
    ``data-expect`` attribute (an empty value means no URL is expected):

    * an exact match counts as a success;
    * a superset of the expectation also counts as a success but prints a
      ``(w)`` line, since extracting extra, strictly invalid entries is
      tolerated;
    * anything else prints an ``(F)`` line.

    After each fixture a ``success/total`` tally is printed. Nothing is
    returned and no exception is raised on mismatches.
    """
    # Function-local import: bs4 is only needed when the self-test is run.
    import bs4

    for _html in (_splitting_loop, _descriptor_tokenizer):
        # NOTE(review): no parser is named, so Beautiful Soup picks whichever
        # one is installed; results for some fixtures may depend on that
        # choice (one fixture is commented out as failing on lxml).
        soup = bs4.BeautifulSoup(_html)
        images = soup.find_all("img")
        success = 0
        for index, img in enumerate(images, start=1):
            srcset = img['srcset']
            expected = img['data-expect']
            parsed = set(urls(srcset))
            expect = {expected} if expected else set()
            if expect == parsed:
                success += 1
            elif expect.issubset(parsed):
                # we don't mind getting some strictly invalid descriptors/URLs
                # as long as the valid ones are extracted correctly
                print(f"{index:2d}", "(w) extracted too much:", repr(srcset), "→", parsed)
                success += 1
            else:
                print(
                    f"{index:2d}", "(F)",
                    repr(srcset),
                    "→",
                    parsed,
                    "(expected", [expected], ")",
                )
        # Counters are per fixture, so the tally is reported per fixture too.
        print(f"{success}/{len(images)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment