html5lib - ihatexml.py - line 254: DataLossWarning: Coercing non-XML name
In my case this was caused by a malformed attribute: ``
import xml.etree.ElementTree as etree | |
import html5lib | |
def fromstring(s):
    """Parse the markup in ``s`` with html5lib and return the parser's result.

    The tree builder is looked up under the name "lxml", with the
    module-level ``etree`` passed as ``implementation``.  The parser is
    created with ``namespaceHTMLElements=False``.
    """
    builder = html5lib.getTreeBuilder("lxml", implementation=etree)
    return html5lib.HTMLParser(builder, namespaceHTMLElements=False).parse(s)
html5lib - ihatexml.py - line 254: DataLossWarning: Coercing non-XML name
In my case this was caused by a malformed attribute: ``
# -*- coding: utf-8 | |
from __future__ import unicode_literals | |
import warnings | |
import unicodedata | |
def build_lookup(macro, micro): | |
assert len(macro) == len(micro), (len(macro), len(micro)) | |
output = {} | |
for pair in zip(macro, micro): | |
if pair[1] != ' ': |
<html> | |
<head> | |
<script type="text/javascript" src="https://www.dropbox.com/static/api/2/dropins.js" id="dropboxjs" data-app-key="w7qbnscwwlxgtz0"></script> | |
<script type="text/javascript" src="https://cdn.rawgit.com/Nijikokun/5192472/raw/4c80b2c2688841ffb086f8c2b3f57520b0bd817d/base64-utf8.module.js"></script> | |
</head> | |
<body> | |
<a href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA | |
AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO | |
9TXL0Y4OHwAAAABJRU5ErkJggg==" data-filename="reddot.png" alt="Red dot" id=save class="dropbox-saver" onclick='save.href="data:text/text;base64,"+base64.encode(foo.value);save.dataset.filename = "mytext"'></a> | |
<input type='text |
'--- https://support.microsoft.com/en-us/kb/195763 | |
' NB: remove PtrSafe on older Excel versions (VBA6, i.e. before Office 2010), which do not support the keyword | |
' Declares GetTempPath, bound to the "GetTempPathA" entry point in kernel32.
' Parameters (both ByVal): nBufferLength As Long, lpBuffer As String.
' Returns a Long.
' NOTE(review): presumably the caller must pre-size lpBuffer to at least
' nBufferLength characters before the call - confirm against the Win32 docs.
Private Declare PtrSafe Function GetTempPath Lib "kernel32" _ | |
Alias "GetTempPathA" (ByVal nBufferLength As Long, _ | |
ByVal lpBuffer As String) As Long | |
'--- https://support.microsoft.com/en-us/kb/195763 | |
' NB: remove PtrSafe on older Excel versions (VBA6, i.e. before Office 2010), which do not support the keyword | |
Private Declare PtrSafe Function GetTempFileName Lib "kernel32" _ | |
Alias "GetTempFileNameA" (ByVal lpszPath As String, _ |
def get_select_value(node):
    """Return the text of the chosen <option> inside a SELECT element.

    ``node`` is an lxml element for a SELECT tag (any object exposing a
    compatible ``cssselect`` method works).

    Returns the ``text`` of the first option carrying a ``selected``
    attribute; if no option carries one, the ``text`` of the first option.

    Raises IndexError when the SELECT contains no <option> at all.
    """
    # ``selected`` is a boolean HTML attribute: its mere presence marks the
    # option as chosen, whatever its value (``selected``, ``selected=""``,
    # ``selected="selected"``).  Matching on presence rather than on the
    # literal value "selected" keeps minimised markup from being missed.
    chosen = node.cssselect("option[selected]")
    if chosen:
        return chosen[0].text
    # Nothing explicitly selected: fall back to the first option.
    return node.cssselect("option")[0].text
def makeidentifier(s):
    """Reduce ``s`` to a string of ASCII letters, digits and underscores.

    Surrounding whitespace is stripped, each remaining space becomes an
    underscore, and every other character outside ``[A-Za-z0-9_]`` is
    dropped.  If nothing survives, a single underscore is returned.
    """
    import string

    allowed = set("_" + string.ascii_letters + string.digits)
    kept = [ch for ch in s.strip().replace(' ', '_') if ch in allowed]
    if not kept:
        return '_'
    return ''.join(kept)
import re | |
# functions to detect/fix double-encoded UTF-8 strings | |
# Based on http://blogs.perl.org/users/chansen/2010/10/coping-with-double-encoded-utf-8.html | |
DOUBLE_ENCODED = re.compile(""" | |
\xC3 (?: [\x82-\x9F] \xC2 [\x80-\xBF] # U+0080 - U+07FF | |
| \xA0 \xC2 [\xA0-\xBF] \xC2 [\x80-\xBF] # U+0800 - U+0FFF | |
| [\xA1-\xAC] \xC2 [\x80-\xBF] \xC2 [\x80-\xBF] # U+1000 - U+CFFF | |
| \xAD \xC2 [\x80-\x9F] \xC2 [\x80-\xBF] # U+D000 - U+D7FF | |
| [\xAE-\xAF] \xC2 [\x80-\xBF] \xC2 [\x80-\xBF] # U+E000 - U+FFFF |
# Read 'scraperwiki.json' synchronously and parse its contents as JSON;
# the resulting value is kept in `settings`.
fs = require 'fs'
settings = JSON.parse fs.readFileSync('scraperwiki.json')
import requests | |
# Set the 'max_retries' entry of requests' module-level defaults mapping to 5.
# NOTE(review): the `requests.defaults` module appears to exist only in old
# (pre-1.0) releases of requests -- confirm the installed version provides it.
requests.defaults.defaults['max_retries'] = 5 | |
# ... rest of code ... |