Skip to content

Instantly share code, notes, and snippets.

@Kwpolska
Created February 16, 2013 19:05
Show Gist options
  • Save Kwpolska/4968252 to your computer and use it in GitHub Desktop.
Save Kwpolska/4968252 to your computer and use it in GitHub Desktop.
LXML Breakage
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "lxml breakage"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "import lxml.html, requests",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "r = requests.get('https://aur.archlinux.org')\n",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": "r.text[:38]",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 8,
"text": "'<?xml version=\"1.0\" encoding=\"UTF-8\"?>'"
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": "lxml.html.fromstring(r.text) # u'string'\n",
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Unicode strings with encoding declaration are not supported.",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-11-2b8a760199e9>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mfromstring\u001b[1;34m(html, base_url, parser, **kw)\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 664\u001b[0m \u001b[1;31m# otherwise, lets parse it out...\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 665\u001b[1;33m \u001b[0mdoc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 666\u001b[0m \u001b[0mbodies\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfindall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'body'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mbodies\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mdocument_fromstring\u001b[1;34m(html, parser, **kw)\u001b[0m\n\u001b[0;32m 561\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 562\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml_parser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 563\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0metree\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 564\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvalue\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 565\u001b[0m raise etree.ParserError(\n",
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/etree.cpython-33m.so\u001b[0m in \u001b[0;36mlxml.etree.fromstring (src/lxml/lxml.etree.c:61729)\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/etree.cpython-33m.so\u001b[0m in \u001b[0;36mlxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:91131)\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Unicode strings with encoding declaration are not supported."
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": "lxml.html.fromstring(r.content) # b'string'",
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "startswith first arg must be bytes or a tuple of bytes, not str",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-12-059d51be153d>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# b'string'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mfromstring\u001b[1;34m(html, base_url, parser, **kw)\u001b[0m\n\u001b[0;32m 659\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml_parser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 660\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 661\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<html'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<!doctype'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 662\u001b[0m \u001b[1;31m# Looks like a full HTML document\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: startswith first arg must be bytes or a tuple of bytes, not str"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "lxml.html.fromstring(r.text[38:])",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 13,
"text": "<Element html at 0x7f7e600c0e30>"
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": "lxml.html.fromstring(r.content[38:])",
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "startswith first arg must be bytes or a tuple of bytes, not str",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-14-f1fb572c1313>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m38\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mfromstring\u001b[1;34m(html, base_url, parser, **kw)\u001b[0m\n\u001b[0;32m 659\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml_parser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 660\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 661\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<html'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<!doctype'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 662\u001b[0m \u001b[1;31m# Looks like a full HTML document\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: startswith first arg must be bytes or a tuple of bytes, not str"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment