Last active
August 29, 2015 14:01
-
-
Save mhermans/5b33f154c97a9866c447 to your computer and use it in GitHub Desktop.
dierentheater document scraper error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ERROR | |
2014-05-19 23:57:10,980 - parsing http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=/site/wwwcfm/flwb/flwbn.cfm?lang=F&legislat=53&dossierID=3573 --- a document 3573 | |
2014-05-19 23:57:12,065 - LXML parsing http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=/site/wwwcfm/flwb/flwbn.cfm?lang=F&legislat=53&dossierID=3573 --- a document 3573 | |
2014-05-19 23:57:12,068 - LXML parsing http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=nl&rightmenu=right&cfm=/site/wwwcfm/flwb/flwbn.cfm?lang=F&legislat=53&dossierID=3573 --- a document 3573 nl | |
2014-05-19 23:57:12,995 - lachambre_deputy.find (0.00) {'lachambre_id': '01201'} | |
Traceback (most recent call last): | |
File "/home/mhermans/tmp/dierentheater/lachambre_parser/documents.py", line 68, in parse_every_documents | |
handle_document(document) | |
File "/home/mhermans/tmp/dierentheater/lachambre_parser/documents.py", line 121, in handle_document | |
_get_document_chambre(dico, dico_nl, document) | |
File "/home/mhermans/tmp/dierentheater/lachambre_parser/documents.py", line 335, in _get_document_chambre | |
url, tipe, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />") | |
ValueError: need more than 1 value to unpack | |
2014-05-19 23:57:12,998 - /!\ 3573 didn't succed! Error: while reparsing document need more than 1 value to unpack | |
> /home/mhermans/tmp/dierentheater/lachambre_parser/documents.py(335)_get_document_chambre() | |
334 | |
--> 335 url, tipe, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />") | |
336 _, tipe_nl, _ = clean_text(str(chambre_dico_nl[u'head']).replace(" ", "")).split("<br />") | |
## ipdb | |
ipdb> chambre_dico[u'head'] | |
<td class="td0x"> | |
<a href="http://www.lachambre.be/FLWB/PDF/53/3572/53K3572001.pdf" target="_blank"><img class="picto" src="/images/pictos_pdf.gif"/></a> <a href="http://www.lachambre.be/FLWB/PDF/53/3572/53K3572001.pdf" target="_blank">53K3572001</a> | |
945 Kb | |
<br/> | |
PROPOSITION DE LOI | |
- CHAMBRE | |
<br/> | |
Législature : 53 - Session : 2013/2014-0 | |
</td> | |
ipdb> clean_text(str(chambre_dico[u'head']).replace(" ", "")) | |
'<td class="td0x"> <a href="http://www.lachambre.be/FLWB/PDF/53/3572/53K3572001.pdf" target="_blank"><img class="picto" src="/images/pictos_pdf.gif"/></a>\xc2\xa0 <a href="http://www.lachambre.be/FLWB/PDF/53/3572/53K3572001.pdf" target="_blank">53K3572001</a> \xc2\xa0\xc2\xa0945 Kb <br/> PROPOSITION DE LOI \xc2\xa0 - \xc2\xa0CHAMBRE <br/> L\xc3\xa9gislature : 53 \xc2\xa0-\xc2\xa0 Session : 2013/2014-0 </td>' | |
ipdb> clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />") | |
['<td class="td0x"> <a href="http://www.lachambre.be/FLWB/PDF/53/3572/53K3572001.pdf" target="_blank"><img class="picto" src="/images/pictos_pdf.gif"/></a>\xc2\xa0 <a href="http://www.lachambre.be/FLWB/PDF/53/3572/53K3572001.pdf" target="_blank">53K3572001</a> \xc2\xa0\xc2\xa0945 Kb <br/> PROPOSITION DE LOI \xc2\xa0 - \xc2\xa0CHAMBRE <br/> L\xc3\xa9gislature : 53 \xc2\xa0-\xc2\xa0 Session : 2013/2014-0 </td>'] | |
ipdb> len(clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />")) | |
1 | |
ipdb> url, type, session = clean_text(str(chambre_dico[u'head']).replace(" ", "")).split("<br />") | |
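*** ValueError: need more than 1 value to unpack | |
# NOTE: the cell contains "<br/>" (no space before the slash), so split("<br />") never matches and returns a single-element list, which cannot be unpacked into three names. | |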
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment