-
-
Save leandromuto/01947ba5c48f6cd5dce7 to your computer and use it in GitHub Desktop.
# This is a template for a Python scraper on morph.io (https://morph.io) | |
# including some code snippets below that you should find helpful | |
import urllib | |
import scraperwiki | |
import pdb | |
from bs4 import BeautifulSoup as bs | |
encoding = 'utf-8' | |
url = 'http://www.paodeacucar.com.br/' | |
# Read in a page | |
html = urllib.urlopen(url).read() | |
# Making a soup | |
soup = bs(html, from_encoding = encoding) | |
# menu_header = soup.select(".nhgpa_list") | |
menu_header = soup.select('div.nhgpa_list a') | |
if menu_header: | |
for item in menu_header: | |
link = item['href'].encode('utf8', 'replace') | |
secao = item.text.encode('utf8', 'replace') | |
print secao | |
scraperwiki.sqlite.save(unique_keys=['secao', 'href'], | |
data={"secao": secao.encode('utf8', 'replace'), "href": link.encode('utf8', 'replace')}, | |
table_name='secoes') | |
# pdb.set_trace() | |
# for secao in scraperwiki.sqlite.select('* FROM secoes'): |
leandrotoledo
commented
Mar 23, 2016
Traceback (most recent call last):
File "/Users/leandromuto/GitHub/pao_de_acucar_produtos/scraper.py", line 28, in <module>
table_name='secoes')
File "/Library/Python/2.7/site-packages/scraperwiki/sql.py", line 203, in save
connection.execute(insert.values(row))
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 914, in execute
return meth(self, multiparams, params)
File "/Library/Python/2.7/site-packages/sqlalchemy/sql/elements.py", line 323, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1010, in _execute_clauseelement
compiled_sql, distilled_params
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1146, in _execute_context
context)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1341, in _handle_dbapi_exception
exc_info
File "/Library/Python/2.7/site-packages/sqlalchemy/util/compat.py", line 200, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1139, in _execute_context
context)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/default.py", line 450, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.ProgrammingError: (sqlite3.ProgrammingError) You must not use 8-bit bytestrings unless you use a text_factory that can interpret 8-bit bytestrings (like text_factory = str). It is highly recommended that you instead just switch your application to Unicode strings. [SQL: u'INSERT OR REPLACE INTO secoes (href, secao) VALUES (?, ?)'] [parameters: (u'http://www.paodeacucar.com.br/secoes/C5335/pascoa', 'P\xc3\xa1scoa')]