Skip to content

Instantly share code, notes, and snippets.

@leandromuto
Created March 23, 2016 20:34
Show Gist options
  • Save leandromuto/01947ba5c48f6cd5dce7 to your computer and use it in GitHub Desktop.
Save leandromuto/01947ba5c48f6cd5dce7 to your computer and use it in GitHub Desktop.
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
import urllib
import scraperwiki
import pdb
from bs4 import BeautifulSoup as bs
encoding = 'utf-8'
url = 'http://www.paodeacucar.com.br/'
# Read in a page
html = urllib.urlopen(url).read()
# Making a soup
soup = bs(html, from_encoding = encoding)
# menu_header = soup.select(".nhgpa_list")
menu_header = soup.select('div.nhgpa_list a')
if menu_header:
for item in menu_header:
link = item['href'].encode('utf8', 'replace')
secao = item.text.encode('utf8', 'replace')
print secao
scraperwiki.sqlite.save(unique_keys=['secao', 'href'],
data={"secao": secao.encode('utf8', 'replace'), "href": link.encode('utf8', 'replace')},
table_name='secoes')
# pdb.set_trace()
# for secao in scraperwiki.sqlite.select('* FROM secoes'):
@leandrotoledo
Copy link

# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

import urllib
import scraperwiki
import pdb
from bs4 import BeautifulSoup as bs

encoding = 'utf-8'
url = 'http://www.paodeacucar.com.br/'

# Read in a page
html = urllib.urlopen(url).read()

# Making a soup
soup = bs(html, from_encoding = encoding)

# menu_header = soup.select(".nhgpa_list")
menu_header = soup.select('div.nhgpa_list a')

if menu_header:
    for item in menu_header:
        link = item['href']
        secao = item.text

        print secao

        scraperwiki.sqlite.save(unique_keys=['secao', 'href'],
                            data={"secao": secao, "href": link},
                            table_name='secoes')
        # pdb.set_trace()

# for secao in scraperwiki.sqlite.select('* FROM secoes'):

@leandromuto
Copy link
Author

Traceback (most recent call last):
File "/Users/leandromuto/GitHub/pao_de_acucar_produtos/scraper.py", line 28, in
table_name='secoes')
File "/Library/Python/2.7/site-packages/scraperwiki/sql.py", line 203, in save
connection.execute(insert.values(row))
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 914, in execute
return meth(self, multiparams, params)
File "/Library/Python/2.7/site-packages/sqlalchemy/sql/elements.py", line 323, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1010, in _execute_clauseelement
compiled_sql, distilled_params
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1146, in _execute_context
context)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1341, in _handle_dbapi_exception
exc_info
File "/Library/Python/2.7/site-packages/sqlalchemy/util/compat.py", line 200, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1139, in _execute_context
context)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/default.py", line 450, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.ProgrammingError: (sqlite3.ProgrammingError) You must not use 8-bit bytestrings unless you use a text_factory that can interpret 8-bit bytestrings (like text_factory = str). It is highly recommended that you instead just switch your application to Unicode strings. [SQL: u'INSERT OR REPLACE INTO secoes (href, secao) VALUES (?, ?)'] [parameters: (u'http://www.paodeacucar.com.br/secoes/C5335/pascoa', 'P\xc3\xa1scoa')]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment