Skip to content

Instantly share code, notes, and snippets.

@nbari
Last active August 29, 2015 14:01
Show Gist options
  • Save nbari/f8e8437762768f887ece to your computer and use it in GitHub Desktop.
Save nbari/f8e8437762768f887ece to your computer and use it in GitHub Desktop.
recetas
import os
from bs4 import BeautifulSoup
from twisted.internet import reactor
from twisted.web.client import getPage
# Base URL of the site to scrape; listing and recipe paths are appended to it.
site = 'http://www.1080recetas.com'
def crawl(url):
    """Fetch one recipe-listing page and save every recipe it links to.

    The listing at *url* is downloaded asynchronously.  Every anchor with the
    CSS class ``readon`` is followed, and the text of the ``article-content``
    div of the linked page is written, UTF-8 encoded, to
    ``recetas/<title>.txt``.

    Args:
        url: URL of the listing page to fetch.

    Returns:
        The Deferred of the listing fetch.  It fires once the listing has
        been parsed and the recipe downloads have been scheduled; it does
        not wait for those downloads to finish.
    """
    # Function-scope import: only the shutdown guard below needs it.
    from twisted.internet.error import ReactorNotRunning

    # Single-argument parenthesised prints give the same output under the
    # Python 2 print statement this script was written for.
    print('fetching %s' % url)
    listing = getPage(url)

    def makeSoup(page, url):
        """Parse a listing page and schedule one download per recipe link."""
        dir_name = 'recetas'
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        def save(page, title):
            """Write the recipe text found in *page* to ``dir_name/title.txt``."""
            receta = BeautifulSoup(page).find(
                'div', {'class': 'article-content'})
            if receta is None:
                # Bail out before open() so no empty file is left behind.
                print("%s -> no article content found" % title)
                return
            data = receta.text.encode('utf-8')
            with open('%s/%s.txt' % (dir_name, title), 'w') as f:
                f.write(data)
            print("%s -> finished" % title)

        def saveErrorHangler(error):
            """Report a failed recipe download without aborting the crawl."""
            print(error)

        soup = BeautifulSoup(page)
        for link in soup.find_all('a', {'class': 'readon'}):
            raw_title = link.get('title') or ''
            href = link.get('href')
            # The title comes from the remote page and becomes a file name:
            # neutralise path separators so it cannot leave dir_name.
            for sep in ('/', os.sep):
                raw_title = raw_title.replace(sep, '-')
            title = raw_title.encode('utf-8').strip()
            if not title or not href:
                # One malformed link must not take the whole crawl down.
                print('skipping link without a usable title or href')
                continue
            r_url = '%s%s' % (site, href.encode('utf-8'))
            recipe = getPage(r_url)
            # title is passed explicitly so each callback keeps its own value.
            recipe.addCallback(save, title)
            recipe.addErrback(saveErrorHangler)

    listing.addCallback(makeSoup, url)

    def errorHandler(error):
        """Report a failed listing fetch/parse and shut the reactor down."""
        print("An error has occurred: <%s>" % str(error))
        try:
            reactor.stop()
        except ReactorNotRunning:
            # Several listings are fetched concurrently; another failure has
            # already requested the shutdown.
            pass

    listing.addErrback(errorHandler)
    return listing
if __name__ == '__main__':
    # Queue a crawl for each listing offset 0, 5, ..., 495, then hand
    # control to the reactor so the scheduled fetches actually run.
    listing_urls = [
        '%s/recetas?start=%d' % (site, offset)
        for offset in range(0, 500, 5)
    ]
    for listing_url in listing_urls:
        crawl(listing_url)
    reactor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment