@gzxultra
Last active December 8, 2017 13:33
scrapy.py
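Two variants of a small BFS crawler for kurashiru.com recipe categories: starting from a root category page, it walks the sub-category links breadth-first, pages through each category's video list, and records every recipe's id, publication date, and title (read from the page's JSON-LD) in a set of Recipe objects. The first script fetches recipe pages one by one; the second parallelizes those fetches with gevent.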
# coding: utf-8
import re

import requests
import simplejson
from bs4 import BeautifulSoup

requests.adapters.DEFAULT_RETRIES = 10
session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

class Recipe(object):
    def __init__(self, id, create_time, title):
        self.id = id
        self.create_time = create_time
        self.title = title

    def __repr__(self):
        return str(self.id)

    def __eq__(self, other):
        return self.id == other.id

    def __hash__(self):
        return hash(self.id)

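# Because __eq__ and __hash__ key on id alone, adding a Recipe whose id is
# already in the `objects` set below is a no-op, so recipes reachable from
# several categories are stored only once. Illustrative only (made-up values):
#   >>> {Recipe('10001', '2017-12-08', 'a'), Recipe('10001', '2017-12-09', 'b')}
#   {10001}
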
def get_response(category_url):
    page = '?page={page_id}'
    base_recipe_url = 'https://www.kurashiru.com'
    print('category_url: {}'.format(category_url))
    soup = BeautifulSoup(session.get(category_url, headers=headers).content, 'html.parser')
    try:
        children = soup.find('h2', class_='category-list-title').next.next.find_all('li')
    except AttributeError:
        # Page has no category-list-title block, so there are no children to enqueue.
        return []
    video = soup.find('div', class_='videos-list')
    if not video:
        raise Exception('no data')
    page_id = 2
    # The first number in the page title is taken as the total recipe count; 30 per listing page.
    MAX_PAGE = int(re.search(r'[0-9]+', soup.title.text).group()) // 30 + 1
    while page_id < MAX_PAGE:
        print('page_id: {}'.format(page_id))
        for r in video.find_all('a', class_='video-list-img'):
            if not r:
                continue
            recipe_url = base_recipe_url + r['href']
            try:
                get_recipe_object(recipe_url)
            except simplejson.JSONDecodeError:
                print('[continue -> {}]'.format(recipe_url))
                continue
        soup = BeautifulSoup(
            session.get(category_url + page.format(page_id=page_id), headers=headers).content,
            'html.parser',
        )
        video = soup.find('div', class_='videos-list')
        page_id += 1
    # Category hrefs are site-relative paths, so prefix them for bfs() to fetch directly.
    return [base_recipe_url + child.a['href'] for child in children]

def get_recipe_object(recipe_url):
    html_doc = session.get(recipe_url, headers=headers).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Recipe metadata is embedded as JSON-LD in a <script type="application/ld+json"> tag.
    json = simplejson.loads(soup.find('script', type='application/ld+json').text.replace('\r\n', ''))
    recipe_id = recipe_url.split('recipes/')[-1]
    objects.add(Recipe(recipe_id, json['datePublished'], json['name']))

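# The ld+json block parsed above is schema.org Recipe markup; only the
# 'datePublished' and 'name' keys are used. Illustrative shape only
# (values are made up, not taken from kurashiru.com):
#   {"@context": "http://schema.org", "@type": "Recipe",
#    "name": "...", "datePublished": "2017-12-08", ...}
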
def bfs(p):
    # Breadth-first walk over the category tree, scraping each category page once.
    queue = []
    crawled_set = set()
    queue.append(p)
    while queue:
        p = queue.pop(0)
        crawled_set.add(p)
        print('scraping {}'.format(p))
        children = get_response(p)
        for child in children:
            if child not in crawled_set:
                queue.append(child)

if __name__ == '__main__':
    objects = set()
    root_url = 'https://www.kurashiru.com/categories/1'
    # html_doc = requests.get(root_url).content
    # soup = BeautifulSoup(html_doc)
    # category_urls = [category.a['href'] for category in soup.find_all('h2') if category.find_all('a')]
    # for category_url in category_urls:
    #     objects.extend(bfs(category_url))
    bfs(root_url)
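
The script below is the same crawler reworked with gevent: the socket module is monkey-patched, and every recipe page found on a listing page is fetched in its own greenlet (gevent.spawn / gevent.joinall) instead of one request at a time.
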
# coding: utf-8
import re

import requests
import simplejson

# Monkey-patch the socket module so network I/O yields to other greenlets.
from gevent import monkey; monkey.patch_socket()
import gevent
from bs4 import BeautifulSoup

requests.adapters.DEFAULT_RETRIES = 10
session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

class Recipe(object):
    def __init__(self, id, create_time, title):
        self.id = id
        self.create_time = create_time
        self.title = title

    def __repr__(self):
        return str(self.id)

    def __eq__(self, other):
        return self.id == other.id

    def __hash__(self):
        return hash(self.id)

def get_response(category_url):
    page = '?page={page_id}'
    base_recipe_url = 'https://www.kurashiru.com'
    print('category_url: {}'.format(category_url))
    soup = BeautifulSoup(session.get(category_url, headers=headers).content, 'html.parser')
    try:
        children = soup.find('h2', class_='category-list-title').next.next.find_all('li')
    except AttributeError:
        # Page has no category-list-title block, so there are no children to enqueue.
        return []
    video = soup.find('div', class_='videos-list')
    if not video:
        raise Exception('no data')
    page_id = 2
    # The first number in the page title is taken as the total recipe count; 30 per listing page.
    MAX_PAGE = int(re.search(r'[0-9]+', soup.title.text).group()) // 30 + 1
    while page_id < MAX_PAGE:
        print('page_id: {}'.format(page_id))
        recipe_tasks = []
        for r in video.find_all('a', class_='video-list-img'):
            if not r:
                continue
            recipe_url = base_recipe_url + r['href']
            # Fetch each recipe page in its own greenlet instead of sequentially.
            recipe_tasks.append(gevent.spawn(get_recipe_object, recipe_url))
        gevent.joinall(recipe_tasks)
        soup = BeautifulSoup(
            session.get(category_url + page.format(page_id=page_id), headers=headers).content,
            'html.parser',
        )
        video = soup.find('div', class_='videos-list')
        page_id += 1
    # Category hrefs are site-relative paths, so prefix them for bfs() to fetch directly.
    return [base_recipe_url + child.a['href'] for child in children]

def get_recipe_object(recipe_url):
    try:
        html_doc = session.get(recipe_url, headers=headers).content
        soup = BeautifulSoup(html_doc, 'html.parser')
        json = simplejson.loads(soup.find('script', type='application/ld+json').text.replace('\r\n', ''))
        recipe_id = recipe_url.split('recipes/')[-1]
        objects.add(Recipe(recipe_id, json['datePublished'], json['name']))
    except Exception:
        # Skip recipes whose page cannot be fetched or parsed; don't kill the greenlet.
        print('[continue -> {}]'.format(recipe_url))

def bfs(p):
    # Breadth-first walk over the category tree, scraping each category page once.
    queue = []
    crawled_set = set()
    queue.append(p)
    while queue:
        p = queue.pop(0)
        crawled_set.add(p)
        print('scraping {}'.format(p))
        children = get_response(p)
        for child in children:
            if child not in crawled_set:
                queue.append(child)

if __name__ == '__main__':
    objects = set()
    root_url = 'https://www.kurashiru.com/categories/1'
    # html_doc = requests.get(root_url).content
    # soup = BeautifulSoup(html_doc)
    # category_urls = [category.a['href'] for category in soup.find_all('h2') if category.find_all('a')]
    # for category_url in category_urls:
    #     objects.extend(bfs(category_url))
    bfs(root_url)
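
Neither variant writes the scraped data anywhere; `objects` only exists in memory while the process runs. Below is a minimal sketch of how the collected recipes could be persisted afterwards; the csv usage, the dump_recipes helper, and the recipes.csv filename are assumptions, not part of the gist.

import csv

def dump_recipes(recipes, path='recipes.csv'):
    # One row per Recipe; columns mirror the Recipe attributes defined above.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'date_published', 'title'])
        for recipe in recipes:
            writer.writerow([recipe.id, recipe.create_time, recipe.title])

# e.g. call dump_recipes(objects) right after bfs(root_url) in the __main__ block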