@gzxultra
Last active December 8, 2017 13:33
scrapy.py
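Two variants of a small BFS crawler for kurashiru.com recipe categories: starting from a root category page, it walks the sub-category links breadth-first, pages through each category's video list, and records every recipe's id, publication date, and title (read from the page's JSON-LD) in a set of Recipe objects. The first script fetches recipe pages one by one; the second parallelizes those fetches with gevent.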
# coding: utf-8
import re

import requests
import simplejson
from bs4 import BeautifulSoup

requests.adapters.DEFAULT_RETRIES = 10
session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

class Recipe(object):
    def __init__(self, id, create_time, title):
        self.id = id
        self.create_time = create_time
        self.title = title

    def __repr__(self):
        return str(self.id)

    def __eq__(self, other):
        return self.id == other.id

    def __hash__(self):
        return hash(self.id)

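# Because __eq__ and __hash__ key on id alone, adding a Recipe whose id is
# already in the `objects` set below is a no-op, so recipes reachable from
# several categories are stored only once. Illustrative only (made-up values):
#   >>> {Recipe('10001', '2017-12-08', 'a'), Recipe('10001', '2017-12-09', 'b')}
#   {10001}
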
def get_response(category_url):
    page = '?page={page_id}'
    base_recipe_url = 'https://www.kurashiru.com'
    print('category_url: {}'.format(category_url))
    soup = BeautifulSoup(session.get(category_url, headers=headers).content, 'html.parser')
    try:
        children = soup.find('h2', class_='category-list-title').next.next.find_all('li')
    except AttributeError:
        # Page has no category-list-title block, so there are no children to enqueue.
        return []
    video = soup.find('div', class_='videos-list')
    if not video:
        raise Exception('no data')
    page_id = 2
    # The first number in the page title is taken as the total recipe count; 30 per listing page.
    MAX_PAGE = int(re.search(r'[0-9]+', soup.title.text).group()) // 30 + 1
    while page_id < MAX_PAGE:
        print('page_id: {}'.format(page_id))
        for r in video.find_all('a', class_='video-list-img'):
            if not r:
                continue
            recipe_url = base_recipe_url + r['href']
            try:
                get_recipe_object(recipe_url)
            except simplejson.JSONDecodeError:
                print('[continue -> {}]'.format(recipe_url))
                continue
        soup = BeautifulSoup(
            session.get(category_url + page.format(page_id=page_id), headers=headers).content,
            'html.parser',
        )
        video = soup.find('div', class_='videos-list')
        page_id += 1
    # Category hrefs are site-relative paths, so prefix them for bfs() to fetch directly.
    return [base_recipe_url + child.a['href'] for child in children]

def get_recipe_object(recipe_url):
    html_doc = session.get(recipe_url, headers=headers).content
    soup = BeautifulSoup(html_doc, 'html.parser')
    # Recipe metadata is embedded as JSON-LD in a <script type="application/ld+json"> tag.
    json = simplejson.loads(soup.find('script', type='application/ld+json').text.replace('\r\n', ''))
    recipe_id = recipe_url.split('recipes/')[-1]
    objects.add(Recipe(recipe_id, json['datePublished'], json['name']))

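# The ld+json block parsed above is schema.org Recipe markup; only the
# 'datePublished' and 'name' keys are used. Illustrative shape only
# (values are made up, not taken from kurashiru.com):
#   {"@context": "http://schema.org", "@type": "Recipe",
#    "name": "...", "datePublished": "2017-12-08", ...}
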
def bfs(p):
    # Breadth-first walk over the category tree, scraping each category page once.
    queue = []
    crawled_set = set()
    queue.append(p)
    while queue:
        p = queue.pop(0)
        crawled_set.add(p)
        print('scraping {}'.format(p))
        children = get_response(p)
        for child in children:
            if child not in crawled_set:
                queue.append(child)

if __name__ == '__main__':
    objects = set()
    root_url = 'https://www.kurashiru.com/categories/1'
    # html_doc = requests.get(root_url).content
    # soup = BeautifulSoup(html_doc)
    # category_urls = [category.a['href'] for category in soup.find_all('h2') if category.find_all('a')]
    # for category_url in category_urls:
    #     objects.extend(bfs(category_url))
    bfs(root_url)
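
The script below is the same crawler reworked with gevent: the socket module is monkey-patched, and every recipe page found on a listing page is fetched in its own greenlet (gevent.spawn / gevent.joinall) instead of one request at a time.
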
# coding: utf-8
import re

import requests
import simplejson

# Monkey-patch the socket module so network I/O yields to other greenlets.
from gevent import monkey; monkey.patch_socket()
import gevent
from bs4 import BeautifulSoup

requests.adapters.DEFAULT_RETRIES = 10
session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

class Recipe(object):
    def __init__(self, id, create_time, title):
        self.id = id
        self.create_time = create_time
        self.title = title

    def __repr__(self):
        return str(self.id)

    def __eq__(self, other):
        return self.id == other.id

    def __hash__(self):
        return hash(self.id)

def get_response(category_url):
    page = '?page={page_id}'
    base_recipe_url = 'https://www.kurashiru.com'
    print('category_url: {}'.format(category_url))
    soup = BeautifulSoup(session.get(category_url, headers=headers).content, 'html.parser')
    try:
        children = soup.find('h2', class_='category-list-title').next.next.find_all('li')
    except AttributeError:
        # Page has no category-list-title block, so there are no children to enqueue.
        return []
    video = soup.find('div', class_='videos-list')
    if not video:
        raise Exception('no data')
    page_id = 2
    # The first number in the page title is taken as the total recipe count; 30 per listing page.
    MAX_PAGE = int(re.search(r'[0-9]+', soup.title.text).group()) // 30 + 1
    while page_id < MAX_PAGE:
        print('page_id: {}'.format(page_id))
        recipe_tasks = []
        for r in video.find_all('a', class_='video-list-img'):
            if not r:
                continue
            recipe_url = base_recipe_url + r['href']
            # Fetch each recipe page in its own greenlet instead of sequentially.
            recipe_tasks.append(gevent.spawn(get_recipe_object, recipe_url))
        gevent.joinall(recipe_tasks)
        soup = BeautifulSoup(
            session.get(category_url + page.format(page_id=page_id), headers=headers).content,
            'html.parser',
        )
        video = soup.find('div', class_='videos-list')
        page_id += 1
    # Category hrefs are site-relative paths, so prefix them for bfs() to fetch directly.
    return [base_recipe_url + child.a['href'] for child in children]

def get_recipe_object(recipe_url):
    try:
        html_doc = session.get(recipe_url, headers=headers).content
        soup = BeautifulSoup(html_doc, 'html.parser')
        json = simplejson.loads(soup.find('script', type='application/ld+json').text.replace('\r\n', ''))
        recipe_id = recipe_url.split('recipes/')[-1]
        objects.add(Recipe(recipe_id, json['datePublished'], json['name']))
    except Exception:
        # Skip recipes whose page cannot be fetched or parsed; don't kill the greenlet.
        print('[continue -> {}]'.format(recipe_url))

def bfs(p):
    # Breadth-first walk over the category tree, scraping each category page once.
    queue = []
    crawled_set = set()
    queue.append(p)
    while queue:
        p = queue.pop(0)
        crawled_set.add(p)
        print('scraping {}'.format(p))
        children = get_response(p)
        for child in children:
            if child not in crawled_set:
                queue.append(child)

if __name__ == '__main__':
    objects = set()
    root_url = 'https://www.kurashiru.com/categories/1'
    # html_doc = requests.get(root_url).content
    # soup = BeautifulSoup(html_doc)
    # category_urls = [category.a['href'] for category in soup.find_all('h2') if category.find_all('a')]
    # for category_url in category_urls:
    #     objects.extend(bfs(category_url))
    bfs(root_url)
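
Neither variant writes the scraped data anywhere; `objects` only exists in memory while the process runs. Below is a minimal sketch of how the collected recipes could be persisted afterwards; the csv usage, the dump_recipes helper, and the recipes.csv filename are assumptions, not part of the gist.

import csv

def dump_recipes(recipes, path='recipes.csv'):
    # One row per Recipe; columns mirror the Recipe attributes defined above.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'date_published', 'title'])
        for recipe in recipes:
            writer.writerow([recipe.id, recipe.create_time, recipe.title])

# e.g. call dump_recipes(objects) right after bfs(root_url) in the __main__ block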