Skip to content

Instantly share code, notes, and snippets.

@bongbongco
Last active October 26, 2016 08:04
Show Gist options
  • Save bongbongco/09cd7a772643e4dd09af287161dc724b to your computer and use it in GitHub Desktop.
Save bongbongco/09cd7a772643e4dd09af287161dc724b to your computer and use it in GitHub Desktop.
#-*- coding: UTF-8 -*-
from requests import get
from bs4 import BeautifulSoup
from random import choice
from time import sleep
# Pool of desktop browser User-Agent strings; one is picked at random
# (via random.choice) for the headers of outgoing search requests.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
]
def search(keyword, quantity=12):
    """Collect result links from Yahoo! Japan web search for ``keyword``.

    The keyword is wrapped in double quotes before it is sent (the same
    quoting the hand-built query string used), and the query parameters
    are URL-encoded by ``requests`` so keywords containing ``&``, ``#``,
    spaces or non-ASCII characters reach the server intact.

    Args:
        keyword: Text to search for.
        quantity: Rough number of results wanted. One request is made for
            each started block of 10, with the ``b`` parameter set to
            1, 11, 21, ... (presumably the 1-based result offset -- confirm
            against the search service). The default of 12 requests
            ``b=1`` and ``b=11``.

    Returns:
        A list with the ``href`` of every ``<a>`` found inside a
        ``<div class="hd">`` element, in page order. The list is not
        trimmed to ``quantity`` and may be empty.

    Raises:
        Whatever ``requests.get`` raises on connection errors or when the
        5 second timeout expires.
    """
    # A single random User-Agent is used for every request of this search.
    headers = {'User-Agent': choice(user_agent_list)}
    url_list = []
    for offset in range(1, quantity + 1, 10):
        response = get(
            'http://search.yahoo.co.jp/search',
            # Let requests percent-encode the values instead of formatting
            # them into the URL by hand; ``b`` is sent as a bare number.
            params={'p': '"%s"' % keyword, 'b': offset},
            headers=headers,
            timeout=5,
        )
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        for heading in soup.find_all('div', {'class': 'hd'}):
            for anchor in heading.find_all('a', href=True):
                url_list.append(anchor['href'])
    return url_list
# Demo run: print every link found for the keyword "test".
# print() with one argument behaves the same on Python 2 and Python 3.
for url in search("test"):
    print(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment