Last active
October 26, 2016 08:04
-
-
Save bongbongco/09cd7a772643e4dd09af287161dc724b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: UTF-8 -*-
from __future__ import print_function

from random import choice
from time import sleep

from bs4 import BeautifulSoup
from requests import get
# Pool of desktop browser User-Agent strings; search() picks one at random
# for each call so that requests do not all carry the same identifier.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
]
def search(keyword, quantity=12):
    """Collect result links from search.yahoo.co.jp for a quoted keyword.

    The keyword is wrapped in double quotes before being sent as the ``p``
    query parameter, and result pages are requested ten offsets apart
    (``b`` = 1, 11, 21, ...).  From each page, the ``href`` of every ``<a>``
    found inside a ``<div class="hd">`` element is collected.

    Args:
        keyword: Text to search for.  It is passed through ``params`` so
            that ``requests`` URL-encodes it.
        quantity: Integer hint for how many results are wanted.  It only
            determines the number of pages fetched (``quantity // 10 + 1``);
            the returned list is not truncated to this length.  A negative
            value small enough to make that count zero fetches nothing.

    Returns:
        A list of ``href`` strings in the order they were encountered.
        The list is empty when no matching elements were found.

    Raises:
        requests.exceptions.RequestException: propagated from ``get``, for
            example on a connection failure or when ``timeout=5`` expires.
    """
    url_list = []
    # One User-Agent is chosen per call and reused for every page request.
    headers = {'User-Agent': choice(user_agent_list)}

    page_count = quantity // 10 + 1
    for page_index in range(page_count):
        response = get(
            'http://search.yahoo.co.jp/search',
            # Letting requests build the query string escapes characters such
            # as '&', '#' and spaces in the keyword.  The offset is sent as a
            # plain integer; the earlier literal quotes around it were
            # presumably unintended -- TODO confirm against the live site.
            params={'p': '"%s"' % keyword, 'b': page_index * 10 + 1},
            headers=headers,
            timeout=5,
        )
        # Parse the already-decoded text with an explicit parser so the
        # result does not depend on which optional parsers are installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        for block in soup.find_all('div', {'class': 'hd'}):
            for anchor in block.find_all('a', href=True):
                url_list.append(anchor['href'])
    return url_list
# Demo run: print every link collected for the sample keyword "test".
# The call form of print works on both Python 2 and Python 3.
for url in search("test"):
    print(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment