search.py by @kurohai, forked from lmyyao/search.py, created February 6, 2019 13:31
Spider that searches via the Google search engine or the Baidu search engine and scrapes the result pages.
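The script relies on the third-party packages visible in its imports (requests, beautifulsoup4, lxml, blinker, selenium) plus a local chromedriver binary whose path is set in DownLoaderConf; the Selenium calls (executable_path, chrome_options, find_elements_by_css_selector) appear to target the Selenium 3-era API. One possible way to install the Python dependencies:

pip install requests beautifulsoup4 lxml blinker selenium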
from __future__ import print_function
from bs4 import BeautifulSoup as bs
import requests
import logging
from itertools import count
import time
from blinker import signal
from lxml.html.clean import Cleaner
from lxml.html import tostring, fromstring, iterlinks
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import concurrent.futures
import traceback
from multiprocessing import Pool, TimeoutError
import re
DownLoaderConf = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "headless": False,
    "ignore-certificate-errors": True,
    # "proxy-server": "socks5://127.0.0.1:1080",
    "no-sandbox": True,
    "executable_path": "/Users/apple/spider/chromedriver",
}

def get_driver_opts(conf):
    # Translate the config dict into Chrome options plus the chromedriver path.
    opts = Options()
    path = None
    for k, v in conf.items():
        if isinstance(v, bool):
            if v:
                opts.add_argument("--{0}".format(k))
        elif k == "executable_path":
            path = v
        else:
            opts.add_argument("{0}={1}".format(k, v))
    assert path is not None, "webdriver path cannot be None"
    return path, opts

class DynamicDownLoader(object):
    def __init__(self, browser=None, timeout=10):
        if browser is None:
            path, opts = get_driver_opts(DownLoaderConf)
            self.browser = webdriver.Chrome(executable_path=path, chrome_options=opts)
            self.browser.set_page_load_timeout(timeout)
        else:
            self.browser = browser

    def run(self, req):
        self.browser.get(req)
        return self.browser.page_source

    def close(self):
        self.browser.quit()

class StaticDownLoader(object):
    def __init__(self):
        self.session = requests.Session()

    def run(self, req):
        # req is a plain URL here; fetch it and return the page source.
        response = self.session.get(req)
        return response.text

    def close(self):
        self.session.close()

class Engine(object):
    entry = None
    max_result = 20
    max_requests = 10
    DownloaderClass = DynamicDownLoader
    delay_time = 1
    before_crawl = signal("before_crawl")
    after_crawl = signal("after_crawl")

    def __init__(self, entry=None, DownloaderClass=None, **kwargs):
        if entry is not None:
            self.entry = entry
        elif not getattr(self, "entry", None):
            raise ValueError("%s must have an entry" % type(self).__name__)
        if DownloaderClass is not None:
            self.DownloaderClass = DownloaderClass
        elif not getattr(self, "DownloaderClass", None):
            raise ValueError("%s must have a downloader" % type(self).__name__)
        self.__dict__.update(kwargs)
        self.result = []
        self.counter = count()
        self.request_counter = 0

    def set_downloader(self, downloader=None, **kwargs):
        if downloader is None:
            self.downloader = self.DownloaderClass(**kwargs)
        else:
            self.downloader = downloader

    def crawl(self, key):
        if not hasattr(self, "downloader"):
            raise ValueError("%s must set_downloader before crawl" % type(self).__name__)
        try:
            self.before_crawl.send(self)
            while True:
                if len(self.result) > self.max_result or self.request_counter > self.max_requests:
                    break
                req = self.make_next_request(key)
                try:
                    response = self.downloader.run(req)
                    for item in self.parse(response):
                        print(item)
                        self.result.append(item)
                except Exception:
                    traceback.print_exc()
                self.request_counter += 1
                self.delay()
        except Exception:
            traceback.print_exc()
        finally:
            self.downloader.close()
            self.after_crawl.send(self, key=key)

    def make_next_request(self, key, **kwargs):
        raise NotImplementedError

    def parse(self, response):
        raise NotImplementedError

    def delay(self):
        time.sleep(self.delay_time)

class GGEngine(Engine):
    entry = "https://www.google.com/search?gl=us&q="

    def make_next_request(self, key):
        url = self.entry + key
        start = 10
        c = next(self.counter)
        if c == 0:
            return url
        else:
            return "{}&start={}".format(url, start * c)

    def parse(self, response):
        soup = bs(response, 'lxml')
        for item in soup.find_all("ol"):
            i = item.select_one("li:nth-of-type(1) a")
            if i:
                href = i["href"]
                u = self.resolve_href(href)
                if u:
                    yield u

    @staticmethod
    def resolve_href(url):
        # Pull the target URL out of a Google cached-page link.
        g = re.match(r"(.*?)cache:(.*?):(.*?)\+&cd(.*)", url)
        if g:
            return g.group(3)

class BDEngine(Engine):
    entry = "https://www.baidu.com/s?wd="

    def make_next_request(self, key):
        url = self.entry + key
        pn = 10
        c = next(self.counter)
        if c == 0:
            return url
        else:
            return "{}&pn={}".format(url, pn * c)

    def parse(self, response):
        # Baidu wraps result links in redirects, so click each result and
        # read the real URL from the newly opened browser tab.
        default_window = self.downloader.browser.current_window_handle
        for item in self.downloader.browser.find_elements_by_css_selector("a.c-showurl"):
            try:
                item.click()
                self.downloader.browser.switch_to_window(self.downloader.browser.window_handles.pop())
                yield self.downloader.browser.current_url
                self.downloader.browser.close()
                self.downloader.browser.switch_to_window(default_window)
                time.sleep(0.5)
            except Exception:
                continue

def static_get_source(url, timeout=10):
    if url.startswith("http"):
        response = requests.get(url, timeout=timeout)
    else:
        response = requests.get("http://{}".format(url), timeout=timeout)
    return response.content

def html2text(data):
    try:
        doc = fromstring(data)
        cleaner = Cleaner(style=True)
        doc = cleaner.clean_html(doc)
        return re.sub('[ \t\n]+', " ", doc.text_content())
    except Exception:
        traceback.print_exc()
        return ''

def dynamic_get_source(url, timeout=10):
    try:
        dspider = DynamicDownLoader(timeout=timeout)
        return dspider.run(url)
    except Exception:
        return ""
    finally:
        try:
            dspider.close()
        except Exception:
            pass

def spider(engine, downloader=None, dynamic=True):
    loader = dynamic_get_source if dynamic else static_get_source
    engine.set_downloader(downloader=downloader)

    @engine.after_crawl.connect
    def after_crawl(sender, key=""):
        # After a crawl finishes, fetch every collected URL concurrently,
        # strip the HTML down to text and save it under spider_out/.
        import os
        if not os.path.exists("spider_out"):
            os.makedirs("spider_out")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            tasks = {executor.submit(loader, url, 10): url for url in sender.result}
            counter = 1
            for f in concurrent.futures.as_completed(tasks):
                url = tasks[f]
                try:
                    data = f.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))
                else:
                    try:
                        data = html2text(data)
                        data = data.strip()
                        if data:
                            with open("spider_out/{}.html".format(key + str(counter)), "a+") as g:
                                g.write(data)
                    except Exception:
                        traceback.print_exc()
                        continue
                    finally:
                        counter += 1
    return engine

if __name__ == '__main__':
    import sys
    # Expects a file of search keywords, one per line, as the last CLI argument.
    with open(sys.argv[-1]) as f:
        for line in f:
            s = spider(BDEngine())
            s.crawl(line.strip())
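A minimal way to run this, assuming Chrome plus a matching chromedriver at the path configured in DownLoaderConf and a keywords file with one query per line (the file name keywords.txt is only an example):

python search.py keywords.txt

Collected result links are printed as they are found, and the scraped pages are written as text files under spider_out/.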