Skip to content

Instantly share code, notes, and snippets.

@lowweihong
Created August 3, 2019 12:20
Show Gist options
  • Save lowweihong/756c47dde15b569fddbe40c50da36774 to your computer and use it in GitHub Desktop.
Save lowweihong/756c47dde15b569fddbe40c50da36774 to your computer and use it in GitHub Desktop.
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider handler that reads one engagement figure from similarweb.com.

    The crawl runs in three steps:

    1. ``on_start`` requests the site's home page.
    2. ``index_page`` forwards the cookies from that response to a request
       for the google.com report page.
    3. ``index_page_1`` extracts a label/value pair from the report page.
    """

    # No project-wide crawl defaults; every request passes its own options.
    crawl_config = {}

    # Browser-like request headers attached to every request made below.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
        "Host": "www.similarweb.com",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the home page request; its response goes to ``index_page``.

        The ``@every`` decorator is given an interval of 24 * 60 minutes.
        """
        self.crawl(
            'https://www.similarweb.com/',
            callback=self.index_page,
            headers=self.headers,
            fetch_type='chrome',
            validate_cert=False,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Print the home page cookies and request the google.com report.

        The cookies received with ``response`` are passed along with the
        follow-up request, whose response is handled by ``index_page_1``.
        """
        session_cookies = response.cookies
        print(session_cookies)
        self.crawl(
            'https://www.similarweb.com/website/google.com',
            callback=self.index_page_1,
            cookies=session_cookies,
            headers=self.headers,
            fetch_type='chrome',
            validate_cert=False,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page_1(self, response):
        """Return ``{label_text: first_value_token}`` from the report page.

        The key is the full text of the elements matching the label
        selector; the value is the first whitespace-separated token of the
        text matching the value selector.

        Raises:
            IndexError: if the value text is empty or only whitespace, so
                that splitting it yields no tokens.
        """
        label_text = response.doc(
            'span.engagementInfo-param.engagementInfo-param--large.u-text-ellipsis'
        ).text()
        value_tokens = response.doc(
            'span.engagementInfo-valueNumber.js-countValue'
        ).text().split()
        # Only the first token of the value text is kept.
        return {label_text: value_tokens[0]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment