@lmyyao
Last active July 19, 2016 06:45
Periodic Celery + Scrapy spider
from datetime import timedelta

from celery.task import PeriodicTask


class Lmy(PeriodicTask):
    # Fire once a minute via the beat scheduler.
    run_every = timedelta(seconds=60)
    # Celery queue routing: publish this task to the default exchange/queue.
    options = {"exchange": "default", "routing_key": "default"}
    name = "xxxxx"

    def run(self):
        # Scrapy is imported inside run() so the worker can load this
        # module without requiring Scrapy at import time.
        import scrapy
        from scrapy.crawler import CrawlerProcess

        URL = ('http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1'
               '&st=-1&fr=&sf=1&fmq=1468240179138_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width='
               '&height=&face=0&istype=2&itg=0&uptype=urlsearch&ie=utf-8&word=%E6%9F%B3%E5%B2%A9')

        class LiuYanSpider(scrapy.Spider):
            name = "xxxxx"
            start_urls = [URL]

            def parse(self, response):
                # Extract the path fragments of hiphotos image URLs and
                # print the reassembled .jpg URLs.
                urls = response.selector.re(r"http://g\.hiphotos\.baidu\.com(.*?)\.jpg")
                for i in urls:
                    print("http://g.hiphotos.baidu.com{}.jpg".format(i))

        process = CrawlerProcess({
            'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'LOG_ENABLED': False,
        })
        process.crawl(LiuYanSpider)
        process.start()  # blocks until the crawl finishes
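
Caveat: CrawlerProcess starts the Twisted reactor, and a reactor cannot be restarted within one process, so the second time the scheduler fires this task in the same worker process, process.start() will raise ReactorNotRestartable. Below is a minimal sketch of one workaround, assuming it is acceptable to fork a child process per run; the _crawl and LmySafe names and the "xxxxx.safe" task name are illustrative, not from the gist.

import multiprocessing
from datetime import timedelta

from celery.task import PeriodicTask


def _crawl():
    # Runs in a child process, so every invocation gets a fresh reactor.
    import scrapy
    from scrapy.crawler import CrawlerProcess

    URL = ('http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1'
           '&st=-1&fr=&sf=1&fmq=1468240179138_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width='
           '&height=&face=0&istype=2&itg=0&uptype=urlsearch&ie=utf-8&word=%E6%9F%B3%E5%B2%A9')

    class LiuYanSpider(scrapy.Spider):
        name = "xxxxx"
        start_urls = [URL]

        def parse(self, response):
            for path in response.selector.re(r"http://g\.hiphotos\.baidu\.com(.*?)\.jpg"):
                print("http://g.hiphotos.baidu.com{}.jpg".format(path))

    process = CrawlerProcess({'LOG_ENABLED': False})
    process.crawl(LiuYanSpider)
    process.start()


class LmySafe(PeriodicTask):
    run_every = timedelta(seconds=60)
    options = {"exchange": "default", "routing_key": "default"}
    name = "xxxxx.safe"

    def run(self):
        # One child process per run; join so the task's duration
        # reflects the crawl itself.
        p = multiprocessing.Process(target=_crawl)
        p.start()
        p.join()

With the old celery.task API used here (Celery 3.x), a worker started with the embedded beat scheduler, e.g. celery worker -A yourapp -B, should pick up either periodic task, assuming the module is importable by the Celery app.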