Skip to content

Instantly share code, notes, and snippets.

@kezhenxu94
Last active March 30, 2018 08:09
Show Gist options
  • Save kezhenxu94/77a440bd2c6dad500e818b10ee5e7639 to your computer and use it in GitHub Desktop.
Save kezhenxu94/77a440bd2c6dad500e818b10ee5e7639 to your computer and use it in GitHub Desktop.
mmonly.cc Spider
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mmonly_cc.items import MmonlyCcItem
import re
import datetime
class MmonlySpider(CrawlSpider):
    """Crawl the ``/mmtp/`` section of mmonly.cc and emit one item per picture page.

    Starting from ``start_urls``, the crawl rules follow links that look like
    picture detail pages (handled by :meth:`parse_item`) as well as links that
    look like category or paginated listing pages (followed only, since that
    rule has no callback).
    """

    name = 'mmonly'
    allowed_domains = ['mmonly.cc']
    start_urls = ['http://mmonly.cc/mmtp/']

    # Raw strings keep the regex escapes (\d, \.) intact instead of relying on
    # Python passing unknown string escapes through, which is deprecated.
    rules = [
        # Detail pages such as ".../mmtp/<section>/123.html" or ".../123_4.html".
        Rule(
            link_extractor=LinkExtractor(allow=r'.*/mmtp/.*/\d+(_\d+)?\.html'),
            callback='parse_item',
            follow=True,
        ),
        # Section index pages and their "list_<a>_<b>.html" pagination.
        Rule(
            link_extractor=LinkExtractor(
                allow=r'.*/mmtp/.*/(list_\d+_\d+\.html)?'
            )
        ),
    ]

    # Trailing "(current/total)" counter appended to the title of each page.
    _PAGE_COUNTER_RE = re.compile(r'\(\d+/\d+\)$')
    # First path segment after /mmtp/ is used as the item's category.
    _CATEGORY_RE = re.compile(r'.*/mmtp/(?P<category>\w+)/.*')

    def parse_item(self, response):
        """Build a ``MmonlyCcItem`` from a picture detail page.

        Args:
            response: The Scrapy response for a page matched by the first
                crawl rule.

        Yields:
            A single ``MmonlyCcItem`` carrying the cleaned title, the absolute
            image URL, the page URL, the category parsed from the page URL
            (empty string when it cannot be parsed) and the scrape time.
            Nothing is yielded when the page lacks a title or a main image.
        """
        title_tags = response.css('div.imgtitle h1')
        if not title_tags:
            return
        title = title_tags[0].css('::text').extract_first()
        if title is None:
            # An <h1> without a direct text node: nothing to clean, and
            # re.sub() would raise TypeError on None.
            return
        title = self._PAGE_COUNTER_RE.sub('', title)

        image_tags = response.css('#big-pic img')
        if not image_tags:
            return
        image_src = image_tags[0].css('::attr(src)').extract_first()
        if not image_src:
            # urljoin() with an empty/None URL returns the page URL itself,
            # which would be recorded as a bogus image URL.
            return
        image_url = response.urljoin(image_src)

        matcher = self._CATEGORY_RE.match(response.url)
        category = matcher.group('category') if matcher else ''

        yield MmonlyCcItem(
            title=title,
            image_urls=[image_url],
            source_url=response.url,
            category=category,
            updated_at=datetime.datetime.now(),
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment