Skip to content

Instantly share code, notes, and snippets.

@lucndm
Created December 25, 2015 06:24
Show Gist options
  • Save lucndm/2942b3db9ff13e51af8e to your computer and use it in GitHub Desktop.
Crawler 2sao.vn
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from manga_scrapy.items import VideoItem
#2sao.vn
class SaoVnSpider(CrawlSpider):
    """Crawl video clip pages on 2sao.vn and extract YouTube links.

    Follows in-domain links matching the ``/clip/`` URL pattern and, for
    each page, yields a ``VideoItem`` carrying the canonical URL, the
    title, and a normalized YouTube watch URL (or a placeholder string
    when the page embeds no YouTube iframe).
    """

    name = '2saovn'
    allowed_domains = ['2sao.vn']
    start_urls = [
        'http://2sao.vn/clip/nguoi-dan-xuong-pho-vui-noel-p0c1066n20151224193946087.vnn'
    ]
    rules = [
        Rule(LinkExtractor(allow=r'/clip/[-a-z0-9\._]+'),
             callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        """Build a VideoItem from one clip page.

        :param response: the downloaded page response
        :returns: a populated ``VideoItem``
        """
        item = VideoItem()
        # The Open Graph meta tags hold the canonical URL and headline.
        item['url'] = response.xpath("//meta[@property='og:url']/@content").extract()[0]
        item['title'] = response.xpath("//meta[@property='og:title']/@content").extract()[0]
        # BUG FIX: the original used extract()[0], which raises IndexError
        # on pages without a YouTube iframe — the `is None` fallback below
        # was unreachable. Guard the empty-list case explicitly so the
        # placeholder branch actually fires.
        matches = response.xpath("//iframe[contains(@src,'youtube')]/@src").extract()
        youtube_link = matches[0] if matches else None
        if youtube_link is None:
            item['video_link'] = 'Not Found Link !'
        else:
            # Normalize an embed URL such as
            #   //www.youtube.com/embed/<id>?rel=0
            # into
            #   www.youtube.com/watch?v=<id>
            item['video_link'] = youtube_link.replace('//', '').replace('embed/', '/watch?v=').replace('?rel=0', '')
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment