# janstieler/scarppy_img.py

## scarppy_img.py
import time

import scrapy

class MySpider(CrawlSpider):
    """Spider that collects image metadata from pages linked on datagon.de.

    Links selected by ``rules`` are passed to ``parse_item``, which emits one
    ``imageItem`` per ``<img>`` element that carries alt text.

    NOTE(review): ``CrawlSpider``, ``Rule``, ``SgmlLinkExtractor``,
    ``Selector`` and ``imageItem`` are not imported in the visible part of
    this file; they must be brought into scope for the module to load.
    """

    name = 'imageaggr'
    start_urls = ['https://www.datagon.de']

    rules = (
        # The empty ``allow`` pattern accepts every URL; only URLs matching
        # the ``deny`` pattern are rejected. Accepted links are handed to
        # parse_item.
        # NOTE(review): per the Scrapy docs a Rule that has a callback does
        # not follow links by default -- confirm this crawl depth is intended.
        Rule(
            SgmlLinkExtractor(allow=('',), deny=(r'defghi\.txt',)),
            callback='parse_item',
        ),

        # Disabled alternatives, kept for reference:
        # Rule(SgmlLinkExtractor(allow=('\.cms','\.html' )), deny=('parse_item\.html'))),
        # Rule(SgmlLinkExtractor(allow=('news', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Yield one ``imageItem`` for each ``<img>`` on the page with alt text.

        Attributes are read relative to each ``<img>`` node, so an image that
        lacks ``alt`` or ``src`` can no longer shift the values of its
        neighbours or raise ``IndexError``. Images without alt text are
        skipped; a missing ``src`` is stored as an empty string.

        Args:
            response: The page response handed to the callback.

        Yields:
            imageItem: Populated with ``url`` (page URL), ``title`` (alt
            text), ``iurl`` (image ``src``), ``crawl_time`` and
            ``crawl_date``.
        """
        # One timestamp per page keeps every item from a response consistent.
        now = time.localtime()
        crawl_time = time.asctime(now)
        crawl_date = time.strftime("%Y%m%d", now)

        for image in Selector(response).xpath('//img'):
            alt_values = image.xpath('@alt').extract()
            title = alt_values[0] if alt_values else ''
            if not title:
                continue

            src_values = image.xpath('@src').extract()

            item = imageItem()
            item['url'] = response.url
            item['title'] = title
            item['iurl'] = src_values[0] if src_values else ''
            item['crawl_time'] = crawl_time
            item['crawl_date'] = crawl_date
            # Yield rather than return so every image is emitted, not just
            # the first one.
            yield item
	import scrapy

	class MySpider(CrawlSpider):
	    """Spider that collects image metadata from pages linked on datagon.de.

	    Links selected by ``rules`` are passed to ``parse_item``, which emits
	    one ``imageItem`` per ``<img>`` element that carries alt text.

	    NOTE(review): this redefines the ``MySpider`` class declared earlier
	    in this file; consider keeping only one copy.
	    NOTE(review): ``CrawlSpider``, ``Rule``, ``SgmlLinkExtractor``,
	    ``Selector`` and ``imageItem`` are not imported in the visible part
	    of this file; they must be brought into scope for the module to load.
	    """

	    name = 'imageaggr'
	    start_urls = ['https://www.datagon.de']

	    rules = (
	        # The empty ``allow`` pattern accepts every URL; only URLs matching
	        # the ``deny`` pattern are rejected. Accepted links are handed to
	        # parse_item.
	        # NOTE(review): per the Scrapy docs a Rule that has a callback does
	        # not follow links by default -- confirm this crawl depth is
	        # intended.
	        Rule(
	            SgmlLinkExtractor(allow=('',), deny=(r'defghi\.txt',)),
	            callback='parse_item',
	        ),

	        # Disabled alternatives, kept for reference:
	        # Rule(SgmlLinkExtractor(allow=('\.cms','\.html' )), deny=('parse_item\.html'))),
	        # Rule(SgmlLinkExtractor(allow=('news', )), callback='parse_item'),
	    )

	    def parse_item(self, response):
	        """Yield one ``imageItem`` for each ``<img>`` on the page with alt text.

	        Attributes are read relative to each ``<img>`` node, so an image
	        that lacks ``alt`` or ``src`` cannot shift the values of its
	        neighbours or raise ``IndexError``. Images without alt text are
	        skipped; a missing ``src`` is stored as an empty string.

	        Args:
	            response: The page response handed to the callback.

	        Yields:
	            imageItem: Populated with ``url`` (page URL), ``title`` (alt
	            text), ``iurl`` (image ``src``), ``crawl_time`` and
	            ``crawl_date``.
	        """
	        # One timestamp per page keeps every item from a response consistent.
	        now = time.localtime()
	        crawl_time = time.asctime(now)
	        crawl_date = time.strftime("%Y%m%d", now)

	        for image in Selector(response).xpath('//img'):
	            alt_values = image.xpath('@alt').extract()
	            title = alt_values[0] if alt_values else ''
	            if not title:
	                continue

	            src_values = image.xpath('@src').extract()

	            item = imageItem()
	            item['url'] = response.url
	            item['title'] = title
	            item['iurl'] = src_values[0] if src_values else ''
	            item['crawl_time'] = crawl_time
	            item['crawl_date'] = crawl_date
	            # Yield rather than return so every image is emitted, not just
	            # the first one.
	            yield item