Skip to content

Instantly share code, notes, and snippets.

@janstieler
Created September 7, 2019 19:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save janstieler/2fc052f6527e8d19646bba211f7f3ad2 to your computer and use it in GitHub Desktop.
Save janstieler/2fc052f6527e8d19646bba211f7f3ad2 to your computer and use it in GitHub Desktop.
Python Scrapy spider that crawls a website for images that have an alt attribute.
import time

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class MySpider(CrawlSpider):
    """Collect every image that carries alt text from pages of datagon.de.

    The spider starts at ``start_urls`` and hands each page matched by
    ``rules`` to :meth:`parse_item`, which emits one item per ``<img>``
    element that has a non-empty ``alt`` attribute.

    NOTE(review): ``imageItem`` is not defined in this file; it is assumed
    to be a Scrapy item class with the fields ``url``, ``title``, ``iurl``,
    ``crawl_time`` and ``crawl_date`` -- confirm against the project's items
    module and import it here.
    """

    name = 'imageaggr'
    start_urls = ['https://www.datagon.de']
    rules = (
        # The empty ``allow`` pattern matches every URL; only links matching
        # ``defghi\.txt`` are excluded. Because a callback is given, ``follow``
        # keeps its default of False, so links found on the parsed pages are
        # not crawled any further.
        Rule(
            LinkExtractor(allow=('',), deny=(r'defghi\.txt',)),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Yield one ``imageItem`` per image on the page that has alt text.

        Args:
            response: The Scrapy response of a crawled page.

        Yields:
            An ``imageItem`` holding the page URL (``url``), the image's alt
            text (``title``), its raw ``src`` attribute (``iurl``, empty
            string when missing) and the crawl timestamp (``crawl_time`` in
            ``time.asctime`` format, ``crawl_date`` as ``YYYYMMDD``).
        """
        # Take the timestamp once so all items of a page share the same value.
        now = time.localtime()
        crawl_time = time.asctime(now)
        crawl_date = time.strftime("%Y%m%d", now)

        # Query alt/src relative to each <img> node. Indexing two separate
        # document-wide attribute lists would mispair them as soon as one
        # image lacks either attribute.
        for img in response.xpath('//img'):
            title = img.xpath('@alt').extract_first(default='')
            if not title:
                # Only images with alt text are of interest; skip this one
                # but keep processing the rest of the page.
                continue

            item = imageItem()
            item['url'] = response.url
            item['title'] = title
            item['iurl'] = img.xpath('@src').extract_first(default='')
            item['crawl_time'] = crawl_time
            item['crawl_date'] = crawl_date
            yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment