Skip to content

Instantly share code, notes, and snippets.

@Crazy-Owl
Created December 9, 2011 16:20
Show Gist options
  • Save Crazy-Owl/1452202 to your computer and use it in GitHub Desktop.
Save Crazy-Owl/1452202 to your computer and use it in GitHub Desktop.
Паукан
#coding: utf-8
import os
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
class ZavtraSpider(CrawlSpider):
    """Crawl zavtra.ru from the "denlit" index page and save pages to disk.

    Links whose URL ends in ``<digits>/<name>.html`` are followed, and each
    matching response is handed to :meth:`parse_page`, which writes the raw
    response body under the local ``crawled`` directory.
    """

    name = "zavtra"
    allowed_domains = ["zavtra.ru"]
    start_urls = [
        "http://zavtra.ru/denlit/lit_index.html"
    ]
    rules = (
        # Raw string: the pattern contains regex escapes (\d, \w, \.) that
        # must reach the regex engine untouched.
        Rule(
            RegexLinkExtractor(allow=(r"\d+/[\w\d]+\.html$",)),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Write the body of *response* to ``crawled/<issue>/<filename>``.

        ``issue`` and ``filename`` are the last two ``/``-separated segments
        of ``response.url``. When the second-to-last segment is ``denlit``
        the file is stored directly in ``crawled``.

        Raises:
            OSError: if the target directory cannot be created and does not
                already exist.
        """
        issue, filename = response.url.split('/')[-2:]
        if issue == "denlit":
            # Pages that live directly under /denlit/ get no sub-directory.
            issue = ""
        target_dir = os.path.join('crawled', issue)
        try:
            os.makedirs(target_dir)
        except OSError:
            # An already existing directory is expected on every page after
            # the first one; any other failure is a real error.
            if not os.path.isdir(target_dir):
                raise
        with open(os.path.join(target_dir, filename), "wb") as f:
            f.write(response.body)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment