Skip to content

Instantly share code, notes, and snippets.

#
#
#
#
#
#
#
#
#
###########
[{'file': ['2017-72',
[{'content': [{}],
'metadata': [{'date': [],
'dateCreated': [],
'dateIssued': ['2017-03-15'],
'dateMod': ['2017-03-15'],
'docType': ['Orders'],
'keywords': ['Telecommunications, TELUS '
'Communications Company, Tariff '
'approval, Tariff applications'],
@mediagestalt
mediagestalt / content2_spider.py
Last active March 5, 2017 18:47
Scrapy spider code for CRTC content
import scrapy
from crtc.items import CrtcItem
class Content2Spider(scrapy.Spider):
    """Scrapy spider named ``content2`` seeded with three CRTC archive pages."""

    name = "content2"

    # Per-spider download delay (Scrapy interprets this value in seconds).
    download_delay = 5

    # Scrapy's offsite filtering compares these entries against the request
    # *hostname*, so they must be bare domains. The previous value,
    # 'www.crtc.gc.ca/eng/archive', included a URL path and therefore could
    # never match any host. Note that a domain entry cannot restrict the
    # crawl to the /eng/archive path; that has to be done where links are
    # selected.
    allowed_domains = ['www.crtc.gc.ca']

    start_urls = [
        'http://www.crtc.gc.ca/eng/archive/2016/2016-491.htm',
        'http://www.crtc.gc.ca/eng/archive/1997/DB97-1.htm',
        'http://www.crtc.gc.ca/eng/archive/1998/C98-428.HTM',
    ]