Skip to content

Instantly share code, notes, and snippets.

@Xevion
Created August 8, 2020 00:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Xevion/0118d6b8b45b97348cd7333534a0bd01 to your computer and use it in GitHub Desktop.
Save Xevion/0118d6b8b45b97348cd7333534a0bd01 to your computer and use it in GitHub Desktop.
A simple Scrapy-based scraper for the officequotes.net site.
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# URLs of every response handled by FollowAllSpider.parse_item; the list is
# serialized to urls.json when the crawl finishes.
urls = []


class FollowAllSpider(CrawlSpider):
    """Crawl officequotes.net by following links and record each page URL.

    Each response delivered to ``parse_item`` has its URL appended to the
    module-level ``urls`` list. When the spider closes, that list is written
    to ``urls.json`` in the current working directory.
    """

    name = 'follow_all'
    start_urls = ['https://www.officequotes.net']
    allowed_domains = ['officequotes.net']
    # One rule: a LinkExtractor constructed with no filters, whose links are
    # both handed to parse_item and followed further (follow=True).
    rules = [Rule(LinkExtractor(), callback='parse_item', follow=True)]

    def parse_item(self, response):
        """Record the URL of a crawled response.

        Args:
            response: The response passed in by the crawl rule's callback.
        """
        urls.append(response.url)

    def closed(self, reason):
        """Write the collected URLs to ``urls.json`` when the spider closes.

        ``closed`` is Scrapy's documented shutdown hook. Using it avoids
        overriding ``Spider.close``, the framework's own static signal
        handler, which looks up and calls ``closed`` itself.

        Args:
            reason: Why the spider was closed, as supplied by Scrapy.
                It is not used here.
        """
        with open('urls.json', 'w', encoding='utf-8') as file:
            json.dump(urls, file)
[
"https://www.officequotes.net",
"https://www.officequotes.net/no3-14.php",
"https://www.officequotes.net/no3-13.php",
"https://www.officequotes.net/no3-15.php",
"https://www.officequotes.net/no3-17.php",
"https://www.officequotes.net/no3-16.php",
"https://www.officequotes.net/no3-20.php",
"https://www.officequotes.net/no3-19.php",
"https://www.officequotes.net/no3-18.php",
"https://www.officequotes.net/no3-07.php",
"https://www.officequotes.net/no3-12.php",
"https://www.officequotes.net/creedThoughts.php",
"https://www.officequotes.net/webisodes-the3rdfloor.php",
"https://www.officequotes.net/webisodes-subtleSexuality.php",
"https://www.officequotes.net/no3-11.php",
"https://www.officequotes.net/no3-10.php",
"https://www.officequotes.net/webisodes-kevinsLoan.php",
"https://www.officequotes.net/webisodes.php",
"https://www.officequotes.net/no9-21.php",
"https://www.officequotes.net/no9-20.php",
"https://www.officequotes.net/disclaimer.php",
"https://www.officequotes.net/no3-09.php",
"https://www.officequotes.net/no9-19.php",
"https://www.officequotes.net/no3-06.php",
"https://www.officequotes.net/no3-08.php",
"https://www.officequotes.net/no9-16.php",
"https://www.officequotes.net/no9-14.php",
"https://www.officequotes.net/cdn-cgi/l/email-protection",
"https://www.officequotes.net/no9-13.php",
"https://www.officequotes.net/no9-12.php",
"https://www.officequotes.net/no9-11.php",
"https://www.officequotes.net/no9-17.php",
"https://www.officequotes.net/no9-15.php",
"https://www.officequotes.net/no9-18.php",
"https://www.officequotes.net/schruteSpace.php",
"https://www.officequotes.net/no9-22.php",
"https://www.officequotes.net/no9-23.php",
"https://www.officequotes.net/no9-10.php",
"https://www.officequotes.net/no9-07.php",
"https://www.officequotes.net/no9-09.php",
"https://www.officequotes.net/no9-06.php",
"https://www.officequotes.net/no9-05.php",
"https://www.officequotes.net/no9-08.php",
"https://www.officequotes.net/no9-04.php",
"https://www.officequotes.net/no9-03.php",
"https://www.officequotes.net/no9-02.php",
"https://www.officequotes.net/no8-20.php",
"https://www.officequotes.net/no9-01.php",
"https://www.officequotes.net/no8-24.php",
"https://www.officequotes.net/no8-23.php",
"https://www.officequotes.net/no8-22.php",
"https://www.officequotes.net/no8-21.php",
"https://www.officequotes.net/no8-18.php",
"https://www.officequotes.net/no8-19.php",
"https://www.officequotes.net/no8-17.php",
"https://www.officequotes.net/no8-16.php",
"https://www.officequotes.net/no8-15.php",
"https://www.officequotes.net/no8-14.php",
"https://www.officequotes.net/no8-13.php",
"https://www.officequotes.net/no8-12.php",
"https://www.officequotes.net/no8-11.php",
"https://www.officequotes.net/no8-10.php",
"https://www.officequotes.net/no8-09.php",
"https://www.officequotes.net/no8-07.php",
"https://www.officequotes.net/no8-06.php",
"https://www.officequotes.net/no8-05.php",
"https://www.officequotes.net/no8-04.php",
"https://www.officequotes.net/no8-08.php",
"https://www.officequotes.net/no8-03.php",
"https://www.officequotes.net/no8-02.php",
"https://www.officequotes.net/no7-22.php",
"https://www.officequotes.net/no7-21.php",
"https://www.officequotes.net/no8-01.php",
"https://www.officequotes.net/no7-20.php",
"https://www.officequotes.net/no7-24.php",
"https://www.officequotes.net/no7-23.php",
"https://www.officequotes.net/no7-19.php",
"https://www.officequotes.net/no7-18.php",
"https://www.officequotes.net/no7-17.php",
"https://www.officequotes.net/no7-16.php",
"https://www.officequotes.net/no7-15.php",
"https://www.officequotes.net/no7-14.php",
"https://www.officequotes.net/no7-13.php",
"https://www.officequotes.net/no7-12.php",
"https://www.officequotes.net/no7-11.php",
"https://www.officequotes.net/no7-10.php",
"https://www.officequotes.net/no7-09.php",
"https://www.officequotes.net/no7-08.php",
"https://www.officequotes.net/no7-07.php",
"https://www.officequotes.net/no7-06.php",
"https://www.officequotes.net/no7-05.php",
"https://www.officequotes.net/no7-04.php",
"https://www.officequotes.net/no7-02.php",
"https://www.officequotes.net/no7-03.php",
"https://www.officequotes.net/no7-01.php",
"https://www.officequotes.net/no6-24.php",
"https://www.officequotes.net/no6-22.php",
"https://www.officequotes.net/no6-23.php",
"https://www.officequotes.net/no6-21.php",
"https://www.officequotes.net/no6-20.php",
"https://www.officequotes.net/no6-19.php",
"https://www.officequotes.net/no6-18.php",
"https://www.officequotes.net/no6-17.php",
"https://www.officequotes.net/no6-16.php",
"https://www.officequotes.net/no6-15.php",
"https://www.officequotes.net/no6-13.php",
"https://www.officequotes.net/no6-14.php",
"https://www.officequotes.net/no6-12.php",
"https://www.officequotes.net/no6-11.php",
"https://www.officequotes.net/no6-10.php",
"https://www.officequotes.net/no6-09.php",
"https://www.officequotes.net/no6-08.php",
"https://www.officequotes.net/no6-07.php",
"https://www.officequotes.net/no6-06.php",
"https://www.officequotes.net/no6-05.php",
"https://www.officequotes.net/no6-04.php",
"https://www.officequotes.net/no6-03.php",
"https://www.officequotes.net/no6-02.php",
"https://www.officequotes.net/no6-01.php",
"https://www.officequotes.net/no5-26.php",
"https://www.officequotes.net/no5-25.php",
"https://www.officequotes.net/no5-24.php",
"https://www.officequotes.net/no5-23.php",
"https://www.officequotes.net/no5-22.php",
"https://www.officequotes.net/no5-21.php",
"https://www.officequotes.net/no5-20.php",
"https://www.officequotes.net/no5-19.php",
"https://www.officequotes.net/no5-16.php",
"https://www.officequotes.net/no5-17.php",
"https://www.officequotes.net/no5-15.php",
"https://www.officequotes.net/no5-14.php",
"https://www.officequotes.net/no5-13.php",
"https://www.officequotes.net/no5-12.php",
"https://www.officequotes.net/no5-11.php",
"https://www.officequotes.net/no5-10.php",
"https://www.officequotes.net/no5-09.php",
"https://www.officequotes.net/no5-08.php",
"https://www.officequotes.net/no5-07.php",
"https://www.officequotes.net/no5-05.php",
"https://www.officequotes.net/no5-06.php",
"https://www.officequotes.net/no5-04.php",
"https://www.officequotes.net/no5-03.php",
"https://www.officequotes.net/no5-02.php",
"https://www.officequotes.net/no5-01.php",
"https://www.officequotes.net/no4-13.php",
"https://www.officequotes.net/no4-14.php",
"https://www.officequotes.net/no4-12.php",
"https://www.officequotes.net/no4-10.php",
"https://www.officequotes.net/no4-11.php",
"https://www.officequotes.net/no4-09.php",
"https://www.officequotes.net/no4-07.php",
"https://www.officequotes.net/no4-06.php",
"https://www.officequotes.net/no4-05.php",
"https://www.officequotes.net/no4-04.php",
"https://www.officequotes.net/no4-03.php",
"https://www.officequotes.net/no4-02.php",
"https://www.officequotes.net/no3-22.php",
"https://www.officequotes.net/no3-21.php",
"https://www.officequotes.net/no3-05.php",
"https://www.officequotes.net/no3-04.php",
"https://www.officequotes.net/no3-02.php",
"https://www.officequotes.net/no3-03.php",
"https://www.officequotes.net/no4-01.php",
"https://www.officequotes.net/no3-23.php",
"https://www.officequotes.net/no2-19.php",
"https://www.officequotes.net/no2-17.php",
"https://www.officequotes.net/no2-16.php",
"https://www.officequotes.net/no3-01.php",
"https://www.officequotes.net/no2-22.php",
"https://www.officequotes.net/no2-21.php",
"https://www.officequotes.net/no2-20.php",
"https://www.officequotes.net/no2-18.php",
"https://www.officequotes.net/no2-15.php",
"https://www.officequotes.net/no2-14.php",
"https://www.officequotes.net/no2-13.php",
"https://www.officequotes.net/no2-12.php",
"https://www.officequotes.net/no2-11.php",
"https://www.officequotes.net/no2-10.php",
"https://www.officequotes.net/no2-09.php",
"https://www.officequotes.net/no2-07.php",
"https://www.officequotes.net/no2-08.php",
"https://www.officequotes.net/no2-06.php",
"https://www.officequotes.net/no2-05.php",
"https://www.officequotes.net/no2-04.php",
"https://www.officequotes.net/no2-03.php",
"https://www.officequotes.net/no2-02.php",
"https://www.officequotes.net/no2-01.php",
"https://www.officequotes.net/no1-06.php",
"https://www.officequotes.net/no1-05.php",
"https://www.officequotes.net/no1-04.php",
"https://www.officequotes.net/no1-03.php",
"https://www.officequotes.net/no1-02.php",
"https://www.officequotes.net/no1-01.php",
"https://www.officequotes.net/linksPage.php",
"https://www.officequotes.net/specialThanks.php",
"https://www.officequotes.net/fanLove.php",
"https://www.officequotes.net/topQuotes.php",
"https://www.officequotes.net/contactMe.php"
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment