Skip to content

Instantly share code, notes, and snippets.

@ygrenzinger
Created March 9, 2021 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ygrenzinger/7d5c6f18197eff8309ce0a7d12d25e4d to your computer and use it in GitHub Desktop.
Save ygrenzinger/7d5c6f18197eff8309ce0a7d12d25e4d to your computer and use it in GitHub Desktop.
Scraping scoop.it
# coding=utf-8
# Scrape the posts of a scoop.it topic, page by page, and save them to a JSON file.
import scrapy
from requests import get
from scrapy import Selector
import json
# class BlogSpider(scrapy.Spider):
# name = 'blogspider'
# start_urls = ['https://www.zyte.com/blog/']
#
# def parse(self, response):
# for title in response.css('.oxy-post-title'):
# yield {'title': title.css('::text').get()}
#
# for next_page in response.css('a.next'):
# yield response.follow(next_page, self.parse)
def retrieve_posts(topic_url, number, posts):
    """Fetch one page of a topic and append the posts found on it to ``posts``.

    Args:
        topic_url: URL of the topic; ``?page=<number>`` is appended to it.
        number: Page number to request.
        posts: List extended in place with one dict per post. Each dict has
            the keys ``"title"`` and ``"url"``, plus ``"description"`` when
            the description selector yields text.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = get(topic_url + "?page=" + str(number), timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page, which
    # would silently contribute zero posts to the result.
    response.raise_for_status()

    for post_elmt in Selector(text=response.text).css(".postView"):
        post_url = post_elmt.css(".postTitleView a::attr(href)").get()
        post_title = post_elmt.css(".postTitleView a::text").get()
        # Skip elements that lack a title link; they cannot be recorded.
        if post_url is None or post_title is None:
            continue

        post = {"title": post_title.strip(), "url": post_url}
        post_description = post_elmt.css(".post-description blockquote::text").get()
        if post_description:
            post["description"] = post_description.strip()
        posts.append(post)
def max_page(topic_url):
    """Return the highest page number listed in the topic's pagination links.

    Args:
        topic_url: URL of the topic page to inspect.

    Returns:
        The largest ``data-page`` value found on the pagination links, or 1
        when the page has no pagination links (presumably a topic that fits
        on a single page).

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        ValueError: If a ``data-page`` attribute is not an integer.
    """
    # A timeout keeps a stalled connection from hanging the script.
    response = get(topic_url, timeout=30)
    # Without this check an error page (which has no pagination either)
    # would be indistinguishable from a genuine single-page topic.
    response.raise_for_status()

    page_numbers = (
        Selector(text=response.text)
        .css("nav.pagination li a::attr(data-page)")
        .getall()
    )
    # default=1 avoids the "max() arg is an empty sequence" crash when the
    # page has no pagination links.
    return max((int(x) for x in page_numbers), default=1)
def parse_topic(topic_url, file_name):
    """Scrape every page of a topic and write the collected posts as JSON.

    Args:
        topic_url: URL of the topic to scrape.
        file_name: Path of the JSON file to write; an existing file is
            overwritten.
    """
    posts = []
    # Pages are numbered from 1 up to and including max_page(topic_url).
    for page_number in range(1, max_page(topic_url) + 1):
        # Report the page actually being requested (the loop index used to
        # be printed, which was off by one).
        print("retrieving page " + str(page_number))
        retrieve_posts(topic_url, page_number, posts)

    with open(file_name, 'w', encoding="utf-8") as outfile:
        json.dump(posts, outfile, indent=2)
if __name__ == '__main__':
    # Script entry point: scrape the hard-coded topic into a local JSON file.
    parse_topic(
        topic_url="https://www.scoop.it/topic/software-craftmanship-and-development",
        file_name="software-engineering.json",
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment