Skip to content

Instantly share code, notes, and snippets.

@w0w
Created December 18, 2014 16:43
Show Gist options
  • Save w0w/71b672704faa59d35458 to your computer and use it in GitHub Desktop.
Save w0w/71b672704faa59d35458 to your computer and use it in GitHub Desktop.
#! /usr/bin/coffee
#
# Install the packages below before executing:
# yum install nodejs npm redis
# npm -g install coffeescript
# npm -g install simplecrawler
# npm -g install redis
# npm -g install cheerio
# npm -g install request
#
# USAGE: coffee scrape_medium.coffee >> data.csv
cheerio = require('cheerio')
request = require('request')
crawler = require("simplecrawler")
redis = require("redis")
# Redis client created with no arguments (library defaults). The rest of the
# visible script never reads from or writes to it.
client = redis.createClient()

# Report client errors on stderr together with the error itself. stdout is
# the data channel (see USAGE: output is appended to data.csv), so
# diagnostics must never be printed there.
client.on "error", (err) ->
  console.error "redis error:", err
# Crawl the "on-management" collection. For every fetched URL whose path does
# not start with "@", request the page again over http, scrape it with
# cheerio and print one "||"-delimited record to stdout:
#   url||title||description||follow count||post count||editor url||editor name
crawler.crawl("https://medium.com/on-management")
  .on "fetchcomplete", (queueItem) ->
    # Everything after the first ".com/". This is undefined when the URL
    # contains no such segment, so stop here instead of calling a string
    # method on undefined.
    part = queueItem.url.split('.com/')[1]
    return unless part?

    # Paths beginning with "@" are skipped.
    return if /^@/.test(part)

    url = "http://medium.com/" + part
    request url, (err, res, body) ->
      # Only successful responses are scraped; failures are ignored.
      return if err or res.statusCode != 200

      $ = cheerio.load(body)
      title = $('.hero-title').text()
      des = $('.hero-description').text()

      # Text that follows "Follow" in the button label(s). Falls back to an
      # empty field instead of the literal string "undefined" when absent.
      fol_count = $('.button-label').text().split("Follow")[1] ? ""

      # Number of elements matching the post-title selector.
      title_count = $('.postItem-title').length

      editor = $('.metabar-text.hide-on-mobile.metabar-text--collectionEditor > a')
      editor_name = editor.text()

      # attr() yields undefined when no editor link matched; emit an empty
      # field rather than "http://medium.comundefined".
      editor_href = editor.attr('href')
      editor_url = if editor_href? then "http://medium.com" + editor_href else ""

      fields = [url, title, des, fol_count, title_count, editor_url, editor_name]
      console.log fields.join("||")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment