Skip to content

Instantly share code, notes, and snippets.

@rjriel
Last active March 2, 2017 20:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjriel/8ec8cbfb0b87f1d5c65989d2e675873b to your computer and use it in GitHub Desktop.
Save rjriel/8ec8cbfb0b87f1d5c65989d2e675873b to your computer and use it in GitHub Desktop.
const cheerio = require('cheerio')
const request = require('request')
const mongoose = require('mongoose')
const config = require('config')
const Publication = require('./server/models/publication')
mongoose.Promise = global.Promise
// Monthly archive pages to import, as [year, month] string pairs. Each pair is
// interpolated verbatim into the archive URL, so months keep their leading zero.
// Declared const because the list is never reassigned.
const archives = [
  ["2016", "12"],
  ["2017", "01"],
  ["2017", "02"]
]
/**
 * Fetch one monthly archive page of the qlik-branch Medium publication and
 * import every story it links to.
 *
 * @param {string} year - Year segment of the archive URL, e.g. "2017".
 * @param {string} month - Month segment of the archive URL, e.g. "02".
 * @returns {Promise<Array>} Resolves with the getStory results for every link
 *   found on the page (an empty array when there are none). Rejects when the
 *   archive request fails, when parsing throws, or when any getStory call
 *   rejects.
 */
const getStories = (year, month) => {
  return new Promise((resolve, reject) => {
    request.get(`https://medium.com/qlik-branch/archive/${year}/${month}`, (error, response, body) => {
      // Surface request failures instead of trying to parse a missing body,
      // which would hide the real cause of the failure.
      if (error) {
        reject(error)
        return
      }
      try {
        const $ = cheerio.load(body)
        // Every link nested in an <h3> is treated as a story URL.
        const urls = $('h3 a').map((i, a) => $(a).attr('href')).get()
        // Resolving with a promise makes the outer promise adopt its outcome,
        // so fulfilment and rejection are both forwarded.
        resolve(Promise.all(urls.map(url => getStory(url))))
      } catch (e) {
        reject(e)
      }
    })
  })
}
/**
 * Download a single story page, build a Publication document from its markup
 * and metadata, and save it.
 *
 * @param {string} url - Address of the story page to fetch.
 * @returns {Promise} Resolves with whatever publication.save() resolves with.
 *   Rejects with the request error when the download fails, with the thrown
 *   error when extracting fields fails, or with the save() rejection.
 */
const getStory = (url) => {
  return new Promise((resolve, reject) => {
    request.get(url, (error, response, body) => {
      if (error) {
        console.error(error)
        // Reject with the request error itself so callers see the real cause.
        reject(error)
        return
      }
      try {
        const $ = cheerio.load(body)
        const publication = new Publication()
        const content = $('div.section-inner')
        // Drop <h1> elements from the body; the title is taken from the
        // page metadata instead.
        content.find('h1').remove()
        publication.title = $('meta[name="title"]').attr('content')
        publication.author = $('meta[name="author"]').attr('content')
        publication.link = $('link[rel="canonical"]').attr('href')
        // Tags are stored as one comma-separated string.
        publication.tags = $('ul.tags a').map((i, a) => $(a).text()).get().join(', ')
        publication.published = new Date($('meta[property="article:published_time"]').attr('content'))
        publication.published_num = publication.published.getTime()
        publication.content = content.html()
        // The id is whatever follows the last '-' in the canonical link.
        publication.mediumId = publication.link.substring(publication.link.lastIndexOf('-') + 1)
        publication.plaintext = content.text()
        publication.short_description = $('meta[name="description"]').attr('content')
        // Fall back to the default image when the page declares no og:image.
        publication.image = $('meta[property="og:image"]').attr('content') || "/attachments/default/publication.png"
        publication.approved = true
        publication.save()
          .then(resolve)
          .catch(reject)
      } catch (e) {
        console.error(e.message)
        reject(e)
      }
    })
  })
}
// Entry point: connect to MongoDB, import every configured archive month,
// then close the connection whether or not the import succeeded.

// Imports one [year, month] entry of `archives`.
const importArchive = ([year, month]) => getStories(year, month)

// Closes the shared mongoose connection.
const disconnect = () => mongoose.connection.close()

mongoose.connect(config.mongoconnectionstring)
  .then(() => {
    // This inner chain handles its own failures and is not returned, so the
    // outer catch only sees connection errors or a synchronous throw raised
    // while starting the imports.
    Promise.all(archives.map(importArchive))
      .then(() => {
        console.log('done')
        disconnect()
      })
      .catch(failure => {
        console.error(failure)
        disconnect()
      })
  })
  .catch(failure => {
    console.error(failure)
  })
var mongoose = require('mongoose');
var Schema = mongoose.Schema;
// Field definitions for a stored publication. Every field is a plain scalar;
// no validators, defaults or indexes are declared here.
const publicationFields = {
  title: String,
  short_description: String,
  mediumId: String,
  // Two renderings of the body, named for markup and for plain text.
  content: String,
  plaintext: String,
  link: String,
  image: String,
  // Declared as a single String rather than an array.
  tags: String,
  // Publication time stored twice, as a Date and as a Number.
  // NOTE(review): presumably the Number is epoch milliseconds; confirm with writers.
  published: Date,
  published_num: Number,
  author: String,
  // NOTE(review): how checksum is computed is not evident from this file.
  checksum: String,
  approved: Boolean
}

const publicationSchema = new Schema(publicationFields)

// Registers the model under the name 'publication' and exports it.
module.exports = mongoose.model('publication', publicationSchema)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment