Skip to content

Instantly share code, notes, and snippets.

@qdequele
Last active February 9, 2017 09:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qdequele/f0f8f53d3f28b3cd6aef to your computer and use it in GitHub Desktop.
Save qdequele/f0f8f53d3f28b3cd6aef to your computer and use it in GitHub Desktop.
Universal media title/image scraper built on scraperjs, exposed as a Promise
/*
* Bundle: Helpers - Scraper
* Project: Readlist - Server
* Author: Quentin de Quelen <quentin@dequelen.me>
* Copyright: 2015, Readlist
*/
/*
*
* Requires the 'scraperjs' and 'url' modules.
*/
"use strict";
const scraperjs = require('scraperjs');
const url = require('url');
exports.scrapUrl = function(url) {
var res = {
"title": '',
"image": ''
}
var scrapFacebook = function($) {
return new Promise(function(resolve, reject) {
try {
res.title = $('meta[property="og:title"]').attr('content');
if ($('meta[property="og:image"]').attr('content').length > 0)
res.image = $('meta[property="og:image"]').attr('content');
reject();
} catch (err) {
resolve($);
}
});
};
var scrapTwitter = function($) {
return new Promise(function(resolve, reject) {
try {
res.title = $('meta[property="twitter:title"]').attr('content');
if ($('meta[property="twitter:image"]').attr('content').length > 0)
res.image = $('meta[property="twitter:image"]').attr('content');
reject();
} catch (err) {
resolve($);
}
});
};
var scrapArticleH1 = function($) {
return new Promise(function(resolve, reject) {
try {
res.title = $('.article h1').text();
res.image = $('.article img').attr("src");
if (res.title.length == 0) resolve($);
else reject();
} catch (err) {
resolve($);
}
});
};
var scrapArticleH2 = function($) {
return new Promise(function(resolve, reject) {
try {
res.title = $('article h2').text();
res.image = $('article img').attr("src");
if (res.title.length == 0) resolve($);
else reject();
} catch (err) {
resolve($);
}
});
};
var scrapMainH2 = function($) {
return new Promise(function(resolve, reject) {
try {
res.title = $('#main h2').text();
res.image = $('#main img').attr("src");
if (res.title.length == 0) resolve($);
else reject();
} catch (err) {
resolve($);
}
});
};
return new Promise(function(resolve, reject) {
scraperjs.StaticScraper.create(url)
.scrape(function($) {
scrapFacebook($)
.then(scrapFacebook)
.then(scrapTwitter)
.then(scrapArticleH1)
.then(scrapArticleH2)
.then(scrapMainH2)
.then(function($) {
reject("Article not found");
})
.catch(function() {
resolve(res);
});
});
});
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment