Skip to content

Instantly share code, notes, and snippets.

@abernardobr
Last active August 29, 2015 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abernardobr/fa5c5240dd6c860124a3 to your computer and use it in GitHub Desktop.
Save abernardobr/fa5c5240dd6c860124a3 to your computer and use it in GitHub Desktop.
Node URL Crawler
// Client file
//***** hdCrawl ******//
// dependencies
// * lodash or underscore
// * jQuery
// Providers
hdCrawlProviders = {
youtube: {
regexp: /(?:.+?)?(?:\/v\/|watch\/|\?v=|\&v=|youtu\.be\/|\/v=|^youtu\.be\/|y2u\.be\/|^y2u\.be\/)([a-zA-Z0-9_-]{11})+/i,
dimensions: { width: 640, height: 390 },
authKey: 'your_google_authetication_key_goes_here',
match: function(url) {
return this.regexp.test(url);
},
preview: function (url, input, cb) {
var self = this;
if(input.match(self.regexp)) {
$.getJSON('https://www.googleapis.com/youtube/v3/videos?id=' + RegExp.$1 + '&key=' + self.authKey + '&part=snippet,statistics').success(function (d) {
var video = {};
var ytData = d.items[0];
video.host = 'youtube.com';
video.title = ytData.snippet.title;
video.imageCount = 1;
video.images = [ytData.snippet.thumbnails.medium.url];
video.description = (ytData.snippet.description.trunc(250, true)).replace(/\n/g, ' ').replace(/
/g, ' ');
video.rawDescription = ytData.snippet.description;
video.views = ytData.statistics.viewCount;
video.likes = ytData.statistics.likeCount;
video.url = 'https://www.youtube.com/watch?v=' + RegExp.$1;
video.width = self.dimensions.width;
video.height = self.dimensions.height;
video.id = ytData.id;
video.source = 'youtube';
cb(input, video);
});
} else {
cb(input, '', {});
}
}
},
vimeo: {
regexp: /https?:\/\/(?:www\.)?vimeo.com\/(?:channels\/(?:\w+\/)?|groups\/([^\/]*)\/videos\/|album\/(\d+)\/video\/|)(\d+)(?:$|\/|\?)*/i,
dimensions: { width: 640, height: 390 },
match: function(url) {
return this.regexp.test(url);
},
preview: function(url, input, cb) {
var self = this;
if(input.match(self.regexp)) {
$.getJSON('https://vimeo.com/api/v2/video/' + RegExp.$3 + '.json').success(function (d) {
var video = {};
video.host = 'vimeo.com';
video.title = d[0].title;
video.rawDescription = (d[0].description).replace(/\n/g, '<br/>').replace(/&#10;/g, '<br/>');
video.description = (d[0].description).replace(/((<|&lt;)br\s*\/*(>|&gt;)\r\n)/g, ' ').trunc(250, true);
video.imageCount = 1;
video.images = [d[0].thumbnail_medium];
video.views = d[0].stats_number_of_plays;
video.likes = d[0].stats_number_of_likes;
video.url = d[0].url;
video.width = self.dimensions.width;
video.height = self.dimensions.height;
video.id = d[0].id;
video.source = 'vimeo';
cb(input, video);
});
}
}
},
url: {
regexp: /((href|src)=["']|)(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/i,
dimensions: { width: 640, height: 390 },
match: function(url) {
return this.regexp.test(url);
},
preview: function(url, input, cb) {
var self = this;
// This is an internal function that just calls the Node crawling function
// I use HAPI to create my routes to reach the node code. Here you can use anything you want...
HDV.crawlUrl(url, function(urlData) {
urlData.width = self.dimensions.width;
urlData.height = self.dimensions.height;
cb(input, urlData);
});
}
}
}
// Crawler object to be used
// * Things to inproove
// * crawl more than one URL
// * define provider order
// * return more a match url --> provider list, so we can perform more than one crawlling
function hdCrawl() {
var self = this;
self.providers = [];
}
// in:
// url --> calls the providers match function to see if the url matchs the matching function (general a regexp, but it could be anything you need to check the match)
// we follow the hdCrawlProviders order. The first that matches is returned
// out:
// the provider if we matched any or null if no providers were found
//
hdCrawl.prototype.getProvider = function(url) {
var self = this;
self.providers = self.providers.length > 0 ? self.providers : _.keys(hdCrawlProviders);
for (var i = 0; i < self.providers.length; i++) {
var provider = hdCrawlProviders[self.providers[i]];
if(provider.match(url))
return provider;
}
return null;
}
// in:
// * input --> the text to crawl
// out (via callback):
// * input, if modified by the provider
// * the crawledInfo
// var retData = {
// host: internals.checkEmpty(urlParsed.host, ''), // the calling host. ex. http://google.com --> google.com
// title: internals.checkEmpty(title, ''), // the crawled title
// description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited by descriptionSizeLimite characters
// rawDescription: internals.checkEmpty(description, ''), // the crawled description
// imageCount: aImages.length, // image count
// images: aImages, // the array of image urls collected
// views: 0, // the views that the video has
// likes: 0, // number of likes of the video
// url: uri, // the actual incomming url
// width: 0, // base width and heigth of the video
// height: 0, // base width and heigth of the video
// id: 0, // video id
// source: 'url' // just to reference that we crawled an URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
// };
hdCrawl.prototype.crawl = function(input, cb) {
var self = this;
// first check if we have a URLs to process
var urlRegex = /((href|src)=["']|)(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
var urls = input.match(urlRegex);
if(urls == null || urls.length === 0) {
cb(input, '', {});
} else {
// get and call the provider
var url = urls[0];
var provider = self.getProvider(url);
if(provider != null && url !== '') {
provider.preview(url, input, function(input, crawledInfo) {
cb(input, crawledInfo);
});
} else {
cb(input, '', {});
}
}
}
// Node file
// Dependencies
var _ = require('lodash');
var Request = require('request');
var Cheerio = require('cheerio');
var URL = require('url');
// **********************************************************************
// Things to improove:
// * Separate text crawling and image crawling for a better performance
// * Create crawling providers, like: url, vimeo, youtube, etc.
// * Add caching: in memory, redis, mongodb, memcached.
// The caching would be url --> parsed url data (depending on your storage architecture, the best storages would be key/pair like Redis, memcached, etc.
// * Add expiration to the cache
// **********************************************************************
internals = {
tagImageLimit: 10, // if we do not have at least imageLimit images until we reach collecting from img tags, collect than this limit of img tags
imageLimit: 5, // how many images do we want in general
descriptionSizeLimite: 100 // quantity of characters to limit the description
};
// General helper functions
internals.isEmpty = function(value) {
return (value == null || value === '' || _.isUndefined(value) || _.isEmpty(value));
}
internals.checkEmpty = function(value, newValue) {
return (value != null && value !== '' && !_.isUndefined(value) && !_.isEmpty(value)) ? value : newValue;
}
internals.addImage = function(aImages, image, urlParsed) {
var regExp = /^(https?:\/\/)?((([a-z\d]([a-z\d-]*[a-z\d])*)\.)+[a-z]{2,}|((\d{1,3}\.){3}\d{1,3}))(\:\d+)?(\/[-a-z\d%_@.~+&=!#$%\*\(\)<>?]*)*(\?[;&a-z\d%_@.~+&=!#$%\*\(\)<>?]*)?(\#[-a-z\d_]*)?$/i;
var addImage = false;
if(aImages.length < internals.tagImageLimit) {
if (!regExp.test(image)) {
if (image != null && image !== '' && !_.isUndefined(image) && !_.isEmpty(image) && image.length >= 2) {
// there are cases where CDNs start with //url.com/image, we do NOT accept this crazyness
if (image[1] !== '/') {
var imgUrlParsed = URL.parse(image);
// check for relative url starting with / or without it (that is, just the image name and the path to it), than add the host before
if (imgUrlParsed.protocol == null || imgUrlParsed.protocol === '') {
image = urlParsed.protocol + "//" + urlParsed.host + (image[0] === '/' ? image : ('/' + image));
addImage = true;
}
}
}
} else
addImage = true;
}
// we cannot check in this case for image extensions. More crazyness, some CDNs have a path to an image without the extension (arghhhhhhh) --> github is one of them
if(addImage)
aImages.push(image);
return aImages;
}
internals.checkImage = function(image) {
if(!_.isEmpty(image) && image !== '')
return image;
return '';
}
internals.elipses = function(value, size) {
var plainText = value.trim();
if(plainText.length > size)
return plainText.substr(0,size) + "…";
return plainText.replace(/((<|&lt;)br\s*\/*(>|&gt;)\r\n)/g, ' ');
}
// Crawling functions
internals.crawl = function(uri, cb) {
Request({ uri: uri }, function(err, response, body) {
var noOpRetData = {
host: '', title: '', rawDescription: '', description: '',
imageCount: 0, images: [], views: 0, likes: 0, url: '', width: 0, height: 0, id: 0, source: 'url', sourceUrl: uri
};
if(err || (response && response.statusCode !== 200) || body == null || body === '') {
cb(null, noOpRetData);
return;
}
var $;
try {
$ = Cheerio.load(body);
var urlParsed = URL.parse(uri);
// Get the title
var title = internals.checkEmpty('', $("meta[property='og:title']").attr("content"));
if(internals.isEmpty(title))
title = $("meta[name='title']").attr("content");
if(internals.isEmpty(title))
title = $("title").text();
// Get the description
var description = internals.checkEmpty('', $("meta[name='description']").attr("content"));
if(internals.isEmpty(description))
description = $("meta[property='og:description']").attr("content");
if(internals.isEmpty(description))
description = $("meta[property='twitter:description']").attr("content");
if(internals.isEmpty(description))
description = $('h1').html();
// Get Images
var aImages = [];
var gotFromOGorTwitter = false; // try to get images from OpenGraph or Twitter. If there is, than use just them
// meta:name can be used
$("meta[name='og:image']").each(function(i, elem) {
var imageTmp = internals.checkImage($(elem).attr('content'));
if(imageTmp !== '')
aImages = internals.addImage(aImages, imageTmp, urlParsed);
});
// Limit to 5 images
if(aImages.length < internals.imageLimit) {
// meta:property is more comonly used
$("meta[property='og:image']").each(function (i, elem) {
var imageTmp = internals.checkImage($(elem).attr('content'));
if (imageTmp !== '')
aImages = internals.addImage(aImages, imageTmp, urlParsed);
});
}
// Try to get througth twitter meta tags
if(aImages.length === 0) {
// get twitter:image
$("meta[name='twitter:image']").each(function(i, elem) {
var imageTmp = internals.checkImage($(elem).attr('content'));
if(imageTmp !== '')
aImages = internals.addImage(aImages, imageTmp, urlParsed);
});
}
if(aImages.length === 0) {
// get twitter:image:src
$("meta[name='twitter:image:src']").each(function(i, elem) {
var imageTmp = internals.checkImage($(elem).attr('content'));
if(imageTmp !== '')
aImages = internals.addImage(aImages, imageTmp, urlParsed);
});
}
// if we gotten images from twitter or open graph, than stop here. No need to parse any longer
// also, if we have already 5 (internals.imageLimit) images, than stop
gotFromOGorTwitter = aImages.length > 0;
if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
// there can be up 10 (internals.tagImageLimit) images only if we parse from img tags
$("img").each(function (i, elem) {
var $el = $(elem);
// check for lazyload --> UOL uses it (more originality)
var imageTmp = $el.attr('data-original');
if(_.isEmpty(imageTmp))
imageTmp = internals.checkImage($el.attr('src'));
if (imageTmp !== '')
aImages = internals.addImage(aImages, imageTmp, urlParsed);
});
}
if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
// get background url or background-image if we have not gotten any images yet
$('[style]').each(function (i, elem) {
var aImages = /url\(.*?\)/ig.exec($(elem).html());
if (aImages && aImages.length > 0) {
for (var i = 0; i < aImages.length; i++) {
var item = aImages[i];
var imageTmp;
imageTmp = item.substring(item.indexOf('(') + 1, item.indexOf(')'));
// limit only images with extensions known. Crazy CDNs will be excluded.
if (imageTmp.indexOf('jpg') !== -1 || imageTmp.indexOf('png') !== -1 ||
imageTmp.indexOf('gif') !== -1 || imageTmp.indexOf('jpeg') !== -1) {
if (imageTmp.indexOf("'") === -1 && imageTmp.indexOf("apos") === -1) {
aImages = internals.addImage(aImages, imageTmp, urlParsed);
}
}
}
}
});
}
if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
// some dudes also use meta:itemprop --> such good imagination (Google is one of them)
aImages = internals.addImage(aImages, $("meta[itemprop='image']").attr("content"), urlParsed);
}
// Crazy little trick. After making sure images do not repest (unique), reverse the array. Why? You might ask...
// We assume that the most intersting images are the ones in the middle of the page. Reversing, makes sure that we get first the
// most intersting ones.
aImages = _.unique(aImages).reverse();
var retData = {
host: internals.checkEmpty(urlParsed.host, ''), // the calling host. ex. http://google.com --> google.com
title: internals.checkEmpty(title, ''), // the crawled title
description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited by descriptionSizeLimite characters
rawDescription: internals.checkEmpty(description, ''), // the crawled description
imageCount: aImages.length, // image count
images: aImages, // the array of image urls collected
views: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
likes: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
url: uri, // the actual incomming url
width: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
height: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
id: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
source: 'url' // just to reference that we crawled an URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
};
cb(null, retData);
} catch(ex) {
cb(null, noOpRetData);
}
});
}
module.exports = {
parse: internals.crawl
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment