abernardobr/hd_urlclientcrawler.js

## hd_urlclientcrawler.js
// Client file

//*****  hdCrawl ******//

// dependencies
//  * lodash or underscore
//  * jQuery

// Providers
hdCrawlProviders = {
    youtube: {
        regexp: /(?:.+?)?(?:\/v\/|watch\/|\?v=|\&v=|youtu\.be\/|\/v=|^youtu\.be\/|y2u\.be\/|^y2u\.be\/)([a-zA-Z0-9_-]{11})+/i,
        dimensions: { width: 640, height: 390 },
        authKey: 'your_google_authetication_key_goes_here',
        match: function(url) {
            return this.regexp.test(url);
        },
        preview: function (url, input, cb) {
            var self = this;
            if(input.match(self.regexp)) {
                $.getJSON('https://www.googleapis.com/youtube/v3/videos?id=' + RegExp.$1 + '&key=' + self.authKey + '&part=snippet,statistics').success(function (d) {
                    var video = {};
                    var ytData = d.items[0];
                    video.host = 'youtube.com';
                    video.title = ytData.snippet.title;
                    video.imageCount = 1;
                    video.images = [ytData.snippet.thumbnails.medium.url];
                    video.description = (ytData.snippet.description.trunc(250, true)).replace(/\n/g, ' ').replace(/&#10;/g, ' ');
                    video.rawDescription = ytData.snippet.description;
                    video.views = ytData.statistics.viewCount;
                    video.likes = ytData.statistics.likeCount;
                    video.url = 'https://www.youtube.com/watch?v=' + RegExp.$1;
                    video.width = self.dimensions.width;
                    video.height = self.dimensions.height;
                    video.id = ytData.id;
                    video.source = 'youtube';
                    cb(input, video);
                });
            } else {
                cb(input, '', {});
            }
        }
    },
    vimeo: {
        regexp: /https?:\/\/(?:www\.)?vimeo.com\/(?:channels\/(?:\w+\/)?|groups\/([^\/]*)\/videos\/|album\/(\d+)\/video\/|)(\d+)(?:$|\/|\?)*/i,
        dimensions: { width: 640, height: 390 },
        match: function(url) {
            return this.regexp.test(url);
        },
        preview: function(url, input, cb) {
            var self = this;
            if(input.match(self.regexp)) {
                $.getJSON('https://vimeo.com/api/v2/video/' + RegExp.$3 + '.json').success(function (d) {
                    var video = {};
                    video.host = 'vimeo.com';
                    video.title = d[0].title;
                    video.rawDescription = (d[0].description).replace(/\n/g, '<br/>').replace(/&#10;/g, '<br/>');
                    video.description = (d[0].description).replace(/((<|&lt;)br\s*\/*(>|&gt;)\r\n)/g, ' ').trunc(250, true);
                    video.imageCount = 1;
                    video.images = [d[0].thumbnail_medium];
                    video.views = d[0].stats_number_of_plays;
                    video.likes = d[0].stats_number_of_likes;
                    video.url = d[0].url;
                    video.width = self.dimensions.width;
                    video.height = self.dimensions.height;
                    video.id = d[0].id;
                    video.source = 'vimeo';
                    cb(input, video);
                });
            }
        }
    },
    url: {
        regexp: /((href|src)=["']|)(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/i,
        dimensions: { width: 640, height: 390 },
        match: function(url) {
            return this.regexp.test(url);
        },
        preview: function(url, input, cb) {
            var self = this;
            // This is an internal function that just calls the Node crawling function
            // I use HAPI to create my routes to reach the node code. Here you can use anything you want...
            HDV.crawlUrl(url, function(urlData) {
                urlData.width = self.dimensions.width;
                urlData.height = self.dimensions.height;
                cb(input, urlData);
            });
        }
    }
}

// Crawler object to be used
// * Things to inproove
//      * crawl more than one URL
//      * define provider order
//      * return more a match url --> provider list, so we can perform more than one crawlling
function hdCrawl() {
    var self = this;
    self.providers = [];
}

// in:
//  url --> calls the providers match function to see if the url matchs the matching function (general a regexp, but it could be anything you need to check the match)
//      we follow the hdCrawlProviders order. The first that matches is returned
// out:
//  the provider if we matched any or null if no providers were found
//
hdCrawl.prototype.getProvider = function(url) {
    var self = this;
    self.providers = self.providers.length > 0 ? self.providers : _.keys(hdCrawlProviders);
    for (var i = 0; i < self.providers.length; i++) {
        var provider = hdCrawlProviders[self.providers[i]];
        if(provider.match(url))
            return provider;
    }
    return null;
}

// in:
//  * input --> the text to crawl
// out (via callback):
//  * input, if modified by the provider
//  * the crawledInfo
//        var retData = {
//            host: internals.checkEmpty(urlParsed.host, ''),  // the calling host. ex. http://google.com --> google.com
//                title: internals.checkEmpty(title, ''), // the crawled title
//                description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited by descriptionSizeLimite characters
//                rawDescription: internals.checkEmpty(description, ''),  // the crawled description
//                imageCount: aImages.length, // image count
//                images: aImages,    // the array of image urls collected
//                views: 0,   // the views that the video has
//                likes: 0,   // number of likes of the video
//                url: uri,   // the actual incomming url
//                width: 0,   // base width and heigth of the video
//                height: 0,  // base width and heigth of the video
//                id: 0,      // video id
//                source: 'url'   // just to reference that we crawled an URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
//            };

hdCrawl.prototype.crawl = function(input, cb) {
    var self = this;
    // first check if we have a URLs to process
    var urlRegex = /((href|src)=["']|)(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
    var urls = input.match(urlRegex);
    if(urls == null || urls.length === 0) {
        cb(input, '', {});
    } else {
        // get and call the provider
        var url = urls[0];
        var provider = self.getProvider(url);
        if(provider != null && url !== '') {
            provider.preview(url, input, function(input, crawledInfo) {
                cb(input, crawledInfo);
            });
        } else {
            cb(input, '', {});
        }
    }
}

## hd_urlcrawler.js
// Node file

// Dependencies
var _ = require('lodash');
var Request = require('request');
var Cheerio = require('cheerio');
var URL = require('url');

// **********************************************************************
// Things to improove:
// * Separate text crawling and image crawling for a better performance
// * Create crawling providers, like: url, vimeo, youtube, etc.
// * Add caching: in memory, redis, mongodb, memcached.
//          The caching would be url --> parsed url data (depending on your storage architecture, the best storages would be key/pair like Redis, memcached, etc.
// * Add expiration to the cache
// **********************************************************************

internals = {
    tagImageLimit: 10,      // if we do not have at least imageLimit images until we reach collecting from img tags, collect than this limit of img tags
    imageLimit: 5,          // how many images do we want in general
    descriptionSizeLimite: 100  // quantity of characters to limit the description
};

// General helper functions
internals.isEmpty = function(value) {
    return (value == null || value === '' || _.isUndefined(value) || _.isEmpty(value));
}

internals.checkEmpty = function(value, newValue) {
    return (value != null && value !== '' && !_.isUndefined(value) && !_.isEmpty(value)) ? value : newValue;
}

internals.addImage = function(aImages, image, urlParsed) {
    var regExp = /^(https?:\/\/)?((([a-z\d]([a-z\d-]*[a-z\d])*)\.)+[a-z]{2,}|((\d{1,3}\.){3}\d{1,3}))(\:\d+)?(\/[-a-z\d%_@.~+&=!#$%\*\(\)<>?]*)*(\?[;&a-z\d%_@.~+&=!#$%\*\(\)<>?]*)?(\#[-a-z\d_]*)?$/i;
    var addImage = false;
    if(aImages.length < internals.tagImageLimit) {
        if (!regExp.test(image)) {
            if (image != null && image !== '' && !_.isUndefined(image) && !_.isEmpty(image) && image.length >= 2) {
                // there are cases where CDNs start with //url.com/image, we do NOT accept this crazyness
                if (image[1] !== '/') {
                    var imgUrlParsed = URL.parse(image);
                    // check for relative url starting with / or without it (that is, just the image name and the path to it), than add the host before
                    if (imgUrlParsed.protocol == null || imgUrlParsed.protocol === '') {
                        image = urlParsed.protocol + "//" + urlParsed.host + (image[0] === '/' ? image : ('/' + image));
                        addImage = true;
                    }
                }
            }
        } else
            addImage = true;
    }

    // we cannot check in this case for image extensions. More crazyness, some CDNs have a path to an image without the extension (arghhhhhhh) --> github is one of them
    if(addImage)
        aImages.push(image);
    return aImages;
}

internals.checkImage = function(image) {
    if(!_.isEmpty(image) && image !== '')
        return image;
    return '';
}

internals.elipses = function(value, size) {
    var plainText = value.trim();
    if(plainText.length > size)
        return plainText.substr(0,size) + "…";
    return plainText.replace(/((<|&lt;)br\s*\/*(>|&gt;)\r\n)/g, ' ');
}

// Crawling functions
internals.crawl = function(uri, cb) {

    Request({ uri: uri }, function(err, response, body) {
        var noOpRetData = {
            host: '', title: '', rawDescription: '', description: '',
            imageCount: 0, images: [], views: 0, likes: 0, url: '', width: 0, height: 0, id: 0, source: 'url', sourceUrl: uri
        };

        if(err || (response && response.statusCode !== 200) || body == null || body === '')  {
            cb(null, noOpRetData);
            return;
        }

        var $;
        try {
            $ = Cheerio.load(body);
            var urlParsed = URL.parse(uri);

            // Get the title
            var title = internals.checkEmpty('', $("meta[property='og:title']").attr("content"));

            if(internals.isEmpty(title))
                title = $("meta[name='title']").attr("content");

            if(internals.isEmpty(title))
                title = $("title").text();

            // Get the description
            var description = internals.checkEmpty('', $("meta[name='description']").attr("content"));

            if(internals.isEmpty(description))
                description = $("meta[property='og:description']").attr("content");

            if(internals.isEmpty(description))
                description = $("meta[property='twitter:description']").attr("content");

            if(internals.isEmpty(description))
                description = $('h1').html();

            // Get Images
            var aImages = [];
            var gotFromOGorTwitter = false;  // try to get images from OpenGraph or Twitter. If there is, than use just them

            // meta:name can be used
            $("meta[name='og:image']").each(function(i, elem) {
                var imageTmp = internals.checkImage($(elem).attr('content'));
                if(imageTmp !== '')
                    aImages = internals.addImage(aImages, imageTmp, urlParsed);
            });

            // Limit to 5 images
            if(aImages.length < internals.imageLimit) {
                // meta:property is more comonly used
                $("meta[property='og:image']").each(function (i, elem) {
                    var imageTmp = internals.checkImage($(elem).attr('content'));
                    if (imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }

            // Try to get througth twitter meta tags
            if(aImages.length === 0) {
                // get twitter:image
                $("meta[name='twitter:image']").each(function(i, elem) {
                    var imageTmp = internals.checkImage($(elem).attr('content'));
                    if(imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }

            if(aImages.length === 0) {
                // get twitter:image:src
                $("meta[name='twitter:image:src']").each(function(i, elem) {
                    var imageTmp = internals.checkImage($(elem).attr('content'));
                    if(imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }

            // if we gotten images from twitter or open graph, than stop here. No need to parse any longer
            // also, if we have already 5 (internals.imageLimit) images, than stop
            gotFromOGorTwitter = aImages.length > 0;

            if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
                // there can be up 10 (internals.tagImageLimit) images only if we parse from img tags
                $("img").each(function (i, elem) {
                    var $el = $(elem);

                    // check for lazyload --> UOL uses it (more originality)
                    var imageTmp = $el.attr('data-original');

                    if(_.isEmpty(imageTmp))
                        imageTmp = internals.checkImage($el.attr('src'));

                    if (imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }

            if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
                // get background url or background-image if we have not gotten any images yet
                $('[style]').each(function (i, elem) {
                    var aImages = /url\(.*?\)/ig.exec($(elem).html());
                    if (aImages && aImages.length > 0) {
                        for (var i = 0; i < aImages.length; i++) {
                            var item = aImages[i];
                            var imageTmp;
                            imageTmp = item.substring(item.indexOf('(') + 1, item.indexOf(')'));
                            // limit only images with extensions known. Crazy CDNs will be excluded.
                            if (imageTmp.indexOf('jpg') !== -1 || imageTmp.indexOf('png') !== -1 ||
                                imageTmp.indexOf('gif') !== -1 || imageTmp.indexOf('jpeg') !== -1) {
                                if (imageTmp.indexOf("'") === -1 && imageTmp.indexOf("apos") === -1) {
                                    aImages = internals.addImage(aImages, imageTmp, urlParsed);
                                }
                            }
                        }
                    }
                });
            }

            if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
                // some dudes also use meta:itemprop --> such good imagination (Google is one of them)
                aImages = internals.addImage(aImages, $("meta[itemprop='image']").attr("content"), urlParsed);
            }

            // Crazy little trick. After making sure images do not repest (unique), reverse the array. Why? You might ask...
            // We assume that the most intersting images are the ones in the middle of the page. Reversing, makes sure that we get first the
            // most intersting ones.
            aImages = _.unique(aImages).reverse();

            var retData = {
                host: internals.checkEmpty(urlParsed.host, ''),  // the calling host. ex. http://google.com --> google.com
                title: internals.checkEmpty(title, ''), // the crawled title
                description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited by descriptionSizeLimite characters
                rawDescription: internals.checkEmpty(description, ''),  // the crawled description
                imageCount: aImages.length, // image count
                images: aImages,    // the array of image urls collected
                views: 0,   // used only for video crawling (it is here for compatibility when we add video crawling here
                likes: 0,   // used only for video crawling (it is here for compatibility when we add video crawling here
                url: uri,   // the actual incomming url
                width: 0,   // used only for video crawling (it is here for compatibility when we add video crawling here
                height: 0,  // used only for video crawling (it is here for compatibility when we add video crawling here
                id: 0,      // used only for video crawling (it is here for compatibility when we add video crawling here
                source: 'url'   // just to reference that we crawled an URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
            };

            cb(null, retData);
        } catch(ex) {
            cb(null, noOpRetData);
        }
    });

}

module.exports = {
    parse: internals.crawl
};
	// Client file

	//*** hdCrawl ****//

	// dependencies
	// * lodash or underscore
	// * jQuery

	// Providers
	hdCrawlProviders = {
	youtube: {
	regexp: /(?:.+?)?(?:\/v\/\|watch\/\|\?v=\|\&v=\|youtu\.be\/\|\/v=\|^youtu\.be\/\|y2u\.be\/\|^y2u\.be\/)([a-zA-Z0-9_-]{11})+/i,
	dimensions: { width: 640, height: 390 },
	authKey: 'your_google_authetication_key_goes_here',
	match: function(url) {
	return this.regexp.test(url);
	},
	preview: function (url, input, cb) {
	var self = this;
	if(input.match(self.regexp)) {
	$.getJSON('https://www.googleapis.com/youtube/v3/videos?id=' + RegExp.$1 + '&key=' + self.authKey + '&part=snippet,statistics').success(function (d) {
	var video = {};
	var ytData = d.items[0];
	video.host = 'youtube.com';
	video.title = ytData.snippet.title;
	video.imageCount = 1;
	video.images = [ytData.snippet.thumbnails.medium.url];
	video.description = (ytData.snippet.description.trunc(250, true)).replace(/\n/g, ' ').replace(/ /g, ' ');
	video.rawDescription = ytData.snippet.description;
	video.views = ytData.statistics.viewCount;
	video.likes = ytData.statistics.likeCount;
	video.url = 'https://www.youtube.com/watch?v=' + RegExp.$1;
	video.width = self.dimensions.width;
	video.height = self.dimensions.height;
	video.id = ytData.id;
	video.source = 'youtube';
	cb(input, video);
	});
	} else {
	cb(input, '', {});
	}
	}
	},
	vimeo: {
	regexp: /https?:\/\/(?:www\.)?vimeo.com\/(?:channels\/(?:\w+\/)?\|groups\/([^\/])\/videos\/\|album\/(\d+)\/video\/\|)(\d+)(?:$\|\/\|\?)/i,
	dimensions: { width: 640, height: 390 },
	match: function(url) {
	return this.regexp.test(url);
	},
	preview: function(url, input, cb) {
	var self = this;
	if(input.match(self.regexp)) {
	$.getJSON('https://vimeo.com/api/v2/video/' + RegExp.$3 + '.json').success(function (d) {
	var video = {};
	video.host = 'vimeo.com';
	video.title = d[0].title;
	video.rawDescription = (d[0].description).replace(/\n/g, '<br/>').replace(/ /g, '<br/>');
	video.description = (d[0].description).replace(/((<\|<)br\s\/(>\|>)\r\n)/g, ' ').trunc(250, true);
	video.imageCount = 1;
	video.images = [d[0].thumbnail_medium];
	video.views = d[0].stats_number_of_plays;
	video.likes = d[0].stats_number_of_likes;
	video.url = d[0].url;
	video.width = self.dimensions.width;
	video.height = self.dimensions.height;
	video.id = d[0].id;
	video.source = 'vimeo';
	cb(input, video);
	});
	}
	}
	},
	url: {
	regexp: /((href\|src)=["']\|)(\b(https?\|ftp\|file):\/\/[-A-Z0-9+&@#\/%?=~_\|!:,.;]*[-A-Z0-9+&@#\/%=~_\|])/i,
	dimensions: { width: 640, height: 390 },
	match: function(url) {
	return this.regexp.test(url);
	},
	preview: function(url, input, cb) {
	var self = this;
	// This is an internal function that just calls the Node crawling function
	// I use HAPI to create my routes to reach the node code. Here you can use anything you want...
	HDV.crawlUrl(url, function(urlData) {
	urlData.width = self.dimensions.width;
	urlData.height = self.dimensions.height;
	cb(input, urlData);
	});
	}
	}
	}

	// Crawler object to be used
	// * Things to inproove
	// * crawl more than one URL
	// * define provider order
	// * return more a match url --> provider list, so we can perform more than one crawlling
	function hdCrawl() {
	var self = this;
	self.providers = [];
	}

	// in:
	// url --> calls the providers match function to see if the url matchs the matching function (general a regexp, but it could be anything you need to check the match)
	// we follow the hdCrawlProviders order. The first that matches is returned
	// out:
	// the provider if we matched any or null if no providers were found
	//
	hdCrawl.prototype.getProvider = function(url) {
	var self = this;
	self.providers = self.providers.length > 0 ? self.providers : _.keys(hdCrawlProviders);
	for (var i = 0; i < self.providers.length; i++) {
	var provider = hdCrawlProviders[self.providers[i]];
	if(provider.match(url))
	return provider;
	}
	return null;
	}

	// in:
	// * input --> the text to crawl
	// out (via callback):
	// * input, if modified by the provider
	// * the crawledInfo
	// var retData = {
	// host: internals.checkEmpty(urlParsed.host, ''), // the calling host. ex. http://google.com --> google.com
	// title: internals.checkEmpty(title, ''), // the crawled title
	// description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited by descriptionSizeLimite characters
	// rawDescription: internals.checkEmpty(description, ''), // the crawled description
	// imageCount: aImages.length, // image count
	// images: aImages, // the array of image urls collected
	// views: 0, // the views that the video has
	// likes: 0, // number of likes of the video
	// url: uri, // the actual incomming url
	// width: 0, // base width and heigth of the video
	// height: 0, // base width and heigth of the video
	// id: 0, // video id
	// source: 'url' // just to reference that we crawled an URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
	// };

	hdCrawl.prototype.crawl = function(input, cb) {
	var self = this;
	// first check if we have a URLs to process
	var urlRegex = /((href\|src)=["']\|)(\b(https?\|ftp\|file):\/\/[-A-Z0-9+&@#\/%?=~_\|!:,.;]*[-A-Z0-9+&@#\/%=~_\|])/ig;
	var urls = input.match(urlRegex);
	if(urls == null \|\| urls.length === 0) {
	cb(input, '', {});
	} else {
	// get and call the provider
	var url = urls[0];
	var provider = self.getProvider(url);
	if(provider != null && url !== '') {
	provider.preview(url, input, function(input, crawledInfo) {
	cb(input, crawledInfo);
	});
	} else {
	cb(input, '', {});
	}
	}
	}
	// Node file

	// Dependencies
	var _ = require('lodash');
	var Request = require('request');
	var Cheerio = require('cheerio');
	var URL = require('url');

	// **********************************************************************
	// Things to improove:
	// * Separate text crawling and image crawling for a better performance
	// * Create crawling providers, like: url, vimeo, youtube, etc.
	// * Add caching: in memory, redis, mongodb, memcached.
	// The caching would be url --> parsed url data (depending on your storage architecture, the best storages would be key/pair like Redis, memcached, etc.
	// * Add expiration to the cache
	// **********************************************************************

	internals = {
	tagImageLimit: 10, // if we do not have at least imageLimit images until we reach collecting from img tags, collect than this limit of img tags
	imageLimit: 5, // how many images do we want in general
	descriptionSizeLimite: 100 // quantity of characters to limit the description
	};

	// General helper functions
	internals.isEmpty = function(value) {
	return (value == null \|\| value === '' \|\| _.isUndefined(value) \|\| _.isEmpty(value));
	}

	internals.checkEmpty = function(value, newValue) {
	return (value != null && value !== '' && !_.isUndefined(value) && !_.isEmpty(value)) ? value : newValue;
	}

	internals.addImage = function(aImages, image, urlParsed) {
	var regExp = /^(https?:\/\/)?((([a-z\d]([a-z\d-][a-z\d]))\.)+[a-z]{2,}\|((\d{1,3}\.){3}\d{1,3}))(\:\d+)?(\/[-a-z\d%_@.~+&=!#$%\\(\)<>?])(\?[;&a-z\d%_@.~+&=!#$%\\(\)<>?])?(\#[-a-z\d_])?$/i;
	var addImage = false;
	if(aImages.length < internals.tagImageLimit) {
	if (!regExp.test(image)) {
	if (image != null && image !== '' && !_.isUndefined(image) && !_.isEmpty(image) && image.length >= 2) {
	// there are cases where CDNs start with //url.com/image, we do NOT accept this crazyness
	if (image[1] !== '/') {
	var imgUrlParsed = URL.parse(image);
	// check for relative url starting with / or without it (that is, just the image name and the path to it), than add the host before
	if (imgUrlParsed.protocol == null \|\| imgUrlParsed.protocol === '') {
	image = urlParsed.protocol + "//" + urlParsed.host + (image[0] === '/' ? image : ('/' + image));
	addImage = true;
	}
	}
	}
	} else
	addImage = true;
	}

	// we cannot check in this case for image extensions. More crazyness, some CDNs have a path to an image without the extension (arghhhhhhh) --> github is one of them
	if(addImage)
	aImages.push(image);
	return aImages;
	}

	internals.checkImage = function(image) {
	if(!_.isEmpty(image) && image !== '')
	return image;
	return '';
	}

	internals.elipses = function(value, size) {
	var plainText = value.trim();
	if(plainText.length > size)
	return plainText.substr(0,size) + "…";
	return plainText.replace(/((<\|<)br\s\/(>\|>)\r\n)/g, ' ');
	}

	// Crawling functions
	internals.crawl = function(uri, cb) {

	Request({ uri: uri }, function(err, response, body) {
	var noOpRetData = {
	host: '', title: '', rawDescription: '', description: '',
	imageCount: 0, images: [], views: 0, likes: 0, url: '', width: 0, height: 0, id: 0, source: 'url', sourceUrl: uri
	};

	if(err \|\| (response && response.statusCode !== 200) \|\| body == null \|\| body === '') {
	cb(null, noOpRetData);
	return;
	}

	var $;
	try {
	$ = Cheerio.load(body);
	var urlParsed = URL.parse(uri);

	// Get the title
	var title = internals.checkEmpty('', $("meta[property='og:title']").attr("content"));

	if(internals.isEmpty(title))
	title = $("meta[name='title']").attr("content");

	if(internals.isEmpty(title))
	title = $("title").text();

	// Get the description
	var description = internals.checkEmpty('', $("meta[name='description']").attr("content"));

	if(internals.isEmpty(description))
	description = $("meta[property='og:description']").attr("content");

	if(internals.isEmpty(description))
	description = $("meta[property='twitter:description']").attr("content");

	if(internals.isEmpty(description))
	description = $('h1').html();

	// Get Images
	var aImages = [];
	var gotFromOGorTwitter = false; // try to get images from OpenGraph or Twitter. If there is, than use just them

	// meta:name can be used
	$("meta[name='og:image']").each(function(i, elem) {
	var imageTmp = internals.checkImage($(elem).attr('content'));
	if(imageTmp !== '')
	aImages = internals.addImage(aImages, imageTmp, urlParsed);
	});

	// Limit to 5 images
	if(aImages.length < internals.imageLimit) {
	// meta:property is more comonly used
	$("meta[property='og:image']").each(function (i, elem) {
	var imageTmp = internals.checkImage($(elem).attr('content'));
	if (imageTmp !== '')
	aImages = internals.addImage(aImages, imageTmp, urlParsed);
	});
	}

	// Try to get througth twitter meta tags
	if(aImages.length === 0) {
	// get twitter:image
	$("meta[name='twitter:image']").each(function(i, elem) {
	var imageTmp = internals.checkImage($(elem).attr('content'));
	if(imageTmp !== '')
	aImages = internals.addImage(aImages, imageTmp, urlParsed);
	});
	}

	if(aImages.length === 0) {
	// get twitter:image:src
	$("meta[name='twitter:image:src']").each(function(i, elem) {
	var imageTmp = internals.checkImage($(elem).attr('content'));
	if(imageTmp !== '')
	aImages = internals.addImage(aImages, imageTmp, urlParsed);
	});
	}

	// if we gotten images from twitter or open graph, than stop here. No need to parse any longer
	// also, if we have already 5 (internals.imageLimit) images, than stop
	gotFromOGorTwitter = aImages.length > 0;

	if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
	// there can be up 10 (internals.tagImageLimit) images only if we parse from img tags
	$("img").each(function (i, elem) {
	var $el = $(elem);

	// check for lazyload --> UOL uses it (more originality)
	var imageTmp = $el.attr('data-original');

	if(_.isEmpty(imageTmp))
	imageTmp = internals.checkImage($el.attr('src'));

	if (imageTmp !== '')
	aImages = internals.addImage(aImages, imageTmp, urlParsed);
	});
	}

	if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
	// get background url or background-image if we have not gotten any images yet
	$('[style]').each(function (i, elem) {
	var aImages = /url\(.*?\)/ig.exec($(elem).html());
	if (aImages && aImages.length > 0) {
	for (var i = 0; i < aImages.length; i++) {
	var item = aImages[i];
	var imageTmp;
	imageTmp = item.substring(item.indexOf('(') + 1, item.indexOf(')'));
	// limit only images with extensions known. Crazy CDNs will be excluded.
	if (imageTmp.indexOf('jpg') !== -1 \|\| imageTmp.indexOf('png') !== -1 \|\|
	imageTmp.indexOf('gif') !== -1 \|\| imageTmp.indexOf('jpeg') !== -1) {
	if (imageTmp.indexOf("'") === -1 && imageTmp.indexOf("apos") === -1) {
	aImages = internals.addImage(aImages, imageTmp, urlParsed);
	}
	}
	}
	}
	});
	}

	if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
	// some dudes also use meta:itemprop --> such good imagination (Google is one of them)
	aImages = internals.addImage(aImages, $("meta[itemprop='image']").attr("content"), urlParsed);
	}

	// Crazy little trick. After making sure images do not repest (unique), reverse the array. Why? You might ask...
	// We assume that the most intersting images are the ones in the middle of the page. Reversing, makes sure that we get first the
	// most intersting ones.
	aImages = _.unique(aImages).reverse();

	var retData = {
	host: internals.checkEmpty(urlParsed.host, ''), // the calling host. ex. http://google.com --> google.com
	title: internals.checkEmpty(title, ''), // the crawled title
	description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited by descriptionSizeLimite characters
	rawDescription: internals.checkEmpty(description, ''), // the crawled description
	imageCount: aImages.length, // image count
	images: aImages, // the array of image urls collected
	views: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
	likes: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
	url: uri, // the actual incomming url
	width: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
	height: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
	id: 0, // used only for video crawling (it is here for compatibility when we add video crawling here
	source: 'url' // just to reference that we crawled an URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
	};

	cb(null, retData);
	} catch(ex) {
	cb(null, noOpRetData);
	}
	});

	}

	module.exports = {
	parse: internals.crawl
	};