Skip to content

Instantly share code, notes, and snippets.

@BudickDa
Last active April 8, 2017 08:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BudickDa/bb7adaf7aa5e4773ce88a2feb8b7fa61 to your computer and use it in GitHub Desktop.
Save BudickDa/bb7adaf7aa5e4773ce88a2feb8b7fa61 to your computer and use it in GitHub Desktop.
How to use Crawly McCrawlface in Meteor
/**
* This is server-side code and should be in /server
* This file must be imported into /server/main.js with import './crawler';
*/
import {Meteor} from 'meteor/meteor';
import {Mongo} from 'meteor/mongo';
import _ from 'underscore';
import Crawler from 'crawly-mccrawlface';
import Cheerio from 'cheerio';
// Mongo collection that persists cached crawler entries as {key, value, expire}.
const Cache = new Mongo.Collection('cache');

/**
 * Cache adapter handed to the crawler through `addCache`.
 *
 * Both methods are wrapped in `Meteor.bindEnvironment`; presumably this is so
 * the Mongo calls run inside Meteor's environment when the crawler invokes
 * them from its own callbacks — confirm against the crawler's cache contract.
 */
const cache = {
  /**
   * Looks up a cached value by key.
   *
   * @param {string} key - Cache key.
   * @returns {*} The stored value while `Date.now()` is below the entry's
   *   `expire` (compared as epoch milliseconds), otherwise `undefined`.
   */
  get: Meteor.bindEnvironment(function(key) {
    const doc = Cache.findOne({key: key});
    if (!doc) {
      return undefined;
    }
    if (Date.now() < doc.expire) {
      return doc.value;
    }
    // The entry is stale: remove it so it is not found again.
    Cache.remove(doc._id);
    return undefined;
  }),

  /**
   * Stores (or replaces) the entry for `key`.
   *
   * @param {string} key - Cache key.
   * @param {*} value - Value to store.
   * @param {number} expire - Compared against `Date.now()` in `get`, so it is
   *   treated as an epoch-millisecond timestamp.
   */
  set: Meteor.bindEnvironment(function(key, value, expire) {
    Cache.upsert({key: key}, {key: key, value: value, expire: expire});
  }),
};
/**
 * Starts a crawl of the Bavarian police press archive and waits until the
 * crawler reports that it is ready.
 *
 * @returns {Promise<*>} Resolves with whatever the crawler passes to its
 *   'ready' listeners; rejects if constructing or starting the crawler throws.
 */
const crawl = function() {
  // An exception thrown inside a Promise executor rejects the promise, so no
  // explicit try/catch is needed here.
  return new Promise((resolve) => {
    const c = new Crawler('http://www.polizei.bayern.de/news/presse/archiv/index.html?type=archiv&rubid=rub-4&period=fromto&periodto=04.04.2017&periodfrom=01.04.2010&periodselect=All&start=0', {
      readyIn: 50,
      goHaywire: false,
      userAgent: 'CrawlyMcCrawlface',
      // One week, in milliseconds.
      expireDefault: 7 * 24 * 60 * 60 * 1000
    });
    c.addCache(cache);
    // Subscribe before starting so a 'ready' emitted during start() is not lost.
    c.on('ready', resolve);
    c.start();
  });
};
/**
 * Collapses every run of whitespace (including newlines) into a single space.
 * Leading and trailing whitespace is collapsed as well, not removed.
 *
 * @param {string} string - Text to normalise.
 * @returns {string} The text with whitespace runs replaced by one space each.
 */
const clean = string => {
  // `\s` already matches `\n`, so a separate newline pass is unnecessary.
  return string.replace(/\s+/g, ' ');
};
/**
 * On server startup: run the crawl, stop the crawler once it is ready, and
 * pass every crawled page whose URL looks like a single report to the
 * 'insertReport' Meteor method.
 */
Meteor.startup(async function() {
  // A report URL contains "index.html/" followed by at least six digits.
  const reportUrlPattern = /index\.html\/[0-9]{6}/i;
  try {
    // Awaiting inside the try block keeps a failed crawl from surfacing as an
    // unhandled promise rejection.
    const crawler = await crawl();
    crawler.stop();
    crawler.sites.forEach(site => {
      if (!reportUrlPattern.test(site.url.href)) {
        return;
      }
      const $ = Cheerio.load(site.getContent());
      const text = clean($('body').text());
      const headline = clean($('title').text());
      const city = findCity(text);
      Meteor.call('insertReport', {
        headline: headline,
        text: text,
        city: city,
        url: site.url.href
      });
    });
  } catch (e) {
    console.error(e);
  }
});
/**
 * Placeholder for extracting the city a report refers to.
 *
 * @param {string} text - Report text; currently ignored.
 * @returns {string} Always the literal 'city' until real extraction is added.
 */
function findCity(text) {
  // TODO: do some NER magic in here.
  const placeholderCity = 'city';
  return placeholderCity;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment