Last active
April 8, 2017 08:35
-
-
Save BudickDa/bb7adaf7aa5e4773ce88a2feb8b7fa61 to your computer and use it in GitHub Desktop.
How to use Crawly McCrawlface in Meteor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * This is server-side code and should live in /server.
 * This file must be imported into /server/main.js with: import './crawler';
 */
import {Meteor} from 'meteor/meteor'; | |
import {Mongo} from 'meteor/mongo'; | |
import _ from 'underscore'; | |
import Crawler from 'crawly-mccrawlface'; | |
import Cheerio from 'cheerio'; | |
// Mongo collection that backs the crawler cache adapter below.
const Cache = new Mongo.Collection('cache');

/**
 * Cache adapter handed to the crawler through `addCache`.
 *
 * Both methods are wrapped in `Meteor.bindEnvironment` — presumably because
 * the crawler invokes them from callbacks outside Meteor's own environment
 * (TODO confirm against the crawler's cache contract).
 */
const cache = {
  /**
   * Look up a cached value.
   *
   * @param {*} key - Value stored in the `key` field of the cache document.
   * @returns {*} The stored `value` while the entry is still valid,
   *   otherwise `undefined` (missing or expired entry).
   */
  get: Meteor.bindEnvironment(function(key) {
    const doc = Cache.findOne({key: key});
    if (!doc) {
      return;
    }
    // `expire` is compared against the current epoch time in milliseconds.
    if (new Date().getTime() < doc.expire) {
      return doc.value;
    }
    // Stale entry: drop it so the collection does not keep expired documents.
    Cache.remove(doc._id);
    return;
  }),

  /**
   * Insert or replace the cache document for `key`.
   *
   * @param {*} key - Identifier of the cache entry.
   * @param {*} value - Payload to store.
   * @param {number} expire - Expiry threshold; `get` treats the entry as
   *   valid while `Date#getTime()` is below this number.
   */
  set: Meteor.bindEnvironment(function(key, value, expire) {
    // The original call was missing the closing brace of the replacement document.
    Cache.upsert({key: key}, {key: key, value: value, expire: expire});
  })
};
/**
 * Start a crawl of the hard-coded police press-archive URL and wait until the
 * crawler reports that it is ready.
 *
 * @returns {Promise<*>} Resolves with whatever the crawler passes to its
 *   `'ready'` listeners (the caller uses it as the crawler instance —
 *   TODO confirm against the library). Rejects when constructing,
 *   configuring, or starting the crawler throws synchronously.
 */
const crawl = function() {
  return new Promise((resolve, reject) => {
    try {
      const c = new Crawler('http://www.polizei.bayern.de/news/presse/archiv/index.html?type=archiv&rubid=rub-4&period=fromto&periodto=04.04.2017&periodfrom=01.04.2010&periodselect=All&start=0', {
        readyIn: 50,
        goHaywire: false,
        userAgent: 'CrawlyMcCrawlface',
        // 7 * 24 * 60 * 60 * 1000 — one week if the unit is milliseconds (TODO confirm).
        expireDefault: 7 * 24 * 60 * 60 * 1000
      });
      c.addCache(cache);
      // Subscribe before starting so a 'ready' emitted during start() is not missed.
      c.on('ready', resolve);
      c.start();
    } catch (e) {
      reject(e);
    }
  });
};
/**
 * Collapse every run of whitespace (including newlines and tabs) into a
 * single space. Leading and trailing whitespace is collapsed, not trimmed.
 *
 * @param {string} string - Raw text, e.g. extracted from an HTML document.
 * @returns {string} The text with whitespace runs replaced by one space each.
 */
const clean = string => {
  // `\s+` already matches newlines, so a separate `\n` pass is unnecessary.
  return string.replace(/\s+/g, ' ');
};
/**
 * On server startup: run the crawl, stop the crawler once it is ready, and
 * submit one report per crawled site whose URL matches the report pattern
 * through the `insertReport` Meteor method.
 *
 * Any failure (crawl rejection included) is logged instead of escaping as an
 * unhandled promise rejection.
 */
Meteor.startup(async function() {
  try {
    // Awaited inside the try so a rejected crawl is caught and logged too.
    const crawler = await crawl();
    crawler.stop();

    // Only URLs containing "index.html/" followed by six digits are processed.
    const reportUrlPattern = /index\.html\/[0-9]{6}/i;

    crawler.sites.forEach(site => {
      if (!reportUrlPattern.test(site.url.href)) {
        return;
      }
      const $ = Cheerio.load(site.getContent());
      const text = clean($('body').text());
      const headline = clean($('title').text());
      const city = findCity(text);
      Meteor.call('insertReport', {
        headline: headline,
        text: text,
        city: city,
        url: site.url.href
      });
    });
  } catch (e) {
    console.log(e);
  }
});
/**
 * Placeholder for city extraction from a report's text.
 *
 * Currently a stub: the argument is ignored and a constant is returned.
 * The intended implementation is named-entity recognition over `text`.
 *
 * @param {string} text - Cleaned body text of a report (currently unused).
 * @returns {string} Always the literal string 'city'.
 */
function findCity(text) {
  /* do some NER magic in here */
  return 'city';
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment