Created
September 6, 2014 18:28
-
-
Save andrewxhill/f906c9453ca35d93c998 to your computer and use it in GitHub Desktop.
Node.js app to harvest a georss feed into CartoDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var express = require('express'); | |
var app = express(); | |
var http = require('http'), | |
https = require('https'), | |
querystring = require('querystring'), | |
Step = require('step'), | |
fs = require('fs'), | |
// jQuery = require('jquery'); | |
parseString = require('xml2js').parseString; | |
var config = {} | |
try { | |
var config = require('./config') | |
} catch(e) { } | |
String.prototype.format = (function (i, safe, arg) { | |
function format() { | |
var str = this, | |
len = arguments.length + 1; | |
for (i = 0; i < len; arg = arguments[i++]) { | |
safe = typeof arg === 'object' ? JSON.stringify(arg) : arg; | |
str = str.replace(RegExp('\\{' + (i - 1) + '\\}', 'g'), safe); | |
} | |
return str; | |
} | |
//format.native = String.prototype.format; | |
return format; | |
})(); | |
function sql_escape (str) { | |
return str.replace(/[\0\x08\x09\x1a\n\r"'\\\%]/g, function (char) { | |
switch (char) { | |
case "\0": | |
return "\\0"; | |
case "\x08": | |
return "\\b"; | |
case "\x09": | |
return "\\t"; | |
case "\x1a": | |
return "\\z"; | |
case "\n": | |
return "\\n"; | |
case "\r": | |
return "\\r"; | |
case "'": | |
return "''"; | |
case "\"": | |
case "\\": | |
case "%": | |
return "\\"+char; // prepends a backslash to backslash, percent, | |
// and double/single quotes | |
} | |
}); | |
} | |
function callCartoDB(sql) { | |
var post_data = querystring.stringify({ | |
'api_key' : config.cartodb.api_key, | |
'q': sql | |
}); | |
var post_options = { | |
host: config.cartodb.username+'.cartodb.com', | |
port: '80', | |
path: '/api/v2/sql', | |
method: 'POST', | |
headers: { | |
'Content-Type': 'application/x-www-form-urlencoded', | |
'Content-Length': post_data.length | |
} | |
}; | |
var post_req = http.request(post_options, function(res) { | |
res.setEncoding('utf8'); | |
res.on('data', function (chunk) { | |
process.stdout.write("."); | |
}); | |
}); | |
// console.log(post_data) | |
post_req.write(post_data); | |
post_req.end(); | |
} | |
function get_category_label(entry){ | |
var label = null; | |
if (entry.category != undefined){ | |
if (entry.category[0].$ != undefined){ | |
if (entry.category[0].$.label != undefined){ | |
label = entry.category[0].$.label; | |
} | |
} | |
} | |
return label | |
} | |
function get_category_term(entry){ | |
var term = null; | |
if (entry.category != undefined){ | |
if (entry.category[0].$ != undefined){ | |
if (entry.category[0].$.term != undefined){ | |
term = entry.category[0].$.term; | |
} | |
} | |
} | |
return term | |
} | |
function get_content(entry){ | |
var content = null; | |
// console.log(entry.content[0]['_']) | |
if (entry.content != undefined){ | |
if ("_" in entry.content[0]){ | |
content = entry.content[0]['_'].replace(/\t/g, '').replace(/\n/g,''); | |
} | |
} | |
return content | |
} | |
function get_point(entry){ | |
var point = null; | |
if ("georss:point" in entry){ | |
point = entry["georss:point"][0].split(' '); | |
point = 'CDB_LatLng({0}, {1})'.format(point[0], point[1]) | |
} | |
return point | |
} | |
function toSqlTime(val){ | |
val = val.trim(); | |
val = val == '' ? null : "'{0}'::timestamp".format(val); | |
return val | |
} | |
function toSqlString(val){ | |
val = val.trim(); | |
val = val == '' ? null : "'{0}'".format(val); | |
return val | |
} | |
var options = { | |
host: 'www.portlandonline.com', | |
path: '/scripts/911incidents.cfm' | |
}; | |
var base_sql = "WITH n(id, title, updated, published, summary, label, term, the_geom, content) AS (VALUES {0}) INSERT INTO portland_911_feed (id, title, updated, published, summary, label, term, the_geom, content) SELECT n.id, n.title, n.updated, n.published, n.summary, n.label, n.term, n.the_geom, n.content FROM n WHERE n.id NOT IN (SELECT id FROM portland_911_feed)"; | |
var total_inserts = 0; | |
var last_ids = []; | |
function run(){ | |
var sql = []; | |
var req = http.get(options, function(res) { | |
// console.log('STATUS: ' + res.statusCode); | |
// console.log('HEADERS: ' + JSON.stringify(res.headers)); | |
// Buffer the body entirely for processing as a whole. | |
var bodyChunks = []; | |
res.on('data', function(chunk) { | |
// You can process streamed parts here... | |
bodyChunks.push(chunk); | |
}).on('end', function() { | |
var xml = Buffer.concat(bodyChunks); | |
// var xml = "<root>Hello xml2js!</root>" | |
parseString(xml, function (err, result) { | |
for (i in result.feed.entry){ | |
var id = toSqlString(result.feed.entry[i].id[0]); | |
if (last_ids.indexOf(id) == -1) { | |
var curr_sql = | |
'({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8})'.format( | |
id, | |
toSqlString(result.feed.entry[i].title[0]), | |
toSqlTime(result.feed.entry[i].updated[0]), | |
toSqlTime(result.feed.entry[i].published[0]), | |
toSqlString(result.feed.entry[i].summary[0]), | |
toSqlString(get_category_label(result.feed.entry[i])), | |
toSqlString(get_category_term(result.feed.entry[i])), | |
get_point(result.feed.entry[i]), | |
toSqlString(get_content(result.feed.entry[i])) | |
); | |
total_inserts = total_inserts+1; | |
sql.push(curr_sql); | |
} | |
last_ids.push(id); | |
} | |
// console.log(sql.length) | |
callCartoDB(base_sql.format(sql.join())); | |
while (last_ids.length > 100){ | |
last_ids.shift(); | |
} | |
}); | |
// console.log('BODY: ' + body); | |
// ...and/or process the entire body here. | |
}) | |
}); | |
req.on('error', function(e) { | |
// console.log('ERROR: ' + e.message); | |
}); | |
} | |
function start(){ | |
run() | |
setInterval(run, 1800000); | |
} | |
app.set('port', (process.env.PORT || 5000)) | |
app.use(express.static(__dirname + '/public')) | |
app.get('/', function(request, response) { | |
response.send('Your app has made {0} total inserts'.format(total_inserts)) | |
}) | |
app.listen(app.get('port'), function() { | |
console.log("Node app is running at localhost:" + app.get('port')); | |
start(); | |
}) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var config = {}; | |
config.cartodb = { | |
username : "andrew", | |
api_key : "mykey" | |
} | |
module.exports = config; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "skylinepdx", | |
"version": "0.0.1", | |
"main": "index.js", | |
"description": "GeoRSS feed to CartoDB", | |
"licenses": [{ "type": "MIT" }], | |
"author": { | |
"name": "Andrew W. Hill, CartoDB", | |
"email": "andrew@cartodb.com" | |
}, | |
"engines": { | |
"node": "0.10.x" | |
}, | |
"scripts": { | |
"start": "node index.js" | |
}, | |
"dependencies": { | |
"forever-monitor": "1.2.x", | |
"express": "4.x.x", | |
"xml2js": "0.4.x", | |
"http":"0.0.x", | |
"requestify":"0.1.x", | |
"step": "0.0.4" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment