A scraper based on Node.js, written in CoffeeScript and using the cheerio
and request
packages.
This is an equivalent of the pjscrape example, but much faster:
# Bus 0.1(alpha) | |
# (c) 2010 John Wright, QuickLeft Inc. | |
# Bus may be freely distributed under the MIT license. | |
# For all details and documentation: | |
# http://github.com/mrjjwright/Bus | |
# | |
# | |
# | |
# Bus would not be possible without Jeremy Ashkenas who wrote CoffeeScript, the language |
// Lookup table from operation name ('create', 'update', 'delete', 'read')
// to the HTTP verb string paired with it.
// NOTE(review): the code that consumes this table is not visible in this excerpt.
methodMap = { | |
'create': 'POST', | |
'update': 'PUT', | |
'delete': 'DELETE', | |
'read': 'GET' | |
}; | |
getUrl = function(object) { | |
if (!(object && object.url)) { | |
throw new Error("A 'url' property or function must be specified"); | |
} else { |
require "nosqlite" | |
require "Math.uuid" | |
# Assigns a guid from Math.uuidFast() to a row whose `guid` is null or
# undefined, and returns the row.
# NOTE(review): indentation was lost in this excerpt; `return row` presumably
# sits outside the `if` so every row is returned — confirm against the original.
convert_callback: (row) -> | |
if not row.guid? | |
row.guid: Math.uuidFast(); | |
return row; | |
db_file: "my_db.sqlite" |
require.paths.unshift("/Users/johnw/js/node_modules") | |
require("./underscore") | |
sys: require("sys") | |
rest: require("restler") | |
rest.get('http://github.com/api/v2/json/repos/show/mrjjwright').addListener('complete', | |
((data) -> | |
repositories: JSON.parse(data).repositories | |
for repository in repositories |
cheerio = require('cheerio') | |
Shred = require('shred') | |
shred = new Shred() | |
http = require('http') | |
URL = require('url') | |
server = http.createServer (request, response) -> | |
url = URL.parse(request.url, true) | |
urlToDiscover = url.query['url'] | |
startDiscovery urlToDiscover, (theImageURL) -> |
var fs = require('fs'), | |
async = require('async'); | |
var try_series = function(func_name, func, data, times, cb){ | |
var start = new Date(); | |
var tries = new Array(times); | |
for(var i = 0 ; i < times ; i++) { | |
tries[i] = function(callback){func(data, callback);}; | |
} | |
async.series(tries, function(err,result){ |
var cheerio = require('cheerio') | |
var request = require('request') | |
var pictureTube = require('picture-tube') | |
var url = require('url') | |
var async = require('async') | |
var site = process.argv[2] | |
console.log('fetching', site) | |
request(site, function(e,r,b) { |
var http = require('http'), | |
https = require('https'), | |
Iconv = require('iconv').Iconv, | |
iconv = new Iconv('EUC-JP', 'UTF-8//TRANSLIT//IGNORE'), | |
cheerio = require('cheerio'), | |
request = require('request'); | |
// Script-level settings: a site URL and a port number.
// NOTE(review): how `site` and `port` are consumed is not visible in this excerpt.
var site = 'http://www.hit.ac.jp/gakusei/chgschool/', | |
port = 8880; |
var sip = require('sip'); | |
var sys = require('sys'); | |
var redis = require('redis'); | |
/**
 * Trim leading and trailing whitespace from a string value.
 *
 * @param {string} str - The text to trim.
 * @returns {string} `str` without leading or trailing whitespace.
 */
function trim(str) {
  // The built-in strips the same whitespace set as the former /^\s+|\s+$/g regex.
  return str.trim();
}
sip.start({},function(request) { |