Skip to content

Instantly share code, notes, and snippets.

@emirkin
Created June 2, 2012 17:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emirkin/2859254 to your computer and use it in GitHub Desktop.
Save emirkin/2859254 to your computer and use it in GitHub Desktop.
Scraping restaurants in Javascript
/*
* This example shows how to collect restaurant information and menus on the fly.
*/
// Instantiate Bobik client from Bobik SDK available at http://usebobik.com/sdk.
// You're welcome to link directly to the JS file; however, we make no guarantees about keeping the link unchanged.
// Thus, you should do so only when you have quick and immediate access to the place where this URL is used (e.g. during development).
// Single client instance shared by every bobik.scrape() call in this file; "YOUR_AUTH_TOKEN" is a placeholder to replace with a real token.
var bobik = new Bobik("YOUR_AUTH_TOKEN");
// Finds restaurant directory information (name, website, address, menu_url)
// for one neighborhood/cuisine pair, then looks up the menu of every restaurant found.
//
// neighborhood - path segment placed into the listing URL (e.g. 'soma')
// cuisine      - path segment placed into the listing URL (e.g. 'italian')
// print_as_they_become_available - optional, defaults to true.
//     truthy: menus are fetched with find_menus_async() (one request per restaurant)
//     falsy:  menus are fetched with find_menus_sync() (one request for all restaurants)
//
// Nothing is returned; progress and results are reported via console.log.
function find_restaurants(neighborhood, cuisine, print_as_they_become_available) {
  // Omitting the third argument keeps the original two-argument behavior.
  var use_async = (print_as_they_become_available === undefined)
    ? true
    : Boolean(print_as_they_become_available);

  console.log("Looking for " + cuisine + " restaurants in " + neighborhood + "...");
  var src_url = "http://sanfrancisco.menupages.com/restaurants/all-areas/" + neighborhood + "/" + cuisine;

  bobik.scrape({
    urls: [src_url],
    query_set: "menupages"
  }, function (scraped_data) {
    if (!scraped_data) {
      console.log("Data is unavailable");
      return;
    }

    // Results are bucketed by the url they were scraped from.
    var scraped = scraped_data[src_url];
    if (!scraped || scraped.length === 0) {
      console.log("Did not find any restaurants");
      return;
    }

    var restaurants = group_restaurants(scraped);
    // The scraped value is a hash of parallel arrays, so only the grouped
    // array can tell us reliably whether anything was found.
    if (restaurants.length === 0) {
      console.log("Did not find any restaurants");
      return;
    }
    console.log("Found " + restaurants.length + " restaurants");

    if (use_async) {
      find_menus_async(restaurants);
    } else {
      find_menus_sync(restaurants);
    }
  });
}
// A helper function that takes a hash of parallel arrays ('Name', 'Address', 'Url')
// and turns it into an array with one grouped hash per restaurant:
//   { name, address, website, menu_url }
// 'website' is the site root plus the scraped relative url; 'menu_url' is the
// website followed by "menu" (a "/" is inserted when the url lacks a trailing one).
// The number of results is driven by the 'Name' array; a missing array is treated
// as empty, and a restaurant without a url gets null for website and menu_url.
function group_restaurants(restaurants) {
  var site_root = "http://sanfrancisco.menupages.com";
  var names = restaurants['Name'] || []; // an array of names
  var addresses = restaurants['Address'] || []; // an array of addresses
  var urls = restaurants['Url'] || []; // an array of urls (relative to site_root)

  // A separate name is used so the parameter is not re-declared.
  var grouped = [];
  for (var i = 0; i < names.length; i++) {
    var website = null;
    var menu_url = null;
    if (typeof urls[i] === "string") {
      website = site_root + urls[i];
      // Avoid producing ".../some-restaurantmenu" when the trailing slash is absent.
      var separator = website.charAt(website.length - 1) === "/" ? "" : "/";
      menu_url = website + separator + "menu";
    }
    // push this restaurant to the array of results
    grouped.push({
      'name': names[i],
      'address': addresses[i],
      'website': website,
      'menu_url': menu_url
    });
  }
  return grouped;
}
// Finds menus for all restaurants and adds each menu to the corresponding restaurant hash
// under the 'menu' key.
// This variant sends one scrape request per restaurant without waiting for the previous
// one to complete, and prints each restaurant as soon as its menu arrives.
//
// restaurants - array of restaurant hashes, each carrying a 'menu_url'
function find_menus_async(restaurants) {
  console.log("Looking for menus...");
  // forEach gives every restaurant its own scope. With a plain loop and `var`,
  // all callbacks would share one `restaurant`/`menu_url` pair and, by the time
  // they fire, would all see the values of the last iteration.
  restaurants.forEach(function (restaurant) {
    var menu_url = restaurant['menu_url'];
    bobik.scrape({
      urls: [menu_url], // send only one at a time (and don't wait for it to complete before sending the next)
      query_set: "menu"
    }, function (scraped_data) {
      if (!scraped_data) {
        console.log("Data is unavailable");
        return;
      }
      // Results are bucketed by the url they were scraped from.
      restaurant['menu'] = scraped_data[menu_url];
      // Pass the object as its own argument so it is printed in full
      // rather than coerced to "[object Object]".
      console.log("Found restaurant:", restaurant);
    });
  });
}
// This variant of find_menus requests all menus in a single scrape call and prints
// the results only when all of them are ready. (The call itself is still
// callback-based; "sync" refers to the results being delivered together.)
// Each menu is added to the corresponding restaurant hash under the 'menu' key.
//
// restaurants - array of restaurant hashes, each carrying a 'menu_url'
function find_menus_sync(restaurants) {
  console.log("Looking for menus...");
  // Assemble a list of menu urls and a {url -> restaurant} map.
  // We need this map to match results (since they will be bucketed by url)
  var menu_urls = [];
  var url_to_restaurant = {};
  for (var i = 0; i < restaurants.length; i++) {
    var restaurant = restaurants[i];
    var menu_url = restaurant['menu_url'];
    menu_urls.push(menu_url);
    url_to_restaurant[menu_url] = restaurant;
  }
  bobik.scrape({
    urls: menu_urls,
    query_set: "menu"
  }, function (scraped_data) {
    for (var url in scraped_data) {
      // Ignore buckets for urls we did not ask for instead of failing on them.
      if (Object.prototype.hasOwnProperty.call(url_to_restaurant, url)) {
        url_to_restaurant[url]['menu'] = scraped_data[url];
      }
    }
    console.log(restaurants);
  });
}
// Go! (Uncomment the call below to run the example.)
//find_restaurants('soma', 'italian')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment