Skip to content

Instantly share code, notes, and snippets.

@emirkin
Created July 3, 2012 18:17
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save emirkin/3041523 to your computer and use it in GitHub Desktop.
Code to Scrape Drugstores
package com.example;
// Download Bobik SDK from http://usebobik.com/sdk
import android.util.Log;
import bobik.BobikClient;
import bobik.BobikHelper;
import bobik.Job;
import bobik.JobListener;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Implements search capability for a comparison shopping app
*/
public class ComparisonShoppingApp {
BobikClient bobik = new BobikClient(YOUR_AUTHENTICATION_TOKEN);
public void printDeals(String drug) throws Exception {
for (JSONObject option : findBestPrice(drug))
Log.d("Super Shopper", option.toString());
}
/**
* Searches the web for various buying options for a given drug
* and returns results sorted in accordance to the lowest price
* @param drug
* @return purchasing options, sorted by price
*/
public List<JSONObject> findBestPrice(String drug) throws Exception {
List<JSONObject> allOptions = findAllOptions(drug);
Collections.sort(allOptions, new Comparator<JSONObject>() {
@Override
public int compare(JSONObject jsonObject1, JSONObject jsonObject2) {
try {
double price1 = jsonObject1.getDouble("Price");
double price2 = jsonObject2.getDouble("Price");
return (price1 == price2)? 0 : (price1>price2? +1 : -1);
} catch (JSONException e) {
throw new RuntimeException(e);
}
}
});
return allOptions;
}
private String[] getSearchUrls(String keyword) {
try {
String encodedKeyword = java.net.URLEncoder.encode(keyword, "UTF-8");
return new String[]{
"http://www.cvs.com/search/_/N-3mZ2k?pt=product&searchTerm=" + encodedKeyword,
"http://www.myotcstore.com/store/Search.aspx?SearchTerms=" + encodedKeyword,
"http://www.familymeds.com/search/search-results.aspx?SearchTerm=" + encodedKeyword,
"http://www.canadadrugs.com/search.php?keyword=" + encodedKeyword,
"http://thebestonlinepharmacy.net/product.php?prod=" + encodedKeyword,
"http://www.walgreens.com/search/results.jsp?Ntt=" + encodedKeyword,
"http://www.drugstore.com/search/search_results.asp?N=0&Ntx=mode%2Bmatchallpartial&Ntk=All&Ntt=" + encodedKeyword
};
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}
/**
* Searches on the web for various buying options for a given drug
*
* @param drug
* @return An array of hashes containing some or all of the following elements:
* Title - product title
* Image - product image
* Price - generally a X.XX number, although there can be something as ugly as "$6.99\r\n2/$11.00 or 1/$5.99\r\n \r\nSavings: $1.00 (14%) on 1"
* Details - size, weight, and any additional information that could not be categorized easily
*/
public List<JSONObject> findAllOptions(String drug) throws Exception {
// First, find options in the raw form, then clean them up (transpose, normalize) and return
JSONObject request = new JSONObject();
for (String url : getSearchUrls(drug))
request.accumulate("urls", url);
for (String query_set : new String[]{"cvs", "MyOTCStore", "drugstore.com", "FamilyMeds", "walgreens", "CanadaDrugs", "thebestonlinepharmacy"})
request.accumulate("query_sets", query_set);
request.put("ignore_robots_txt", true);
final List<JSONObject> results = new ArrayList<JSONObject>();
Job job = bobik.scrape(request, new JobListenerImpl() {
@Override
public void onSuccess(JSONObject jsonObject) {
// Aggregate results across all search urls
Iterator search_urls = jsonObject.keys();
while (search_urls.hasNext()) {
String search_url = (String)search_urls.next();
String url_base = getUrlBase(search_url);
try {
JSONObject results_parallel_arrays_of_attributes = jsonObject.getJSONObject(search_url);
if (results_parallel_arrays_of_attributes.getJSONArray("Price").length() == 0)
continue; // no priced results from this source
List<JSONObject> results_from_this_url = BobikHelper.transpose(results_parallel_arrays_of_attributes);
// Perform some remaining cleanup
for (JSONObject r : results_from_this_url) {
// 1. Make urls absolute
for (String link_key : new String[]{"Image", "Link"}) {
try {
r.put(link_key, url_base + r.get(link_key));
} catch (JSONException e) {
// continue to the next result if Image or Link is missing
}
}
// 2. Extract price
r.put("Price", cleanPrice(r.getString("Price")));
}
results.addAll(results_from_this_url);
} catch (JSONException e) {
e.printStackTrace();
// continue to the next store if this search url is broken
}
}
}
});
// Feel free to remove this call if you'd rather show results as they become available
job.waitForCompletion();
return results;
}
/**
* Cleans up a price fragment.
* Example:
* $6.99\r\n2/$11.00 or 1/$5.99\r\n \r\nSavings: $1.00 (14%) on 1
* will become
* 6.99
* @param priceBlurb
* @throws IllegalArgumentException if price cannot be determined. If such an exception is thrown,
* it's best to either to keep the original text fragment or throw away the result
* @return a single simplest price number
*/
private static final Pattern price_pattern = Pattern.compile("\\$[0-9]+\\.?[0-9]?[0-9]?");
private double cleanPrice(String priceBlurb) throws IllegalArgumentException {
try {
Matcher matcher = price_pattern.matcher(priceBlurb);
if (matcher.find()) {
priceBlurb = matcher.group(0);
if (priceBlurb.startsWith("$"))
priceBlurb = priceBlurb.substring(1);
}
return Double.parseDouble(priceBlurb);
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException("No price found");
}
}
private String getUrlBase(String url) {
int slashslash = url.indexOf("//") + 2;
return url.substring(0, url.indexOf('/', slashslash));
}
// Since we don't expect any errors and don't care about progress (in this example),
// stub 2 functions with simple loggers
private abstract static class JobListenerImpl extends JobListener {
@Override
public void onProgress(float currentProgress) {
Log.d(log_tag, "Current progress for job " + job.id() + " is " + currentProgress * 100 + "%");
}
@Override
public void onErrors(Collection<String> errors){
for (String s : errors)
Log.e(log_tag, "Error for job " + job.id() + ": " + s);
}
private final String log_tag = "Super Shopper";
};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment