// Code to Scrape Drugstores
// Source: public gist, file "gistfile1.java" (Java).
package com.example;
 
// Download Bobik SDK from http://usebobik.com/sdk
import android.util.Log;
import bobik.BobikClient;
import bobik.BobikHelper;
import bobik.Job;
import bobik.JobListener;
 
import org.json.JSONException;
import org.json.JSONObject;
 
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
* Implements search capability for a comparison shopping app
*/
public class ComparisonShoppingApp {
 
BobikClient bobik = new BobikClient(YOUR_AUTHENTICATION_TOKEN);
 
 
public void printDeals(String drug) throws Exception {
for (JSONObject option : findBestPrice(drug))
Log.d("Super Shopper", option.toString());
}
 
 
/**
* Searches the web for various buying options for a given drug
* and returns results sorted in accordance to the lowest price
* @param drug
* @return purchasing options, sorted by price
*/
public List<JSONObject> findBestPrice(String drug) throws Exception {
List<JSONObject> allOptions = findAllOptions(drug);
Collections.sort(allOptions, new Comparator<JSONObject>() {
@Override
public int compare(JSONObject jsonObject1, JSONObject jsonObject2) {
try {
double price1 = jsonObject1.getDouble("Price");
double price2 = jsonObject2.getDouble("Price");
return (price1 == price2)? 0 : (price1>price2? +1 : -1);
} catch (JSONException e) {
throw new RuntimeException(e);
}
}
});
return allOptions;
}
 
private String[] getSearchUrls(String keyword) {
try {
String encodedKeyword = java.net.URLEncoder.encode(keyword, "UTF-8");
return new String[]{
"http://www.cvs.com/search/_/N-3mZ2k?pt=product&searchTerm=" + encodedKeyword,
"http://www.myotcstore.com/store/Search.aspx?SearchTerms=" + encodedKeyword,
"http://www.familymeds.com/search/search-results.aspx?SearchTerm=" + encodedKeyword,
"http://www.canadadrugs.com/search.php?keyword=" + encodedKeyword,
"http://thebestonlinepharmacy.net/product.php?prod=" + encodedKeyword,
"http://www.walgreens.com/search/results.jsp?Ntt=" + encodedKeyword,
"http://www.drugstore.com/search/search_results.asp?N=0&Ntx=mode%2Bmatchallpartial&Ntk=All&Ntt=" + encodedKeyword
};
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}
 
/**
* Searches on the web for various buying options for a given drug
*
* @param drug
* @return An array of hashes containing some or all of the following elements:
* Title - product title
* Image - product image
* Price - generally a X.XX number, although there can be something as ugly as "$6.99\r\n2/$11.00 or 1/$5.99\r\n \r\nSavings: $1.00 (14%) on 1"
* Details - size, weight, and any additional information that could not be categorized easily
*/
public List<JSONObject> findAllOptions(String drug) throws Exception {
// First, find options in the raw form, then clean them up (transpose, normalize) and return
JSONObject request = new JSONObject();
for (String url : getSearchUrls(drug))
request.accumulate("urls", url);
for (String query_set : new String[]{"cvs", "MyOTCStore", "drugstore.com", "FamilyMeds", "walgreens", "CanadaDrugs", "thebestonlinepharmacy"})
request.accumulate("query_sets", query_set);
request.put("ignore_robots_txt", true);
 
final List<JSONObject> results = new ArrayList<JSONObject>();
Job job = bobik.scrape(request, new JobListenerImpl() {
 
@Override
public void onSuccess(JSONObject jsonObject) {
// Aggregate results across all search urls
Iterator search_urls = jsonObject.keys();
while (search_urls.hasNext()) {
String search_url = (String)search_urls.next();
String url_base = getUrlBase(search_url);
try {
JSONObject results_parallel_arrays_of_attributes = jsonObject.getJSONObject(search_url);
if (results_parallel_arrays_of_attributes.getJSONArray("Price").length() == 0)
continue; // no priced results from this source
List<JSONObject> results_from_this_url = BobikHelper.transpose(results_parallel_arrays_of_attributes);
// Perform some remaining cleanup
for (JSONObject r : results_from_this_url) {
// 1. Make urls absolute
for (String link_key : new String[]{"Image", "Link"}) {
try {
r.put(link_key, url_base + r.get(link_key));
} catch (JSONException e) {
// continue to the next result if Image or Link is missing
}
}
// 2. Extract price
r.put("Price", cleanPrice(r.getString("Price")));
}
results.addAll(results_from_this_url);
} catch (JSONException e) {
e.printStackTrace();
// continue to the next store if this search url is broken
}
}
}
});
// Feel free to remove this call if you'd rather show results as they become available
job.waitForCompletion();
return results;
}
 
/**
* Cleans up a price fragment.
* Example:
* $6.99\r\n2/$11.00 or 1/$5.99\r\n \r\nSavings: $1.00 (14%) on 1
* will become
* 6.99
* @param priceBlurb
* @throws IllegalArgumentException if price cannot be determined. If such an exception is thrown,
* it's best to either to keep the original text fragment or throw away the result
* @return a single simplest price number
*/
private static final Pattern price_pattern = Pattern.compile("\\$[0-9]+\\.?[0-9]?[0-9]?");
private double cleanPrice(String priceBlurb) throws IllegalArgumentException {
try {
Matcher matcher = price_pattern.matcher(priceBlurb);
if (matcher.find()) {
priceBlurb = matcher.group(0);
if (priceBlurb.startsWith("$"))
priceBlurb = priceBlurb.substring(1);
}
return Double.parseDouble(priceBlurb);
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException("No price found");
}
}
 
private String getUrlBase(String url) {
int slashslash = url.indexOf("//") + 2;
return url.substring(0, url.indexOf('/', slashslash));
}
 
// Since we don't expect any errors and don't care about progress (in this example),
// stub 2 functions with simple loggers
private abstract static class JobListenerImpl extends JobListener {
@Override
public void onProgress(float currentProgress) {
Log.d(log_tag, "Current progress for job " + job.id() + " is " + currentProgress * 100 + "%");
}
 
@Override
public void onErrors(Collection<String> errors){
for (String s : errors)
Log.e(log_tag, "Error for job " + job.id() + ": " + s);
}
 
private final String log_tag = "Super Shopper";
};
}

// Gist page footer (not part of the program): "Please sign in to comment on this gist."