Skip to content

Instantly share code, notes, and snippets.

@eviltester
Created January 18, 2017 18:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eviltester/42214ec42644bc939b90dc2eaa569652 to your computer and use it in GitHub Desktop.
Save eviltester/42214ec42644bc939b90dc2eaa569652 to your computer and use it in GitHub Desktop.
Example of screen scraping Amazon to get sales rank - the hassle of maintaining the Regex is a good reason to use the API
package uk.co.compendiumdev.amazon;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Example of screen scraping Amazon product pages to read a product's sales rank.
 *
 * <p>For each regional store URL the product page is fetched with jsoup, the
 * {@code #SalesRank} element is located, and a regular expression pulls the first
 * rank number out of that element's HTML. The result is printed to standard output;
 * nothing is asserted, so this "test" is a runnable demonstration rather than a check.
 *
 * <p>The scrape depends on live network access and on Amazon's current page markup.
 */
public class ScrapeSalesRankAmazon {

    /**
     * Product identifier substituted into every store URL format.
     * "0956733271" (the "dear evil tester" paperback) is the alternative noted by the author.
     */
    private static final String PRODUCT_ID = "B01D33OG6E"; // "0956733271");

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";

    /**
     * Matches the text following the closing bold tag of the rank label and captures the
     * rank digits together with any ',' or '.' separators.
     *
     * <p>The character class literally contains whitespace, '#', an apostrophe, 'N', 'r'
     * and '.', so prefixes such as "#" or "Nr. " in front of the number are skipped.
     * Compiled once because it is identical for every store.
     */
    private static final Pattern SALES_RANK_PATTERN =
            Pattern.compile(":<\\/b>[\\s#'Nr.']*([0-9,\\.]+)");

    /**
     * Fetches the product page from each listed Amazon store and prints the scraped
     * sales rank, or an error line when the rank cannot be extracted.
     *
     * <p>A failure for one store (fetch error, unexpected status, parse error) is reported
     * and the loop carries on with the next store.
     */
    @Test
    public void canScrapeSalesRank(){

        // dear evil tester paperback
        // https://www.amazon.com/dp/0956733271

        List<String> amazonUrlFormats = new ArrayList<String>();
        amazonUrlFormats.add("https://www.amazon.com/dp/%s"); // US
        amazonUrlFormats.add("https://www.amazon.co.uk/dp/%s"); // UK
        amazonUrlFormats.add("https://www.amazon.fr/dp/%s"); // FRANCE
        amazonUrlFormats.add("https://www.amazon.de/dp/%s"); // GERMANY
        //amazonUrlFormats.add("https://www.amazon.com.br/dp/%s"); // Brazil
        amazonUrlFormats.add("https://www.amazon.in/dp/%s"); // India
        amazonUrlFormats.add("https://www.amazon.it/dp/%s"); // Italy
        amazonUrlFormats.add("https://www.amazon.es/dp/%s"); // Spain

        for (String urlFormat : amazonUrlFormats) {
            String theUrl = String.format(urlFormat, PRODUCT_ID);
            System.out.println(theUrl);

            Document doc = fetchProductPage(theUrl);
            if (doc == null) {
                // Fetch or parse failed (already reported); move on to the next store
                // instead of dereferencing a null document.
                continue;
            }

            Elements salesRanks = doc.select("#SalesRank");
            if (salesRanks.size() < 1) {
                continue;
            }

            Element salesRank = salesRanks.first();
            String salesRankValue = extractSalesRank(salesRank.html());
            if (salesRankValue == null) {
                System.out.println("ERROR: Could not find sales rank with REGEX");
                continue;
            }

            System.out.println(salesRankValue);
            if (salesRankValue.length() < 1) {
                // The capture held only separators; dump the markup to help fix the regex.
                System.out.println(salesRank.html());
            }
        }
    }

    /**
     * Downloads and parses one product page.
     *
     * @param url fully formatted product URL
     * @return the parsed document, or {@code null} when the request fails, the status
     *         code is not 200, or the body cannot be parsed
     */
    private static Document fetchProductPage(String url) {
        try {
            Connection.Response response = Jsoup.
                    connect(url).
                    followRedirects(true).
                    userAgent(USER_AGENT).
                    execute();
            if (response.statusCode() != 200) {
                System.err.println("ERROR: Unexpected status " + response.statusCode() + " for " + url);
                return null;
            }
            return response.parse();
        } catch (IOException e) {
            System.err.println("ERROR: Could not retrieve " + url + " : " + e);
            return null;
        }
    }

    /**
     * Pulls the first sales rank number out of the HTML of the sales rank element.
     *
     * @param salesRankHtml inner HTML of the {@code #SalesRank} element
     * @return the rank with all '.' and ',' separators removed (may be empty if the
     *         capture held only separators), or {@code null} when the pattern does not match
     */
    private static String extractSalesRank(String salesRankHtml) {
        Matcher m = SALES_RANK_PATTERN.matcher(salesRankHtml);
        if (!m.find()) {
            return null;
        }
        return m.group(1).replaceAll("\\.", "").replaceAll(",", "");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment