Created
January 18, 2017 18:34
-
-
Save eviltester/42214ec42644bc939b90dc2eaa569652 to your computer and use it in GitHub Desktop.
Example of screen scraping Amazon to get sales rank - the hassle of maintaining the Regex is a good reason to use the API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package uk.co.compendiumdev.amazon;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ScrapeSalesRankAmazon { | |
@Test | |
public void canScrapeSalesRank(){ | |
// dear evil tester paperback | |
// https://www.amazon.com/dp/0956733271 | |
List<String> amazonUrlFormats = new ArrayList<String>(); | |
amazonUrlFormats.add("https://www.amazon.com/dp/%s"); // US | |
amazonUrlFormats.add("https://www.amazon.co.uk/dp/%s"); // UK | |
amazonUrlFormats.add("https://www.amazon.fr/dp/%s"); // FRANCE | |
amazonUrlFormats.add("https://www.amazon.de/dp/%s"); // GERMANY | |
//amazonUrlFormats.add("https://www.amazon.com.br/dp/%s"); // Brazil | |
amazonUrlFormats.add("https://www.amazon.in/dp/%s"); // India | |
amazonUrlFormats.add("https://www.amazon.it/dp/%s"); // Italy | |
amazonUrlFormats.add("https://www.amazon.es/dp/%s"); // Spain | |
for(String urlFormat : amazonUrlFormats){ | |
Connection.Response response=null; | |
try { | |
String theUrl= String.format(urlFormat, "B01D33OG6E"); // "0956733271"); | |
System.out.println(theUrl); | |
response = Jsoup. | |
connect(theUrl). | |
followRedirects(true). | |
userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"). | |
execute(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
if(response!=null && response.statusCode()==200){ | |
Document doc = null; | |
try { | |
doc = response.parse(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
Elements salesRanks = doc.select("#SalesRank"); | |
if(salesRanks.size()>0) { | |
Element salesRank = salesRanks.first(); | |
String pattern = ":<\\/b>[\\s#'Nr.']*([0-9,\\.]+)"; | |
Pattern p = Pattern.compile(pattern); | |
Matcher m = p.matcher(salesRank.html()); | |
String salesRankValue=""; | |
if (m.find()) { | |
salesRankValue = m.group(1).replaceAll("\\.", "").replaceAll(",",""); | |
System.out.println(salesRankValue); | |
if(salesRankValue.length()<1){ | |
System.out.println(salesRank.html()); | |
} | |
} else { | |
System.out.println("ERROR: Could not find sales rank with REGEX"); | |
} | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment