Skip to content

Instantly share code, notes, and snippets.

@TruffleClock
Forked from kentbye/app-store-ratings.php
Last active October 16, 2018 21:19
Show Gist options
  • Save TruffleClock/5676475 to your computer and use it in GitHub Desktop.
Save TruffleClock/5676475 to your computer and use it in GitHub Desktop.
Scrape app reviews from iTunes.
<?php
/**
*
* An updated version of kentbye's gist to reflect changes in iTunes and fix some errors
*
* Scrape app reviews from iTunes.
*
* Set the iOS app id and the number of pages to scrape, and it creates a {$app_id}-reviews.csv file
* Increase set_time_limit(N) if your query runs out of time before completion
* Add/remove countries from the $countries array as needed
*
* Original author:
* @author Kent Bye <kent@kentbye.com>
* Modified and extended from Sean Murphy's gist at https://gist.github.com/1878352
* Modified and updated from kentbye's gist at https://gist.github.com/kentbye/3740357
*/
// iTunes id of the app whose reviews are scraped.
// To find it, right-click the app icon in iTunes and copy the link; the id is the number
// that follows "id" in a URL such as http://itunes.apple.com/us/app/netflix/id363590051?mt=8
$app_id = '363590051'; // Netflix app
// How many review pages to request per country. Pick "All Versions" of the reviews in iTunes
// to see what the last page number is. The default of one grabs the latest 10 results.
// WARNING: Do not fetch all reviews from all countries if your app has many pages of reviews
// (100~200+), as Apple will block your IP for a while.
$total_number_of_review_pages = 1;
// When requesting many pages the script may need more execution time: uncomment the call
// below and replace the default 30 (seconds) with a value that suits your query.
//set_time_limit(30);
// Collected review rows, keyed by their position on the page
$results = array();
// Storefronts to scrape: by default, all reviews from all countries.
// Each entry carries the value sent in the X-Apple-Store-Front header ("storefront") and a
// display name that ends up in the "Country" column of the CSV.
// Remove unnecessary countries from the list if needed.
// NOTE: JSON allows line breaks between tokens but NOT inside a string value; a raw line
// break inside a name makes json_decode() return NULL, so keep every entry on its own line.
$countries = json_decode('[
{"storefront":"143455-6,12","name":"Canada"},
{"storefront":"143441-1,12","name":"United States"},
{"storefront":"143565,12","name":"Belarus"},
{"storefront":"143446-2,12","name":"Belgium"},
{"storefront":"143526,12","name":"Bulgaria"},
{"storefront":"143494,12","name":"Croatia"},
{"storefront":"143557-2,12","name":"Cyprus"},
{"storefront":"143489,12","name":"Czech Republic"},
{"storefront":"143458-2,12","name":"Denmark"},
{"storefront":"143443,12","name":"Deutschland"},
{"storefront":"143454-8,12","name":"España"},
{"storefront":"143518,12","name":"Estonia"},
{"storefront":"143447-2,12","name":"Finland"},
{"storefront":"143442,12","name":"France"},
{"storefront":"143448,12","name":"Greece"},
{"storefront":"143482,12","name":"Hungary"},
{"storefront":"143558,12","name":"Iceland"},
{"storefront":"143449,12","name":"Ireland"},
{"storefront":"143450,12","name":"Italia"},
{"storefront":"143519,12","name":"Latvia"},
{"storefront":"143520,12","name":"Lithuania"},
{"storefront":"143451-2,12","name":"Luxembourg"},
{"storefront":"143530,12","name":"Macedonia"},
{"storefront":"143521,12","name":"Malta"},
{"storefront":"143523,12","name":"Moldova"},
{"storefront":"143452,12","name":"Nederland"},
{"storefront":"143457-2,12","name":"Norway"},
{"storefront":"143445,12","name":"Österreich"},
{"storefront":"143478,12","name":"Poland"},
{"storefront":"143453,12","name":"Portugal"},
{"storefront":"143487,12","name":"Romania"},
{"storefront":"143496,12","name":"Slovakia"},
{"storefront":"143499,12","name":"Slovenia"},
{"storefront":"143456,12","name":"Sverige"},
{"storefront":"143459-2,12","name":"Switzerland"},
{"storefront":"143480,12","name":"Turkey"},
{"storefront":"143444,12","name":"United Kingdom"},
{"storefront":"143469,12","name":"Россия"},
{"storefront":"143563,12","name":"Algeria"},
{"storefront":"143564,12","name":"Angola"},
{"storefront":"143524,12","name":"Armenia"},
{"storefront":"143568,12","name":"Azerbaijan"},
{"storefront":"143559,12","name":"Bahrain"},
{"storefront":"143525,12","name":"Botswana"},
{"storefront":"143516,12","name":"Egypt"},
{"storefront":"143573,12","name":"Ghana"},
{"storefront":"143467,12","name":"India"},
{"storefront":"143491,12","name":"Israel"},
{"storefront":"143528,12","name":"Jordan"},
{"storefront":"143529,12","name":"Kenya"},
{"storefront":"143493,12","name":"Kuwait"},
{"storefront":"143497,12","name":"Lebanon"},
{"storefront":"143531,12","name":"Madagascar"},
{"storefront":"143532,12","name":"Mali"},
{"storefront":"143533,12","name":"Mauritius"},
{"storefront":"143534,12","name":"Niger"},
{"storefront":"143561,12","name":"Nigeria"},
{"storefront":"143562,12","name":"Oman"},
{"storefront":"143498,12","name":"Qatar"},
{"storefront":"143479,12","name":"Saudi Arabia"},
{"storefront":"143535,12","name":"Senegal"},
{"storefront":"143472,12","name":"South Africa"},
{"storefront":"143572,12","name":"Tanzania"},
{"storefront":"143536,12","name":"Tunisia"},
{"storefront":"143481,12","name":"UAE"},
{"storefront":"143537,12","name":"Uganda"},
{"storefront":"143571,12","name":"Yemen"},
{"storefront":"143460,12","name":"Australia"},
{"storefront":"143560,12","name":"Brunei Darussalam"},
{"storefront":"143465-2,12","name":"China"},
{"storefront":"143463,12","name":"Hong Kong"},
{"storefront":"143476,12","name":"Indonesia"},
{"storefront":"143462-1,12","name":"Japan"},
{"storefront":"143517,12","name":"Kazakhstan"},
{"storefront":"143515,12","name":"Macau"},
{"storefront":"143473,12","name":"Malaysia"},
{"storefront":"143461,12","name":"New Zealand"},
{"storefront":"143477,12","name":"Pakistan"},
{"storefront":"143474,12","name":"Philippines"},
{"storefront":"143464,12","name":"Singapore"},
{"storefront":"143486,12","name":"Sri Lanka"},
{"storefront":"143470,12","name":"Taiwan"},
{"storefront":"143475,12","name":"Thailand"},
{"storefront":"143566,12","name":"Uzbekistan"},
{"storefront":"143471,12","name":"Vietnam"},
{"storefront":"143466,12","name":"대한민국"},
{"storefront":"143538,12","name":"Anguilla"},
{"storefront":"143540,12","name":"Antigua and Barbuda"},
{"storefront":"143505-2,12","name":"Argentina"},
{"storefront":"143539,12","name":"Bahamas"},
{"storefront":"143541,12","name":"Barbados"},
{"storefront":"143555-2,12","name":"Belize"},
{"storefront":"143542,12","name":"Bermuda"},
{"storefront":"143556-2,12","name":"Bolivia"},
{"storefront":"143503,12","name":"Brasil"},
{"storefront":"143543,12","name":"British Virgin Islands"},
{"storefront":"143544,12","name":"Cayman Islands"},
{"storefront":"143483-2,12","name":"Chile"},
{"storefront":"143501-2,12","name":"Colombia"},
{"storefront":"143495-2,12","name":"Costa Rica"},
{"storefront":"143545,12","name":"Dominica"},
{"storefront":"143508-2,12","name":"Dominican Republic"},
{"storefront":"143509-2,12","name":"Ecuador"},
{"storefront":"143506-2,12","name":"El Salvador"},
{"storefront":"143546,12","name":"Grenada"},
{"storefront":"143504-2,12","name":"Guatemala"},
{"storefront":"143553,12","name":"Guyana"},
{"storefront":"143510-2,12","name":"Honduras"},
{"storefront":"143511,12","name":"Jamaica"},
{"storefront":"143468,12","name":"México"},
{"storefront":"143547,12","name":"Montserrat"},
{"storefront":"143512-2,12","name":"Nicaragua"},
{"storefront":"143485-2,12","name":"Panama"},
{"storefront":"143513-2,12","name":"Paraguay"},
{"storefront":"143507-2,12","name":"Peru"},
{"storefront":"143548,12","name":"St. Kitts and Nevis"},
{"storefront":"143549,12","name":"St. Lucia"},
{"storefront":"143550,12","name":"St. Vincent & The Grenadines"},
{"storefront":"143554-2,12","name":"Suriname"},
{"storefront":"143551,12","name":"Trinidad and Tobago"},
{"storefront":"143552,12","name":"Turks & Caicos"},
{"storefront":"143514-2,12","name":"Uruguay"},
{"storefront":"143502-2,12","name":"Venezuela"}
]');
// US reviews only
//$countries = json_decode('[{"storefront":"143441-1,12","name":"United States"}]');
// json_decode() returns NULL on malformed JSON; stop here rather than silently scraping nothing
if (!is_array($countries)) {
    throw new RuntimeException('The $countries JSON could not be decoded; check it for syntax errors.');
}
// Write the results to a CSV file named after the $app_id (created in the current working
// directory; an existing file with the same name is truncated)
$csv_path = $app_id . '-reviews.csv';
$fp = fopen($csv_path, 'w');
// fopen() returns false when the file cannot be created (e.g. directory not writable);
// stop here instead of handing an invalid handle to fputcsv()/fclose() later on
if ($fp === false) {
    throw new RuntimeException("Unable to open {$csv_path} for writing.");
}
// Add in column names to the CSV file. The position of each name matches the numeric
// index under which the corresponding value is stored for every review row (0 = Date ... 11 = Country).
$column_names = array('Date', 'Version', 'Rating', 'Review Title', 'Review', 'Helpful Percent', 'Helpful Votes', 'Total Votes', 'Username', 'User Page', 'Review ID', 'Country');
fputcsv($fp, $column_names);
// Crawl the review pages: for every page (starting with the most recent reviews) and every
// storefront, download the iTunes review listing, pull the individual fields out of the HTML
// with XPath and append one CSV row per review.
//
// CSV column indexes used below:
//   0 Date, 1 Version, 2 Rating, 3 Review Title, 4 Review, 5 Helpful Percent,
//   6 Helpful Votes, 7 Total Votes, 8 Username, 9 User Page, 10 Review ID, 11 Country

// Template with one empty value per CSV column; used to pad incomplete rows so that a missing
// field leaves a blank cell instead of shifting every following value one column to the left
$empty_row = array_fill(0, 12, '');

for ($page = 1; $page <= $total_number_of_review_pages; $page++) {
    // Loop through each of the countries
    foreach ($countries as $country) {
        // Start every page/country request with an empty result set. Without this reset the rows
        // of earlier requests stay in the array and are written to the CSV again and again.
        $results = array();

        $ch = curl_init();
        // Grab app reviews sorted by most recent and specify the specific page
        // (HTTPS is required, the plain HTTP URL answers with a 301 redirect)
        curl_setopt($ch, CURLOPT_URL, "https://itunes.apple.com/WebObjects/MZStore.woa/wa/customerReviews?displayable-kind=11&id={$app_id}&page={$page}&sort=4");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Keep TLS certificate verification switched on so the response cannot be tampered with in transit
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        // Set the user agent to iTunes in order to get the full results;
        // the storefront header selects the country whose reviews are returned
        curl_setopt($ch, CURLOPT_HTTPHEADER, array(
            'User-Agent: iTunes/11.0.2 (Macintosh; Intel Mac OS X 10.8.3) AppleWebKit/536.28.10',
            "X-Apple-Store-Front: {$country->storefront}",
            'X-Apple-Tz: -18000',
            'Accept-Language: en-us, en;q=0.50',
        ));
        $body = curl_exec($ch);
        $curl_error = curl_error($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; an empty body cannot be parsed either.
        // Report the problem and carry on with the next country instead of aborting the crawl.
        if (!is_string($body) || trim($body) === '') {
            trigger_error("No review data received for {$country->name} (page {$page}): {$curl_error}", E_USER_WARNING);
            continue;
        }

        // Real-world HTML is rarely well-formed; collect libxml's complaints internally
        // (and discard them) rather than muting the call with the @ operator
        $previous_libxml_mode = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $dom->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_libxml_mode);

        // Set the XPath selectors for the titles, ratings, username & link, Version + Date, Actual review, and how many people found it helpful
        $xpath = new DOMXPath($dom);
        $review_titles = $xpath->query('//html/body/div[@class="customer-reviews"]/div[5]/div[@class="paginated-container"]/div/div/h5/span');
        $ratings = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/h5/div/@aria-label");
        $user_links = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/span/a/@href");
        $users_versions_dates = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/span");
        $review_bodies = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/p[1]");
        $helpfulness_ratings = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/p[2]/span[1]");
        $review_id_urls = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/div/@report-a-concern-fragment-url");

        // DEBUGGING: Output the full html from iTunes to confirm that the data is correct and matches what you expect from iTunes
        // Write the output to an *.html file and confirm its appearance in a web browser.
        //echo $body;

        // Convert the node lists into the results array. Every list is walked in document order
        // and its n-th entry is stored in row n, i.e. the fields of one review are matched purely
        // by their position within each list.

        // Review title
        $i = 1;
        foreach ($review_titles as $review_title) {
            $results[$i][3] = $review_title->nodeValue;
            $i++;
        }

        // Star rating
        $i = 1;
        foreach ($ratings as $rating) {
            // Just keep the first character of the aria-label, which is the star rating;
            // substr() copes with an empty label where a plain [0] offset would raise a warning
            $results[$i][2] = (string) substr($rating->nodeValue, 0, 1);
            $i++;
        }

        // User, version number and date
        $i = 1;
        foreach ($users_versions_dates as $user_version_date) {
            // The span's text is split into lines; user, version and date are expected on
            // lines 4, 8 and 11. A line that is absent yields an empty cell.
            $split_user_version_date = explode("\n", $user_version_date->nodeValue);
            $results[$i][8] = isset($split_user_version_date[4]) ? trim($split_user_version_date[4]) : ''; // User
            $results[$i][1] = isset($split_user_version_date[8]) ? trim($split_user_version_date[8]) : ''; // Version Number
            $results[$i][0] = isset($split_user_version_date[11]) ? trim($split_user_version_date[11]) : ''; // Date
            $i++;
        }

        // The text of the actual review
        $i = 1;
        foreach ($review_bodies as $review_body) {
            // Remove white space and character return at beginning and end
            $results[$i][4] = trim($review_body->nodeValue);
            $i++;
        }

        // How many people found that the review to be helpful or not helpful
        $i = 1;
        foreach ($helpfulness_ratings as $helpfulness_rating) {
            $helpfulness_text = $helpfulness_rating->nodeValue;
            // If it starts with "Was this review helpful?" then don't include since their review hasn't been evaluated by other users yet.
            // An empty text carries no votes either.
            // NOTE: This initial letter of 'W' works for English, but for Spanish, it'd be "E" for Esta
            // TODO: This logic could be improved.
            if ($helpfulness_text === '' || $helpfulness_text[0] == "W") {
                $results[$i][5] = "";
                $results[$i][6] = "";
                $results[$i][7] = "";
            } else {
                // The first line starts with the helpful votes, the second line with the total votes
                $split_helpfulness_rating = explode("\n", $helpfulness_text);
                $helpful = explode(" ", $split_helpfulness_rating[0]);
                $total_votes = explode(" ", isset($split_helpfulness_rating[1]) ? trim($split_helpfulness_rating[1]) : '');
                if (intval($total_votes[0]) == 0) { // prevent division by zero
                    $results[$i][5] = 0;
                } else {
                    // Share of helpful votes as a fraction between 0 and 1
                    $results[$i][5] = intval($helpful[0]) / intval($total_votes[0]);
                }
                $results[$i][6] = $helpful[0]; // Number of Helpful Votes
                $results[$i][7] = $total_votes[0]; // Number of Total Votes
            }
            $i++;
        }

        // Link to the user's page of reviews
        $i = 1;
        foreach ($user_links as $user_link) {
            $results[$i][9] = $user_link->nodeValue;
            $i++;
        }

        // Review ID: the part after the first "=" of the report-a-concern URL
        $i = 1;
        foreach ($review_id_urls as $review_id_url) {
            $review_id = explode("=", $review_id_url->nodeValue);
            $results[$i][10] = isset($review_id[1]) ? $review_id[1] : '';
            $i++;
        }

        // DEBUGGING: Check to see that the information in the array is displayed properly
        //print_r($results);

        // Write each of the rows to the CSV file
        foreach ($results as $fields) {
            // Add country name in the last column; all rows of this iteration belong to $country
            $fields[11] = $country->name;
            // Fill in blanks for fields that were not found, then order by column index
            // so that the values line up with the header row
            $fields += $empty_row;
            ksort($fields);
            fputcsv($fp, $fields);
        }
    }
}
fclose($fp);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment