Skip to content

Instantly share code, notes, and snippets.

@mandrean
Forked from kentbye/app-store-ratings.php
Last active September 1, 2015 13:35
Show Gist options
  • Save mandrean/995cf251464c49a27a4d to your computer and use it in GitHub Desktop.
Save mandrean/995cf251464c49a27a4d to your computer and use it in GitHub Desktop.
Scrape iOS app review data from iTunes
<?php
/**
 * Scrape app reviews from iTunes.
 *
 * Set the iOS app id below and the script creates a {$app_id}-reviews.csv file
 * in the current working directory. All pages of reviews are scraped
 * automatically.
 *
 * @author Sebastian Mandrean <sebastian.mandrean@gmail.com>
 * Modified and extended from Kent Bye's gist at https://gist.github.com/kentbye/3740357
 * Original gist by Sean Murphy at https://gist.github.com/1878352
 */

// Id of the app to scrape. Right click on the app icon in iTunes to copy its link;
// it should look something like http://itunes.apple.com/us/app/netflix/id363590051?mt=8
$app_id = '363590051'; // Netflix app

// Accumulator for the parsed review rows.
$results = array();

// Storefronts to query; each entry carries the X-Apple-Store-Front header value
// and a human readable country name. Only US reviews are fetched by default.
$countries = json_decode('[{"storefront":"143441-1,12","name":"United States"}]');
// For more countries, uncomment and add more country codes from this list: https://gist.github.com/1878352
// $countries = json_decode('[{"storefront":"143441-1,12","name":"United States"},{"storefront":"143455-6,12","name":"Canada"}]');

// Write the results to a CSV file named after the $app_id.
$csv_path = $app_id . '-reviews.csv';
$fp = fopen($csv_path, 'w');
if ($fp === false) {
    // Fail loudly instead of passing `false` to every later fputcsv() call.
    throw new RuntimeException("Unable to open {$csv_path} for writing");
}

// The first CSV row holds the column names; the order here defines the column
// order of every review row written afterwards.
$column_names = array(
    'Date',
    'Version',
    'Rating',
    'Review Title',
    'Review',
    'Helpful Percent',
    'Helpful Votes',
    'Total Votes',
    'Username',
    'User Page',
    'Review ID',
    'Country',
);
fputcsv($fp, $column_names);
// Crawl the reviews of every configured storefront, one page at a time starting
// with page 1, and append one CSV row per review to $fp. Crawling of a
// storefront stops when iTunes reports that there are no more reviews, when a
// page yields no reviews at all, or when the request fails.

/**
 * Download one page of customer reviews for an app from a given storefront.
 *
 * @param string $app_id     iTunes id of the app.
 * @param int    $page       Page number to request (the crawl starts at 1).
 * @param string $storefront Value sent in the X-Apple-Store-Front header.
 * @return string|false The response body, or false when the request failed.
 */
function itunes_fetch_reviews_page($app_id, $page, $storefront)
{
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('Unable to initialise cURL', E_USER_WARNING);
        return false;
    }

    // An empty string makes cURL advertise every content encoding it supports.
    curl_setopt($ch, CURLOPT_ENCODING, '');
    // Grab app reviews sorted by most recent and specify the specific page.
    curl_setopt($ch, CURLOPT_URL, "https://itunes.apple.com/WebObjects/MZStore.woa/wa/customerReviews?displayable-kind=11&id={$app_id}&page={$page}&sort=4");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Keep certificate verification on: the response ends up in the CSV, so it
    // must really come from Apple.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 1);
    // Set the user agent to iTunes in order to get the full results.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'User-Agent: iTunes/10.3.1 (Macintosh; Intel Mac OS X 10.6.8) AppleWebKit/533.21.1',
        "X-Apple-Store-Front: {$storefront}",
        'X-Apple-Tz: -18000',
        'Accept-Language: en-us, en;q=0.50',
    ));

    $body = curl_exec($ch);
    if ($body === false) {
        trigger_error("Request for page {$page} failed: " . curl_error($ch), E_USER_WARNING);
    }
    curl_close($ch);

    return $body;
}

/**
 * Collect the string value of every node matched by an XPath expression.
 *
 * @param DOMXPath $xpath      XPath evaluator bound to the parsed page.
 * @param string   $expression XPath expression to evaluate.
 * @return array List of node values in document order (empty when nothing matches).
 */
function itunes_node_values(DOMXPath $xpath, $expression)
{
    $values = array();
    $nodes = $xpath->query($expression);
    if ($nodes === false) {
        return $values;
    }
    foreach ($nodes as $node) {
        $values[] = (string) $node->nodeValue;
    }

    return $values;
}

/**
 * Return the trimmed element at $index of $parts, or '' when it does not exist.
 *
 * @param array $parts Pieces produced by explode().
 * @param int   $index Offset of the wanted piece.
 * @return string
 */
function itunes_part(array $parts, $index)
{
    return isset($parts[$index]) ? trim($parts[$index]) : '';
}

/**
 * Split the "helpfulness" text of a review into percentage and vote counts.
 *
 * The first line of the text is expected to start with the number of helpful
 * votes and the second line with the total number of votes.
 *
 * @param string $text Raw text of the helpfulness element.
 * @return array Three values: helpful percent, helpful votes, total votes.
 *               All three are '' when the review has not been voted on yet.
 */
function itunes_parse_helpfulness($text)
{
    // If it starts with "Was this review helpful?" nobody has voted on it yet.
    // NOTE: The initial letter 'W' works for English, but for Spanish it would be
    // "E" for "Esta".
    // TODO: This logic could be improved.
    if ($text === '' || $text[0] === 'W') {
        return array('', '', '');
    }

    $lines = explode("\n", $text);
    $helpful_words = explode(' ', $lines[0]);
    $total_words = explode(' ', itunes_part($lines, 1));
    $helpful_votes = $helpful_words[0];
    $total_votes = $total_words[0];

    // Avoid a division by zero when the total is missing or not numeric.
    $total = intval($total_votes);
    $percent = ($total !== 0) ? intval($helpful_votes) / $total : '';

    return array($percent, $helpful_votes, $total_votes);
}

/**
 * Parse one page of reviews into CSV rows.
 *
 * Every row has exactly twelve cells, in the order of the CSV header: Date,
 * Version, Rating, Review Title, Review, Helpful Percent, Helpful Votes,
 * Total Votes, Username, User Page, Review ID, Country. Cells whose data is
 * missing from the page are left empty so the columns never shift.
 *
 * @param string $body         HTML of the reviews page, assumed to be UTF-8.
 * @param string $country_name Name written to the Country column.
 * @return array 'done' (bool) is true when the page carries the "has-no-reviews"
 *               marker; 'rows' is the list of parsed rows.
 */
function itunes_parse_reviews_page($body, $country_name)
{
    // Collect parser diagnostics quietly instead of silencing everything with @.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // The XML declaration makes libxml decode the markup as UTF-8, so characters
    // outside ISO-8859-1 (curly quotes, emoji, ...) survive.
    $dom->loadHTML('<?xml encoding="UTF-8">' . $body);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $xpath = new DOMXPath($dom);

    // Stop if there are no more pages of reviews to scrape.
    $has_no_reviews = $xpath->query('//html/body/div[contains(@class,"customer-reviews")]/div[contains(@class,"all-reviews")]/div[contains(@class,"write-a-review")]/div[contains(@class,"has-no-reviews")]');
    if ($has_no_reviews !== false && $has_no_reviews->length != 0) {
        return array('done' => true, 'rows' => array());
    }

    // Each list holds one entry per review, in page order; the n-th entry of
    // every list is taken to belong to the n-th review.
    $titles = itunes_node_values($xpath, '//html/body/div[contains(@class,"customer-reviews")]/div[5]/div[contains(@class,"paginated-container")]/div/div/h5/span');
    $ratings = itunes_node_values($xpath, "/html/body/div[2]/div[5]/div[3]/div/div/h5/div/@aria-label");
    $user_links = itunes_node_values($xpath, "/html/body/div[2]/div[5]/div[3]/div/div/span/a/@href");
    $bylines = itunes_node_values($xpath, "/html/body/div[2]/div[5]/div[3]/div/div/span");
    $review_bodies = itunes_node_values($xpath, "/html/body/div[2]/div[5]/div[3]/div/div/p[1]");
    $helpfulness = itunes_node_values($xpath, "/html/body/div[2]/div[5]/div[3]/div/div/p[2]/span[1]");
    $review_id_urls = itunes_node_values($xpath, "/html/body/div[2]/div[5]/div[3]/div/div/div/@report-a-concern-fragment-url");

    $row_count = max(
        count($titles),
        count($ratings),
        count($user_links),
        count($bylines),
        count($review_bodies),
        count($helpfulness),
        count($review_id_urls)
    );

    $rows = array();
    for ($i = 0; $i < $row_count; $i++) {
        // Start from a full row of blanks so a missing cell cannot shift columns.
        $row = array_fill(0, 12, '');

        if (isset($bylines[$i])) {
            // The byline text spans several lines holding user, version and date.
            $byline = explode("\n", $bylines[$i]);
            $row[0] = itunes_part($byline, 11); // Date
            $row[1] = itunes_part($byline, 8);  // Version number
            $row[8] = itunes_part($byline, 4);  // User
        }
        if (isset($ratings[$i])) {
            // Just keep the first character, which is the star rating.
            $row[2] = (string) substr($ratings[$i], 0, 1);
        }
        if (isset($titles[$i])) {
            $row[3] = $titles[$i];
        }
        if (isset($review_bodies[$i])) {
            // Remove white space and line breaks around the review text.
            $row[4] = trim($review_bodies[$i]);
        }
        if (isset($helpfulness[$i])) {
            $votes = itunes_parse_helpfulness($helpfulness[$i]);
            $row[5] = $votes[0]; // Helpful percent
            $row[6] = $votes[1]; // Helpful votes
            $row[7] = $votes[2]; // Total votes
        }
        if (isset($user_links[$i])) {
            // Link to the user's page of reviews.
            $row[9] = $user_links[$i];
        }
        if (isset($review_id_urls[$i])) {
            // The review id is the piece after the first "=" of the URL.
            $url_parts = explode('=', $review_id_urls[$i]);
            $row[10] = isset($url_parts[1]) ? $url_parts[1] : '';
        }
        $row[11] = $country_name;

        $rows[] = $row;
    }

    return array('done' => false, 'rows' => $rows);
}

// Each country is crawled to its own end, so a storefront that runs out of
// reviews early does not cut the others short.
foreach ($countries as $country) {
    for ($page = 1; ; $page++) {
        $body = itunes_fetch_reviews_page($app_id, $page, $country->storefront);
        if ($body === false || trim($body) === '') {
            // Request failed or returned nothing: give up on this storefront
            // rather than requesting pages forever.
            break;
        }

        // DEBUGGING: write $body to an *.html file and open it in a browser to
        // confirm that the data matches what you expect from iTunes.
        $parsed = itunes_parse_reviews_page($body, $country->name);

        // An explicit "no reviews" marker or a page without any review both mean
        // that there is nothing left to scrape.
        if ($parsed['done'] || count($parsed['rows']) === 0) {
            break;
        }

        // Write each of the rows to the CSV file.
        foreach ($parsed['rows'] as $row) {
            fputcsv($fp, $row);
        }
    }
}
fclose($fp);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment