# Source: GitHub gist spicyramen/826e8bce134f2b4c5e18023ff9d01425
# (author @spicyramen, last active April 10, 2018).
"""Extract important information from AppAnnie via API."""
import pandas as pd
from absl import app
from absl import flags
from absl import logging
from bs4 import BeautifulSoup as BS
from collections import namedtuple
from retrying import retry
from datetime import datetime, timedelta
import csv
import json
import os
import requests
import re
import urllib
# --- Credentials and store identifiers ---------------------------------------
# Read at import time; os.environ[...] raises KeyError if the variable is unset.
API_KEY = os.environ['APPANNIE_API_KEY']
# Store slugs used when building API URLs, and the labels compared against the
# 'store' column in ProcessDataSet.
GOOGLE_PLAY = 'google-play'
GOOGLE_PLAY_STORE = 'Google Play'
IOS = 'ios'
IOS_STORE = 'ios'
# "Accept-Encoding": "gzip, deflate"
# Headers sent with every request made in this module.
HEADERS = {'Authorization': 'Bearer %s' % API_KEY, 'Accept-Encoding': 'gzip'}
# Value passed as the `vertical` URL segment by main().
_APPS = 'apps'
# Stores accepted by GetCategories.
_STORES = ('ios', 'google-play')
# --- JSON keys read from API responses ---------------------------------------
_RANK = 'rank'
_PRODUCT = 'product'
_PRODUCT_CODE = 'product_code'
_PRODUCT_NAME = 'product_name'
_PRODUCT_ID = 'product_id'
_PARENT_COMPANY_NAME = 'parent_company_name'
_PRIVACY_POLICY_URL = 'privacy_policy_url'
_COMPANY_NAME = 'company_name'
_DESCRIPTION = 'description'
_COMPANY_URL = 'company_url'
_SUPPORT_URL = 'support_url'
# Store slug -> device name. Not referenced elsewhere in the visible code.
_DEVICE = {'ios': 'iphone', 'google-play': 'android'}
# Not referenced elsewhere in the visible code (main() uses GetCountries()).
COUNTRY_LIST = ('US',)
# --- Endpoint templates ------------------------------------------------------
# ios | mac | appletv | google-play | amazon-appstore | windows-phone |
# windows-store
# Example URL: https://api.appannie.com/v1.2/apps/ios/app/553834731/details
# Placeholders: (api_version, store, app_id).
APP_DETAILS_URL = 'https://api.appannie.com/%s/apps/%s/app/%s/details'
# Placeholders: (api_version, vertical, store).
TOP_APPS_URL = 'https://api.appannie.com/%s/intelligence/%s/%s/ranking'
# Placeholder: (api_version,).
COUNTRIES_URL = 'https://api.appannie.com/%s/meta/countries'
API_VERSION = 'v1.2'
# Granularity passed to GetTopApps by main().
_GRANULARITY = 'weekly'
# Dataset paths. Not referenced elsewhere in the visible code.
FILENAME = 'data/top_ios_downloads.csv'
COMPANIES = 'data/capitalg_android.csv'
# --- HTTP and retry tuning ---------------------------------------------------
# Wait this long for outgoing HTTP connections to be established.
_CONNECT_TIMEOUT_SECONDS = 90
# Wait this long to read from an HTTP socket.
_READ_TIMEOUT_SECONDS = 120
# Passed to @retry as stop_max_delay (milliseconds).
_INITIAL_RETRY_INTERVAL_MS = 3000
# Passed to @retry as stop_max_attempt_number.
_RETRY_TIMES = 2
# Command-line flags (absl). Only `store` and `device` are read by the code
# visible in this file; `filename` and `max_apps` are defined but unused here.
FLAGS = flags.FLAGS
flags.DEFINE_string('filename', '', 'Dataset')
flags.DEFINE_integer('max_apps', 1000, 'Max number of apps to search',
lower_bound=1, upper_bound=1000)
# `store` is validated against _STORES by GetCategories.
flags.DEFINE_string('store', 'ios', 'ios or google-play')
flags.DEFINE_string('device', 'iphone', 'Device: android, ipad, iphone')
# Patterns applied to app descriptions by _ExtractContactInformation, which
# unpacks the results positionally: index 0 -> e-mail-like tokens, index 1 ->
# http/https URLs. The order and the length (2) must therefore not change.
# NOTE(review): inside the URL pattern, `$-_` within the character class is a
# range (from '$' to '_'), not three literal characters - confirm intended.
CONTACT_REGEXES = [re.compile(r'[\w\.-]+@[\w\.-]+'),
re.compile(
r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),'
r']|(?:%[0-9a-fA-F][0-9a-fA-F]))+')]
# Lightweight record built for each ranking entry in _HandleTopAppsResponse.
Application = namedtuple('Application', 'rank name product_id url')
class MobileApplication(object):
    """A mobile app record loaded from a dataset row.

    Instances are created with only an app id; the remaining fields are
    assigned afterwards by the caller (see ProcessDataSet).
    """

    def __init__(self, app_id):
        """Create a record for `app_id` with every other field unset."""
        self.app_id = app_id
        self._store = None
        self.app_name_unified = None
        self.parent_company_name = None
        self.publisher_name_raw = None
        self.publisher_name = None
        self.company_name = None
        self.company_url = None
        self.support_url = None
        self.urls = []
        self.total_downloads = 0
        # Kept for backward compatibility only: `url` is always computed.
        self._url = None

    @property
    def url(self):
        """App details endpoint built from API version, store and app id."""
        return APP_DETAILS_URL % (API_VERSION, self.store, self.app_id)

    @property
    def store(self):
        """Store slug assigned to this app, or None if not set."""
        return self._store

    @store.setter
    def store(self, store):
        self._store = store

    def _summary_fields(self):
        """Values shared by the byte-string and unicode representations."""
        return (self.app_id, self.app_name_unified, self.parent_company_name,
                self.total_downloads)

    def __str__(self):
        return '%s %s %s %d' % self._summary_fields()

    def __unicode__(self):
        return u'%s %s %s %d' % self._summary_fields()

    @property
    def details(self):
        """One-line dump of the descriptive fields, for logging."""
        # The trailing 'url:' label reports the `urls` list, as it always has.
        return 'app_id: %s store: %s app_name_unified: %s ' \
               'parent_company_name: %s publisher_name_raw: %s ' \
               'publisher_name: %s company_name: %s company_url: %s ' \
               'support_url: %s url: %s' % (
                   self.app_id, self._store, self.app_name_unified,
                   self.parent_company_name, self.publisher_name_raw,
                   self.publisher_name, self.company_name, self.company_url,
                   self.support_url, self.urls)
def GetCountries():
    """Request the country metadata endpoint and parse the response.

    Returns:
      The result of _HandleCountriesResponse for a successful response, or
      None when the server answers with an HTTP error status.
    """
    countries_url = COUNTRIES_URL % API_VERSION
    logging.info('Get Countries: %s', countries_url)
    session = requests.Session()
    try:
        response = session.get(
            countries_url,
            timeout=(_CONNECT_TIMEOUT_SECONDS, _READ_TIMEOUT_SECONDS),
            headers=HEADERS,
            allow_redirects=False,
            verify=True)
        response.raise_for_status()
        return _HandleCountriesResponse(response)
    except requests.exceptions.HTTPError as err:
        status_code = err.response.status_code
        if status_code == 404:
            logging.error('Page not found %s', countries_url)
        else:
            # Previously dropped without a trace; the return value is still
            # None but the failure is now visible in the log.
            logging.error('HTTP %s while fetching %s', status_code,
                          countries_url)
        return None
    finally:
        # Release pooled connections whether or not the request succeeded.
        session.close()
def GetCategories(store):
    """Request the category metadata for a store and parse the response.

    Args:
      store: (str) One of the values in _STORES.

    Returns:
      The result of _HandleCategoriesResponse for a successful response, or
      None when the server answers with an HTTP error status.

    Raises:
      ValueError: `store` is empty or not one of _STORES.
    """
    # An empty or None store is never a member of _STORES, so one check
    # covers both conditions.
    if store not in _STORES:
        raise ValueError('Invalid market, should be ios or google-play')
    CATEGORIES_URL = 'https://api.appannie.com/%s/meta/apps/%s/categories'
    categories_url = CATEGORIES_URL % (API_VERSION, store)
    logging.info('Get Categories: %s', categories_url)
    session = requests.Session()
    try:
        response = session.get(
            categories_url,
            timeout=(_CONNECT_TIMEOUT_SECONDS, _READ_TIMEOUT_SECONDS),
            headers=HEADERS,
            allow_redirects=False,
            verify=True)
        response.raise_for_status()
        return _HandleCategoriesResponse(response)
    except requests.exceptions.HTTPError as err:
        status_code = err.response.status_code
        if status_code == 404:
            logging.error('Page not found %s', categories_url)
        else:
            # Previously dropped without a trace; now logged.
            logging.error('HTTP %s while fetching %s', status_code,
                          categories_url)
        return None
    finally:
        # Release pooled connections whether or not the request succeeded.
        session.close()
# NOTE(review): stop_max_delay bounds the total time spent retrying; the
# constant's name suggests a wait between attempts (wait_fixed) may have been
# intended - confirm against the retrying documentation before changing.
@retry(stop_max_attempt_number=_RETRY_TIMES,
       stop_max_delay=_INITIAL_RETRY_INTERVAL_MS)
def GetAppAnnieDetails(mobile_app):
    """Request the details endpoint for one app and parse the response.

    HTTP error statuses are caught here, so the retry decorator only sees
    other exceptions (for example connection failures or a ValueError from
    the response handler).

    Args:
      mobile_app: Object exposing `url` and either `product_id` (Application)
        or `app_id` (MobileApplication).

    Returns:
      The result of _HandleAppDetailsResponse, or None when the server
      answers with an HTTP error status.
    """
    # Accept both record types defined in this module for the log line.
    product_id = getattr(mobile_app, 'product_id', None)
    if product_id is None:
        product_id = getattr(mobile_app, 'app_id', None)
    details_url = mobile_app.url
    logging.info('Looking up: %s at: %s', product_id, details_url)
    session = requests.Session()
    try:
        response = session.get(
            details_url,
            timeout=(_CONNECT_TIMEOUT_SECONDS, _READ_TIMEOUT_SECONDS),
            headers=HEADERS,
            allow_redirects=False,
            verify=True)
        response.raise_for_status()
        return _HandleAppDetailsResponse(response)
    except requests.exceptions.HTTPError as err:
        status_code = err.response.status_code
        if status_code == 404:
            logging.exception('Page not found %s', details_url)
        elif status_code == 400:
            # Was previously reported with the 404 message.
            logging.exception('Bad request %s', details_url)
        else:
            logging.exception('HTTP %s while fetching %s', status_code,
                              details_url)
        return None
    finally:
        # Release pooled connections whether or not the request succeeded.
        session.close()
@retry(stop_max_attempt_number=_RETRY_TIMES,
       stop_max_delay=_INITIAL_RETRY_INTERVAL_MS)
def GetTopApps(vertical, store, country, categories, start_date, end_date,
               ranks, feeds, granularity, device):
    """Request a ranking list and parse the response.

    Example request:
      https://api.appannie.com/v1.2/intelligence/apps/google-play/ranking?
      device=android&countries=US&start_date=2018-03-01&
      end_date=2018-03-01&feeds=free&categories=OVERALL

    Args:
      vertical: URL path segment after `intelligence/` (main() passes 'apps').
      store: URL path segment naming the store.
      country: Value for the `countries` query parameter.
      categories: Value for the `categories` query parameter.
      start_date: Object with strftime(); sent as YYYY-MM-DD.
      end_date: Object with strftime(); sent as YYYY-MM-DD.
      ranks: Sent as `ranks` only when it is an int; otherwise omitted.
      feeds: Sent as `feeds` only when truthy.
      granularity: Value for the `granularity` query parameter.
      device: Value for the `device` query parameter.

    Returns:
      The result of _HandleTopAppsResponse, or None when the server answers
      with an HTTP error status.
    """
    base_url = TOP_APPS_URL % (API_VERSION, vertical, store)
    url_params = {
        'countries': country,
        'categories': categories,
        'start_date': start_date.strftime('%Y-%m-%d'),
        'end_date': end_date.strftime('%Y-%m-%d'),
        'granularity': granularity,
        'device': device,
    }
    if feeds:
        url_params['feeds'] = feeds
    # bool is a subclass of int; True/False are not meaningful rank counts.
    if isinstance(ranks, int) and not isinstance(ranks, bool):
        url_params['ranks'] = ranks
    logging.info('Looking up: %s params: %s', base_url, url_params)
    session = requests.Session()
    try:
        # requests builds the query string itself, which avoids the
        # Python-2-only urllib.urlencode.
        response = session.get(
            base_url,
            params=url_params,
            timeout=(_CONNECT_TIMEOUT_SECONDS, _READ_TIMEOUT_SECONDS),
            headers=HEADERS,
            allow_redirects=False,
            verify=True)
        response.raise_for_status()
        return _HandleTopAppsResponse(response)
    except requests.exceptions.HTTPError as err:
        status_code = err.response.status_code
        failed_url = err.response.url
        if status_code == 404:
            logging.error('Page not found %s', failed_url)
        elif status_code == 400:
            logging.error('No Apps in this category %s', failed_url)
        else:
            # Previously fell through silently after a bare traceback.
            logging.exception('HTTP %s while fetching %s', status_code,
                              failed_url)
        return None
    finally:
        # Release pooled connections whether or not the request succeeded.
        session.close()
def _ToString(value):
"""Returns a string type based on value variable type.
Since we handle multiple languages we need to return a string type to write
in file human readable character.
Args:
value: (None, str or unicode)
Returns:
A str or None if no input.
"""
if not value:
# logging.warning('No string, empty value')
return None
if isinstance(value, unicode):
return value.encode('utf-8')
else:
return str(value).encode('utf-8')
def _FetchPageContent(response):
"""Use FetchProxy to fetch the content of a URL.
Args:
response: (requests.models.Response), content we fetched in get request.
Returns:
(response) Image data in bytes as str type.
Raises:
ValueError: Invalid HTTP response.
"""
if not response:
logging.exception('HTTP Response is None')
return
return response.content
def _HandleCountriesResponse(response):
    """Extract country codes from a countries metadata response.

    Args:
      response: HTTP response whose body is a JSON object.

    Returns:
      A list with the `country_code` of every entry under `country_list`;
      an empty list when that key is missing or empty.

    Raises:
      ValueError: The response has no body.
    """
    content = _FetchPageContent(response)
    if not content:
        raise ValueError('HTTP Response is None')
    country_list = json.loads(content).get('country_list')
    if not country_list:
        # Return an empty list instead of None so callers can concatenate
        # or iterate the result without a None check.
        return []
    logging.info('Found: %d countries.', len(country_list))
    return [country['country_code'] for country in country_list]
def _HandleCategoriesResponse(response):
    """Extract categories and their labels from a categories response.

    Args:
      response: HTTP response whose body is a JSON object.

    Returns:
      A tuple (categories, category_labels) holding the values stored under
      the `categories` and `category_labels` keys; either element is None
      when its key is absent.

    Raises:
      ValueError: The response has no body.
    """
    content = _FetchPageContent(response)
    if not content:
        raise ValueError('HTTP Response is None')
    content_json = json.loads(content)
    category_list = content_json.get('categories')
    category_labels_list = content_json.get('category_labels')
    # `or ()` keeps the log lines from raising TypeError on a missing key.
    logging.info('Found: %d categories.', len(category_list or ()))
    logging.info('Found: %d category labels.',
                 len(category_labels_list or ()))
    return category_list, category_labels_list
def _HandleTopAppsResponse(response):
    """Turn a ranking response into a list of Application records.

    Args:
      response: HTTP response whose body is a JSON object with a `list` key.

    Returns:
      A list of Application namedtuples, one per entry under `list`; empty
      when the key is missing or empty.

    Raises:
      ValueError: The response has no body.
    """
    body = _FetchPageContent(response)
    if not body:
        raise ValueError('HTTP Response is None')
    ranking = []
    for entry in json.loads(body).get('list') or []:
        product_id = str(entry.get(_PRODUCT_ID))
        record = Application(
            entry.get(_RANK),
            entry.get(_PRODUCT_NAME),
            product_id,
            # The details URL uses the store given on the command line.
            APP_DETAILS_URL % (API_VERSION, FLAGS.store, product_id))
        logging.info(record)
        ranking.append(record)
    logging.info('Top Apps found: %s.', len(ranking))
    return ranking
def _HandleAppDetailsResponse(response):
    """Flatten an app details response into a row of fields.

    Args:
      response: HTTP response whose body is a JSON object with a `product`
        key.

    Returns:
      A list in this order: app_id, unified_product_name, company_name,
      parent_company_name, publisher_name, main_category, company_url,
      privacy_policy_url, email_info, url_info, support_url, description,
      size, languages, has_iap. None when the body has no `product`.

    Raises:
      ValueError: The response has no body.
    """
    content = _FetchPageContent(response)
    if not content:
        raise ValueError('HTTP Response is None')
    product = json.loads(content).get(_PRODUCT)
    if not product:
        return None
    app_id = product.get(_PRODUCT_CODE)
    company_name = _ToString(product.get(_COMPANY_NAME))
    parent_company_name = _ToString(product.get(_PARENT_COMPANY_NAME))
    main_category = _ToString(product.get('main_category'))
    company_url = _ToString(product.get(_COMPANY_URL))
    privacy_policy_url = _ToString(product.get(_PRIVACY_POLICY_URL))
    description = _ToString(product.get(_DESCRIPTION))
    if description:
        # Strip line breaks and commas from the free text. _ToString returns
        # None for a missing description, which used to raise AttributeError
        # on the first .replace().
        description = description.replace('\r', '').replace(
            '\n', '').replace(',', '')
    email_info, url_info = _ExtractContactInformation(description)
    support_url = _ToString(product.get(_SUPPORT_URL))
    size = _ToString(product.get('size'))
    languages = _ToString(product.get('languages'))
    publisher_name = _ToString(product.get('publisher_name'))
    unified_product_name = _ToString(product.get('unified_product_name'))
    has_iap = product.get('has_iap')
    return [app_id, unified_product_name, company_name,
            parent_company_name, publisher_name, main_category,
            company_url, privacy_policy_url, email_info, url_info,
            support_url, description, size, languages,
            has_iap]
def _ExtractContactInformation(description):
    """Pull e-mail addresses and URLs out of an app description.

    Args:
      description: (str) Description text, possibly containing HTML.

    Returns:
      A tuple (email_info, url_info). Each element is a comma-joined string
      of the matches for the corresponding pattern in CONTACT_REGEXES (an
      empty string when nothing matched). Both are None when `description`
      is empty.
    """
    if not description:
        logging.error('No description found')
        return None, None
    # Replace <br> with a space before stripping the markup so that text on
    # either side of a line break does not get glued together.
    plain_text = BS(description.replace('<br>', ' '),
                    'html.parser').get_text()
    matches = [','.join(pattern.findall(plain_text))
               for pattern in CONTACT_REGEXES]
    if not matches:
        return None, None
    email_info, url_info = matches
    return email_info, url_info
def ProcessDataSet(apps):
    """Build MobileApplication objects from the rows of a dataset.

    Args:
      apps: Object with iterrows() yielding (index, row) pairs (for example
        a pandas DataFrame). Each row is read by the keys app_id, store,
        app_name_unified, parent_company_name, publisher_name_raw,
        publisher_name, company_name and total_downloads.

    Returns:
      A list of MobileApplication, one per row.
    """
    # Dataset label -> store slug; rows with any other label keep store None.
    store_by_label = {GOOGLE_PLAY_STORE: GOOGLE_PLAY, IOS_STORE: IOS}
    copied_fields = ('app_name_unified', 'parent_company_name',
                     'publisher_name_raw', 'publisher_name', 'company_name',
                     'total_downloads')
    processed = []
    for _, row in apps.iterrows():
        record = MobileApplication(row['app_id'])
        store_label = row['store']
        if store_label in store_by_label:
            record.store = store_by_label[store_label]
        for field in copied_fields:
            setattr(record, field, row[field])
        processed.append(record)
    logging.info('Processed %d Applications', len(processed))
    return processed
def GetHistoricalData(start_date, end_date, granularity):
    """List the period start dates between two dates.

    Despite its name this function performs no request; it only generates
    the dates that can then be used as start_date/end_date values for the
    ranking endpoint.

    Args:
      start_date: (str) First date, formatted YYYY-MM-DD (inclusive).
      end_date: (str) Last date, formatted YYYY-MM-DD (exclusive).
      granularity: (str) 'weekly' steps 7 days, 'month' or 'monthly' steps
        30 days, anything else steps 1 day.

    Returns:
      A list of YYYY-MM-DD strings; empty when start_date >= end_date.

    Raises:
      ValueError: A date does not match the YYYY-MM-DD format.
    """
    # 'monthly' is accepted alongside the original 'month' spelling so that
    # it no longer silently falls back to a one-day step.
    days_per_period = {'weekly': 7, 'month': 30, 'monthly': 30}
    step = timedelta(days=days_per_period.get(granularity, 1))
    date_format = '%Y-%m-%d'
    current = datetime.strptime(start_date, date_format)
    stop = datetime.strptime(end_date, date_format)
    date_periods = []
    while current < stop:
        date_periods.append(current.strftime(date_format))
        current += step
    return date_periods
def LoadDataSet(filename):
    """Read a CSV file into a pandas DataFrame.

    Args:
      filename: Path of the CSV file; it is formatted with '%s' before being
        handed to pandas.

    Returns:
      The DataFrame produced by pandas.read_csv.
    """
    csv_path = '%s' % filename
    return pd.read_csv(csv_path)
def _GetDate(date_period):
"""Returns a Date object.
Args:
date_period: (str): A Date in string format 2018-01-12.
Returns:
A datetime object.
"""
year, month, day = date_period.split('-')
return datetime(int(year), int(month), int(day))
def SaveDataSet(results, filename):
    """Write rows to a CSV file, replacing any existing file.

    Args:
      results: (list) List of rows, each row a list of field values.
      filename: (str) Destination file.

    Raises:
      ValueError: Result list is empty.
      IOError: The file cannot be opened for writing (OSError on Python 3).
    """
    if not results:
        raise ValueError('Result list is empty')
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3; any other mode doubles the carriage
    # return on platforms that translate line endings.
    if str is bytes:
        csvfile = open(filename, 'wb')
    else:
        csvfile = open(filename, 'w', newline='')
    with csvfile:
        csv.writer(csvfile).writerows(results)
    logging.info('Apps stored: %d.', len(results))
def ProcessTopApps(top_apps, market, country, time_period):
    """Look up details for every top app and build one output row per app.

    Args:
      top_apps: (list) Application namedtuples to look up.
      market: (str) ios or google-play; copied into each row.
      country: (str) Country code; copied into each row.
      time_period: (str) Historical time period; copied into each row.

    Returns:
      A list of rows, each `[market, country, time_period]` followed by the
      fields returned by GetAppAnnieDetails. Apps whose lookup returns
      nothing are skipped. Empty when `top_apps` is empty.
    """
    if not top_apps:
        logging.error('No Top Apps')
        return []
    row_prefix = [market, country, time_period]
    app_details_results = []
    for top_app in top_apps:
        app_details = GetAppAnnieDetails(top_app)
        if not app_details:
            # The lookup returns None on HTTP errors or when the response has
            # no product; concatenating None to a list would raise TypeError.
            logging.warning('No details found for: %s', top_app)
            continue
        app_details_results.append(row_prefix + list(app_details))
    return app_details_results
def main(_):
    """Collect top apps and their details, then write them to a CSV file.

    For every country returned by GetCountries (plus 'WW'), every date
    period and every category, the ranking is requested and each ranked app
    is looked up for details. All rows are written to one CSV file under
    data/. Apps may repeat across countries.

    Args:
      _: Unused positional arguments supplied by absl.app.run.
    """
    # date_periods = GetHistoricalData('2018-03-18', '2018-03-24', 'weekly')
    # logging.info(date_periods)
    # The returned categories are not used (the category is hard-coded
    # below); the call is kept because it rejects an invalid --store early.
    # Not unpacking the result avoids a TypeError when it returns None.
    GetCategories(FLAGS.store)
    # GetCountries returns None on HTTP errors; fall back to 'WW' only.
    country_list = list(GetCountries() or []) + ['WW']
    category_list = ['Overall > Games > Educational']
    date_periods = ['2018-03-24']
    total_apps_count = 0
    all_apps = []
    for country in country_list:
        for date_period in date_periods:
            for category in category_list:
                logging.info('Getting Top Apps for: %s category: %s',
                             date_period, category)
                # Collect list of Applications.
                top_apps = GetTopApps(vertical=_APPS,
                                      store=FLAGS.store,
                                      country=country,
                                      categories=category,
                                      start_date=_GetDate(date_period),
                                      end_date=_GetDate(date_period),
                                      ranks=None,
                                      feeds=None,
                                      granularity=_GRANULARITY,
                                      device=FLAGS.device)
                if not top_apps:
                    continue
                # Collect Application details.
                logging.info('Top Apps found: %d in %s', len(top_apps),
                             country)
                top_apps_details = ProcessTopApps(top_apps, FLAGS.store,
                                                  country, date_period)
                total_apps_count += len(top_apps)
                if top_apps_details:
                    all_apps.extend(top_apps_details)
                else:
                    logging.warning('No Top Apps details found')
                logging.info('Total apps found so far: %d', len(all_apps))
    if all_apps:
        # The file is named after the last date period, which is what the
        # loop variable held when the loops finished.
        SaveDataSet(all_apps,
                    'data/%s_%s.csv' % (
                        'overall_games_educational_iphone',
                        date_periods[-1]))
    else:
        # SaveDataSet raises ValueError on an empty list; a run that found
        # nothing should end with a warning, not a traceback.
        logging.warning('No apps collected; nothing to save')
    logging.info('Total number of apps: %d', total_apps_count)


if __name__ == '__main__':
    app.run(main)
# End of gist.