Skip to content

Instantly share code, notes, and snippets.

@elin-moco
Created October 17, 2019 02:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save elin-moco/18cde0465ef17db7f09fbabec34ae8f1 to your computer and use it in GitHub Desktop.
Save elin-moco/18cde0465ef17db7f09fbabec34ae8f1 to your computer and use it in GitHub Desktop.
Extract Firefox Market Share Data
#!/usr/bin/env python3
"""
This script loads market data from various sources and combine them into one big file.
"""
from argparse import ArgumentParser
import os,sys
import os.path
import pandas as pd
import requests
import time
import csv
import glob
# Command-line interface. The module docstring doubles as the --help description.
# NOTE(review): in the code visible here the parsed values are only printed by
# main(); neither option changes what the script does. Also note that --rm
# defaults to the *string* "false" (truthy), not the boolean False.
parser = ArgumentParser(description=__doc__)
parser.add_argument("--date", default="", help="The base date of the data.")
parser.add_argument("--rm", default="false", help="Clean up cached files.")
def extract():
    """Download the yearly StatCounter browser-share CSV for each listed country.

    Reads ``statcounter_key_countries.csv`` from the working directory. For
    every row, column 1 names the cache file
    (``./data/<column 1>_browser_share.csv``), while column 2 and column 0 are
    substituted into the ``region_hidden`` and ``region`` query parameters of
    the StatCounter chart URL respectively.

    Files that already exist are treated as cached and are not downloaded
    again. After each download the function sleeps for 10 seconds
    (presumably to avoid hammering the server).

    Raises:
        requests.HTTPError: if StatCounter answers with an error status. The
            response is then *not* written, so a failed download is retried on
            the next run instead of being cached as bogus data.
        IndexError: if a non-empty row has fewer than three columns.
    """
    all_browser_share_url = 'https://gs.statcounter.com/chart.php?device=Desktop%20%26%20Mobile%20%26%20Tablet%20%26%20Console&device_hidden=desktop%2Bmobile%2Btablet%2Bconsole&multi-device=true&statType_hidden=browser&region_hidden={}&granularity=yearly&statType=Browser&region={}&fromInt=2015&toInt=2019&fromYear=2015&toYear=2019&csv=1'

    # The cache directory may not exist on a fresh checkout.
    os.makedirs('./data', exist_ok=True)

    with open('statcounter_key_countries.csv', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue

            fpath2 = './data/{}_browser_share.csv'.format(row[1])
            if os.path.isfile(fpath2):
                print('{} exists'.format(fpath2))
                continue

            url2 = all_browser_share_url.format(row[2], row[0])
            print(url2)
            # Without a timeout a stalled connection would hang the script forever.
            r2 = requests.get(url2, allow_redirects=True, timeout=60)
            # Fail before writing so an error page never ends up in the cache.
            r2.raise_for_status()
            with open(fpath2, 'wb') as out:
                out.write(r2.content)
            print('{} saved'.format(fpath2))
            time.sleep(10)
def transform_browser():
    """Combine the cached per-country CSVs into one Firefox share table.

    Inputs (all relative to the working directory):
      * ``./internet_users.csv`` - passed on to ``sum_market_share`` as the
        weighting table.
      * ``./statcounter_countries.csv`` - header-less, read as the columns
        ``name``, ``key``, ``code``; maps the key embedded in a cache file
        name to the column label used in the output.
      * ``./data/<key>_browser_share.csv`` - one file per region; each must
        have a ``Date`` column and a ``Firefox`` column.

    The ``Firefox`` column of every file is divided by 100 and stored under
    the region's code. Two weighted aggregate columns, ``em_key`` and
    ``em_all``, are then added via ``sum_market_share`` and the table is
    written to ``./market_share_trend.csv``.

    Raises:
        IndexError: if a cache file's key is not listed in
            ``statcounter_countries.csv``.
    """
    pop = pd.read_csv("./internet_users.csv", thousands=",")
    countries = pd.read_csv("./statcounter_countries.csv", names=["name", "key", "code"])
    em_markets = ["in", "id", "ph", "th", "vn", "my", "tw", "hk", "sg", "bd", "bn", "kh", "la"]
    em_key_markets = ["in", "id"]
    fx = "Firefox"
    suffix = "_browser_share.csv"

    output = pd.DataFrame()
    # Sorted so that column order (and which file seeds the index of the
    # initially empty frame) does not depend on filesystem listing order.
    for fpath in sorted(glob.glob('./data/*' + suffix)):
        # Strip the known suffix instead of cutting at the first underscore,
        # so keys that themselves contain "_" are recovered intact.
        region_key = os.path.basename(fpath)[:-len(suffix)]
        codes = countries.loc[countries["key"] == region_key, "code"]
        if codes.empty:
            raise IndexError(
                "region key {!r} (from {}) is not listed in "
                "statcounter_countries.csv".format(region_key, fpath)
            )
        region_code = codes.values[0]
        print(fpath)
        df = pd.read_csv(fpath, index_col="Date")
        # Source values are divided by 100 (percent -> fraction, presumably).
        output[region_code] = df[fx] / 100

    sum_market_share("em_key", em_key_markets, output, pop)
    sum_market_share("em_all", em_markets, output, pop)
    output.to_csv("./market_share_trend.csv")
def sum_market_share(key, markets, output, pop):
    """Add a weighted-average column over several market columns to ``output``.

    For each market, the column ``output[market]`` is weighted by that
    market's ``"Internet users"`` value from ``pop``; the weighted columns are
    summed and divided by the total of the weights. The result is stored in
    ``output[key]`` (replacing any existing column of that name, so repeated
    calls give the same answer) and printed.

    Args:
        key: Name of the column to create in ``output``.
        markets: Iterable of column labels in ``output``; each must also
            appear in ``pop["Country code"]``.
        output: DataFrame with one column per market; modified in place.
        pop: DataFrame with ``"Country code"`` and ``"Internet users"``
            columns. If a code appears more than once, the first row is used.

    Raises:
        ValueError: if ``markets`` is empty.
        KeyError: if a market has no column in ``output``.
        IndexError: if a market has no row in ``pop``.
    """
    weighted_sum = None
    total_users = 0
    for market in markets:
        users_rows = pop.loc[pop["Country code"] == market, "Internet users"]
        if users_rows.empty:
            raise IndexError(
                "no 'Internet users' entry for market {!r}".format(market)
            )
        users = users_rows.values[0]
        total_users += users
        contribution = output[market] * users
        # Accumulate in a local instead of in output[key]: a column left over
        # from an earlier call must not leak into the new sum.
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution

    if weighted_sum is None:
        raise ValueError("markets must contain at least one market")

    output[key] = weighted_sum / total_users
    print(output[key])
def main(argv=None):
    """Parse the command line, then run the download and transform steps.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point below relies on.
    """
    args = parser.parse_args(argv)
    print(args)
    # NOTE(review): --date and --rm are parsed and printed but not passed to
    # either step below, so they currently have no effect.
    extract()
    transform_browser()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment