|
#!/usr/bin/env python3 |
|
|
|
|
|
""" |
|
This script loads market data from various sources and combines them into one big file. |
|
""" |
|
from argparse import ArgumentParser |
|
import os,sys |
|
import os.path |
|
|
|
import pandas as pd |
|
import requests |
|
import time |
|
import csv |
|
import glob |
|
|
|
# Command-line interface; the module docstring doubles as the --help description.
parser = ArgumentParser(description=__doc__)
# Both options are plain strings: --date defaults to "" and --rm to "false".
parser.add_argument("--date", default="", help="The base date of the data.")
parser.add_argument("--rm", default="false", help="Clean up cached files.")
|
|
|
|
|
def extract():
    """Download per-region browser-share CSVs from StatCounter into ``./data``.

    Reads ``statcounter_key_countries.csv`` from the working directory. For
    each row, column 1 names the cache file
    (``./data/<column 1>_browser_share.csv``), column 2 fills the
    ``region_hidden`` query parameter and column 0 fills ``region``.

    A region whose cache file already exists is skipped. Otherwise the CSV is
    fetched, written to the cache file, and the function sleeps 10 seconds
    before moving on to the next row.

    Raises:
        requests.HTTPError: if the server answers with an error status; the
            response is then not written, so a later run retries the region.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # NOTE: the query separators before "region_hidden" and "region" must be
    # a literal "&"; they had been mangled into the single character "®"
    # ("&reg" decoded as an HTML entity), which dropped both parameters.
    all_browser_share_url = (
        'https://gs.statcounter.com/chart.php'
        '?device=Desktop%20%26%20Mobile%20%26%20Tablet%20%26%20Console'
        '&device_hidden=desktop%2Bmobile%2Btablet%2Bconsole'
        '&multi-device=true'
        '&statType_hidden=browser'
        '&region_hidden={}'
        '&granularity=yearly'
        '&statType=Browser'
        '&region={}'
        '&fromInt=2015&toInt=2019&fromYear=2015&toYear=2019'
        '&csv=1'
    )
    # The cache directory may not exist on a fresh checkout.
    os.makedirs('./data', exist_ok=True)

    with open('statcounter_key_countries.csv', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines; nothing to fetch.
                continue

            fpath = './data/{}_browser_share.csv'.format(row[1])
            if os.path.isfile(fpath):
                print('{} exists'.format(fpath))
                continue

            url = all_browser_share_url.format(row[2], row[0])
            print(url)
            response = requests.get(url, allow_redirects=True, timeout=60)
            # Fail before writing so an error page is never cached as data.
            response.raise_for_status()
            with open(fpath, 'wb') as out:
                out.write(response.content)
            print('{} saved'.format(fpath))
            # Pause between downloads (presumably to be polite to the server).
            time.sleep(10)
|
|
|
|
|
def transform_browser():
    """Merge the cached browser-share CSVs into ``./market_share_trend.csv``.

    Inputs, all relative to the working directory:

    * ``./internet_users.csv`` - passed to ``sum_market_share`` as the
      weighting table.
    * ``./statcounter_countries.csv`` - header-less, read as the columns
      ``name``, ``key``, ``code``; maps the key in a cache file name to the
      region code used as the output column name.
    * ``./data/<key>_browser_share.csv`` - one file per region, indexed by
      its ``Date`` column and containing a ``Firefox`` column.

    For every cache file the ``Firefox`` column is divided by 100
    (presumably percent -> fraction) and stored under the region code. Two
    aggregate columns, ``em_key`` and ``em_all``, are then added by
    ``sum_market_share`` and the result is written as CSV.

    Raises:
        IndexError: if a cache file's key is not listed in
            ``statcounter_countries.csv``.
    """
    suffix = '_browser_share.csv'
    pop = pd.read_csv("./internet_users.csv", thousands=",")
    countries = pd.read_csv("./statcounter_countries.csv", names=["name", "key", "code"])
    em_markets = ["in", "id", "ph", "th", "vn", "my", "tw", "hk", "sg", "bd", "bn", "kh", "la"]
    em_key_markets = ["in", "id"]
    fx = "Firefox"

    output = pd.DataFrame()
    # Sorted so that the column order of the output file is reproducible;
    # glob returns paths in arbitrary filesystem order.
    for fpath in sorted(glob.glob('./data/*' + suffix)):
        # Strip the fixed suffix rather than cutting at the first "_", so a
        # region key that itself contains an underscore is kept whole.
        region_key = os.path.basename(fpath)[:-len(suffix)]
        matches = countries.loc[countries["key"] == region_key, "code"]
        if matches.empty:
            raise IndexError(
                "region key {!r} (from {}) is not listed in "
                "./statcounter_countries.csv".format(region_key, fpath)
            )
        region_code = matches.values[0]
        print(fpath)
        shares = pd.read_csv(fpath).set_index("Date")
        output[region_code] = shares[fx] / 100

    sum_market_share("em_key", em_key_markets, output, pop)
    sum_market_share("em_all", em_markets, output, pop)

    # Let pandas write the file directly; this also avoids a local variable
    # named ``csv`` shadowing the imported csv module.
    output.to_csv("./market_share_trend.csv")
|
|
|
|
|
def sum_market_share(key, markets, output, pop):
    """Add a user-weighted average share column named ``key`` to ``output``.

    For every code in ``markets`` the column ``output[market]`` is weighted
    by that market's ``"Internet users"`` value, looked up in ``pop`` via its
    ``"Country code"`` column. The weighted columns are summed and divided by
    the total of the weights. The result is stored in ``output[key]``
    (replacing any existing column of that name) and printed.

    Args:
        key: Name of the column to create or overwrite in ``output``.
        markets: Iterable of market codes; each must be a column of
            ``output`` and appear in ``pop["Country code"]``.
        output: DataFrame holding one share column per market; modified in
            place.
        pop: DataFrame with ``"Country code"`` and ``"Internet users"``
            columns.

    Raises:
        ValueError: if ``markets`` is empty.
        KeyError: if a market has no column in ``output``.
        IndexError: if a market is missing from ``pop``.
    """
    total_users = 0
    weighted_sum = None
    for market in markets:
        users = pop.loc[pop["Country code"] == market, "Internet users"].values[0]
        total_users += users
        contribution = output[market] * users
        # Accumulate in a local so a pre-existing ``key`` column (e.g. from an
        # earlier call) is never folded into the new average.
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution

    if weighted_sum is None:
        raise ValueError("markets must contain at least one market")

    output[key] = weighted_sum / total_users
    print(output[key])
|
|
|
|
|
def main():
    """Run the pipeline: parse the command line, download, then merge.

    The parsed arguments are printed; this function does not otherwise use
    their values.
    """
    options = parser.parse_args()
    print(options)

    # Order matters: the merge step reads the files the download step caches.
    for step in (extract, transform_browser):
        step()


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()