Created
December 29, 2019 21:31
-
-
Save imvladikon/982653b574f9bd3644a2aace84874311 to your computer and use it in GitHub Desktop.
Get genre information of apps from the android market
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import click | |
import logging | |
from pathlib import Path | |
from dotenv import find_dotenv, load_dotenv | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
import os.path | |
import re | |
import aiohttp | |
import asyncio | |
async def _collect_categories(app_ids, output_filepath):
    """Fetch the category of every id in *app_ids* and append the rows.

    Each result is written to *output_filepath* as ``app_id;category``.
    """
    # Append rather than truncate: rows written by a previous run are the
    # reason those ids were skipped, so they must stay in the file.
    with open(output_filepath, "a") as ofile:
        for app_id in app_ids:
            category = await get_cat_by_app(app_id)
            ofile.write(app_id + ";" + category + "\n")
            # Flush per row so an interrupted run keeps its progress.
            ofile.flush()


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    # NOTE: click invokes its callback synchronously and does not await
    # coroutines, so the callback is a plain function that drives the async
    # work itself.
    logger = logging.getLogger(__name__)
    logger.info('making data from json app_ids')

    with open(input_filepath) as ifile:
        app_ids = set(line.strip() for line in ifile)
    # A blank line would otherwise be looked up as an empty app id.
    app_ids.discard("")

    # Resume support: ids already present in the output file are not
    # fetched again.
    if os.path.isfile(output_filepath):
        with open(output_filepath) as done_file:
            app_ids_o = set(line.strip().split(";")[0] for line in done_file)
        app_ids = app_ids.difference(app_ids_o)

    # Sorted only to make the fetch/write order reproducible between runs.
    asyncio.run(_collect_categories(sorted(app_ids), output_filepath))

    processing_file(output_filepath, "../../data/processed/" + os.path.basename(output_filepath))
def processing_file(input_filepath, output_filepath):
    """Copy the unique lines of *input_filepath* to *output_filepath*.

    Lines are stripped and de-duplicated, keeping first-seen order. A row
    whose first ``;``-separated field contains ``"game"`` and whose second
    field is ``"UNKNOWN"`` is rewritten as ``<first field>;GAME_UNKNOWN``.
    Rows without a ``;`` are copied through unchanged.

    The input is read completely before the output is opened, so both
    arguments may name the same file.
    """
    with open(input_filepath) as old_file:
        # dict.fromkeys de-duplicates like a set but keeps a stable order.
        unique_lines = list(dict.fromkeys(line.strip() for line in old_file))

    with open(output_filepath, 'w') as new_file:
        for line in unique_lines:
            fields = line.split(";")
            # The length guard avoids an IndexError on rows with no category.
            if len(fields) > 1 and "game" in fields[0] and fields[1] == "UNKNOWN":
                line = fields[0] + ";" + "GAME_UNKNOWN"
            new_file.write(line + "\n")
# Correct, but too slow for bulk use: it parses the whole page with
# BeautifulSoup. Kept as a reference implementation.
def _scrape_category_blocking(app_id) -> str:
    """Blocking worker: download the app's store page and extract its category."""
    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
        app_id)
    # Without a timeout a stalled connection would hang forever.
    page = requests.get(url, timeout=30)
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, 'html.parser')
    category = "UNKNOWN"
    # First link on the page that points at a category listing.
    tag = soup.select_one("a[href*='/store/apps/category/']")
    if tag and tag.get("href"):
        # Strip the optional host and the path prefix from the href.
        category = tag["href"].replace(
            "https://play.google.com", "").replace("/store/apps/category/", "")
    return category


async def get_category_by_app(app_id) -> str:
    """Return the store category of *app_id*, or ``"UNKNOWN"`` if none is found.

    The download and HTML parsing are blocking, so they run in the default
    executor instead of stalling the event loop.

    Raises whatever ``requests.get`` raises on network failure or timeout.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _scrape_category_blocking, app_id)
# Matches the genre link of a store details page and captures the category
# name. Raw string: "\w" in a normal string literal is an invalid escape.
_GENRE_LINK_RE = re.compile(
    r'itemprop="genre" href="/store/apps/category/(\w+)', re.IGNORECASE)


async def get_cat_by_app(app_id) -> str:
    """Return the store category of *app_id*, or ``"UNKNOWN"`` if none is found.

    Downloads the app's details page through ``fetch`` and searches the raw
    HTML for the genre link with a regular expression.
    """
    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
        app_id)
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
    match = _GENRE_LINK_RE.search(html)
    return match.group(1) if match else "UNKNOWN"
async def fetch(session, url):
    """Issue a GET for *url* on *session* and return the response body as text."""
    request = session.get(url)
    async with request as reply:
        body = await reply.text()
    return body
if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    # In click's default standalone mode, main() calls sys.exit() itself, so
    # a coroutine returned by the callback would never reach an event loop.
    # With standalone_mode=False click returns the callback's result and
    # leaves error reporting to the caller.
    try:
        result = main(standalone_mode=False)
    except click.ClickException as exc:
        # Reproduce what standalone mode does for usage errors.
        exc.show()
        raise SystemExit(exc.exit_code)
    except click.Abort:
        raise SystemExit(1)

    # Only drive an event loop when the command handed back a coroutine.
    if asyncio.iscoroutine(result):
        asyncio.run(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment