Skip to content

Instantly share code, notes, and snippets.

@imvladikon
Created December 29, 2019 21:31
Show Gist options
  • Save imvladikon/982653b574f9bd3644a2aace84874311 to your computer and use it in GitHub Desktop.
Save imvladikon/982653b574f9bd3644a2aace84874311 to your computer and use it in GitHub Desktop.
Get genre (category) information for apps from the Android market (Google Play)
# -*- coding: utf-8 -*-
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import requests
from bs4 import BeautifulSoup
import json
import os.path
import re
import aiohttp
import asyncio
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
async def main(input_filepath, output_filepath):
    """Look up the store category of every app id in INPUT_FILEPATH.

    INPUT_FILEPATH holds one app id per line. For each id that is not
    already listed in OUTPUT_FILEPATH, one "app_id;category" line is
    appended to OUTPUT_FILEPATH, so an interrupted run can be resumed
    without losing earlier results. Afterwards a post-processed copy is
    written to ../../data/processed/ under the same file name.
    """
    # NOTE(review): this callback is a coroutine function and click does not
    # await coroutine callbacks; the caller has to obtain the coroutine
    # (e.g. main(standalone_mode=False)) and run it on an event loop.
    logger = logging.getLogger(__name__)
    logger.info('making data from json app_ids')

    # Blank lines are ignored so that no request is made for an empty id.
    with open(input_filepath) as id_file:
        app_ids = {line.strip() for line in id_file if line.strip()}

    # Ids already present in the output were resolved by an earlier run.
    if os.path.isfile(output_filepath):
        with open(output_filepath) as done_file:
            done_ids = {line.strip().split(";")[0] for line in done_file}
        app_ids -= done_ids

    # Append ("a") instead of truncating ("w"): the ids read above are skipped
    # in this run, so truncating would silently discard their categories.
    with open(output_filepath, "a") as ofile:
        for app_id in app_ids:
            category = await get_cat_by_app(app_id)
            ofile.write(app_id + ";" + category + "\n")

    processing_file(
        output_filepath,
        "../../data/processed/" + os.path.basename(output_filepath))
def processing_file(input_filepath, output_filepath):
    """Write a de-duplicated copy of an "app_id;category" file.

    Every distinct, non-empty (stripped) line of ``input_filepath`` is written
    once to ``output_filepath``, in first-seen order. A line whose app id
    contains "game" and whose category is exactly "UNKNOWN" is rewritten as
    "<app_id>;GAME_UNKNOWN". Lines without a ";" separator are copied as-is.

    Args:
        input_filepath: path of the file to read.
        output_filepath: path of the file to (over)write; may be the same
            file as ``input_filepath``.
    """
    # Read everything before the output is opened for writing, so that the
    # input is not truncated when both paths name the same file.
    with open(input_filepath) as source:
        # dict.fromkeys drops duplicates while keeping insertion order.
        unique_lines = list(dict.fromkeys(line.strip() for line in source))

    with open(output_filepath, 'w') as target:
        for line in unique_lines:
            if not line:
                continue
            fields = line.split(";")
            # The length check guards against lines that carry no category.
            if (len(fields) > 1 and "game" in fields[0]
                    and fields[1] == "UNKNOWN"):
                line = fields[0] + ";" + "GAME_UNKNOWN"
            target.write(line + "\n")
# Reference implementation that parses the page HTML; correct but slow
# (presumably kept as an alternative to get_cat_by_app).
async def get_category_by_app(app_id) -> str:
    """Return the Play Store category of ``app_id`` by parsing its page.

    The details page is downloaded with ``requests`` and the first link whose
    href contains "/store/apps/category/" is taken as the category link.

    Args:
        app_id: application id placed in the ``id`` query parameter.

    Returns:
        The part of the link's href that remains after removing the site
        prefix and "/store/apps/category/", or "UNKNOWN" when the page has no
        such link.

    Raises:
        Whatever ``requests.get`` raises for connection problems or when the
        30 second timeout expires.
    """
    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
        app_id)
    # requests is blocking: run it in the default executor so the event loop
    # stays responsive, and bound it with a timeout so a stalled connection
    # cannot hang the coroutine forever.
    loop = asyncio.get_event_loop()
    page = await loop.run_in_executor(
        None, lambda: requests.get(url, timeout=30))
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, 'html.parser')
    tag = soup.select_one("a[href*='/store/apps/category/']")
    if tag and tag.get("href"):
        return tag["href"].replace(
            "https://play.google.com", "").replace("/store/apps/category/", "")
    return "UNKNOWN"
# Captures the category name from the genre link of a details page.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
_GENRE_LINK_RE = re.compile(
    r'itemprop="genre" href="/store/apps/category/(\w+)', re.IGNORECASE)


async def get_cat_by_app(app_id, session=None) -> str:
    """Return the Play Store category of ``app_id`` using a regex search.

    The details page is downloaded through ``fetch`` and searched for the
    ``itemprop="genre"`` category link.

    Args:
        app_id: application id placed in the ``id`` query parameter.
        session: optional aiohttp client session to reuse across calls. When
            omitted, a session is opened and closed for this single request.

    Returns:
        The category name captured from the genre link, or "UNKNOWN" when
        the page contains no such link.
    """
    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
        app_id)
    if session is not None:
        html = await fetch(session, url)
    else:
        async with aiohttp.ClientSession() as own_session:
            html = await fetch(own_session, url)
    match = _GENRE_LINK_RE.search(html)
    return match.group(1) if match else "UNKNOWN"
async def fetch(session, url):
    """Request ``url`` through ``session`` and return the response body text.

    ``session.get(url)`` is used as an async context manager, and the
    response's ``text()`` coroutine is awaited while it is still open.
    """
    async with session.get(url) as reply:
        body = await reply.text()
    return body
if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    # In click's default standalone mode main() finishes with sys.exit(), so
    # the coroutine produced by an async callback would never be awaited.
    # With standalone_mode=False click returns the callback's result instead
    # and lets its own exceptions propagate, so they are reported here.
    try:
        outcome = main(standalone_mode=False)
    except click.ClickException as err:
        err.show()
        raise SystemExit(err.exit_code)
    except click.Abort:
        raise SystemExit(1)

    if asyncio.iscoroutine(outcome):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(outcome)
        finally:
            loop.close()
    elif isinstance(outcome, int):
        # e.g. --help makes click return an exit code rather than a coroutine
        raise SystemExit(outcome)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment