# imvladikon/make_dataset.py

## make_dataset.py
# -*- coding: utf-8 -*-
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import requests
from bs4 import BeautifulSoup
import json
import os.path
import re
import aiohttp
import asyncio


async def _collect_categories(app_ids, output_filepath):
    """Look up the category of every id in *app_ids* and append the rows.

    Each result is written to *output_filepath* as an ``id;category`` line.
    The file is opened in append mode so rows saved by an earlier run are
    kept rather than overwritten.
    """
    with open(output_filepath, "a") as ofile:
        for app_id in app_ids:
            category = await get_cat_by_app(app_id)
            ofile.write(app_id + ";" + category + "\n")


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

    INPUT_FILEPATH holds one app id per line.  Ids already present in
    OUTPUT_FILEPATH are skipped; for the rest the category is looked up and
    an ``id;category`` row is appended to OUTPUT_FILEPATH.  Finally
    processing_file() writes the post-processed copy to
    ../../data/processed/<name of OUTPUT_FILEPATH>.
    """
    # NOTE: this callback is deliberately synchronous.  click calls the
    # callback directly and does not await coroutines, so an ``async def``
    # here would never execute its body.

    logger = logging.getLogger(__name__)
    logger.info('making data from json app_ids')

    with open(input_filepath) as ifile:
        app_ids = {line.strip() for line in ifile}
    app_ids.discard("")  # blank lines are not app ids

    if os.path.isfile(output_filepath):
        # Resume support: skip ids whose row was already written.
        with open(output_filepath) as done_file:
            done_ids = {line.strip().split(";")[0] for line in done_file}
        app_ids -= done_ids

    # Drive the async lookups on a private loop and always close it.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_collect_categories(app_ids, output_filepath))
    finally:
        loop.close()

    processing_file(output_filepath, "../../data/processed/" + os.path.basename(output_filepath))


def processing_file(input_filepath, output_filepath):
    """Copy ``app_id;category`` rows, relabelling unknown games.

    Reads *input_filepath*, strips each line, drops duplicate lines (the
    first occurrence keeps its position) and writes the result to
    *output_filepath*.  A row whose first ``;``-separated field contains
    ``"game"`` and whose second field is exactly ``"UNKNOWN"`` is rewritten
    as ``<first field>;GAME_UNKNOWN``.  Every other row is copied unchanged,
    including rows that contain no ``;`` at all.

    The input is read completely before the output is opened, so both
    arguments may name the same file.
    """
    with open(input_filepath) as old_file:
        # dict.fromkeys de-duplicates like a set but keeps first-seen order,
        # which makes the output deterministic.
        rows = list(dict.fromkeys(line.strip() for line in old_file))

    with open(output_filepath, 'w') as new_file:
        for row in rows:
            fields = row.split(";")
            # len() guard: a row without ";" has no second field to compare.
            if "game" in fields[0] and len(fields) > 1 and fields[1] == "UNKNOWN":
                row = fields[0] + ";" + "GAME_UNKNOWN"
            new_file.write(row + "\n")


# Works correctly, but is too slow.
async def get_category_by_app(app_id) -> str:
    """Return the Google Play category of *app_id* by parsing its store page.

    Downloads the app's details page with ``requests``, parses it with
    BeautifulSoup and uses the first link whose ``href`` contains
    ``/store/apps/category/``.

    Args:
        app_id: value substituted into the ``id=`` query parameter.

    Returns:
        The link's ``href`` with ``https://play.google.com`` and
        ``/store/apps/category/`` removed, or ``"UNKNOWN"`` when the page
        has no such link.

    Raises:
        requests.RequestException: if the download fails or times out.
    """
    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
        app_id)
    # requests is blocking: run it in the default executor so the event loop
    # is not stalled, and bound the wait so a dead connection cannot hang
    # this coroutine forever.
    loop = asyncio.get_event_loop()
    page = await loop.run_in_executor(
        None, lambda: requests.get(url, timeout=30))
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, 'html.parser')
    category = "UNKNOWN"
    tag = soup.select_one("a[href*='/store/apps/category/']")
    if tag and tag.get("href"):
        category = tag["href"].replace(
            "https://play.google.com", "").replace("/store/apps/category/", "")
    return category


async def get_cat_by_app(app_id) -> str:
    """Return the Google Play category of *app_id* using a regex search.

    Downloads the app's details page through a fresh ``aiohttp`` session and
    searches the HTML for an ``itemprop="genre"`` link pointing at
    ``/store/apps/category/<NAME>`` (case-insensitive).

    Args:
        app_id: value substituted into the ``id=`` query parameter.

    Returns:
        The ``<NAME>`` part of the first matching link, or ``"UNKNOWN"``
        when the page contains no match.
    """
    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
        app_id)
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
    # Raw string: "\w" inside a normal literal is an invalid escape sequence
    # (a warning today), and "/" never needs escaping in a Python regex.
    search = re.search(
        r'itemprop="genre" href="/store/apps/category/(\w+)',
        html, re.IGNORECASE)
    return search.group(1) if search else "UNKNOWN"


async def fetch(session, url):
    """Request *url* through *session* and return the response body as text.

    ``session.get(url)`` is entered as an async context manager, so its exit
    handler runs even if reading the body raises.  get_cat_by_app() passes an
    ``aiohttp.ClientSession`` as *session*.
    """
    pending = session.get(url)
    async with pending as reply:
        body = await reply.text()
    return body


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    # `main` is a click command.  In click's default standalone mode calling
    # it ends in sys.exit(), so code wrapped around the call never runs.
    # With standalone_mode=False click returns the callback's result and
    # raises its errors instead of exiting, so they are handled here.
    try:
        outcome = main(standalone_mode=False)
    except click.ClickException as error:
        # Report usage errors the way standalone mode would.
        error.show()
        raise SystemExit(error.exit_code)
    except click.Abort:
        click.echo("Aborted!", err=True)
        raise SystemExit(1)

    # If the command callback is a coroutine function, click hands back the
    # un-awaited coroutine; drive it to completion on a private event loop.
    if asyncio.iscoroutine(outcome):
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(outcome)
        finally:
            loop.close()
	# -*- coding: utf-8 -*-
	import click
	import logging
	from pathlib import Path
	from dotenv import find_dotenv, load_dotenv
	import requests
	from bs4 import BeautifulSoup
	import json
	import os.path
	import re
	import aiohttp
	import asyncio


	async def _collect_categories(app_ids, output_filepath):
	    """Look up the category of every id in *app_ids* and append the rows.

	    Each result is written to *output_filepath* as an ``id;category`` line.
	    The file is opened in append mode so rows saved by an earlier run are
	    kept rather than overwritten.
	    """
	    with open(output_filepath, "a") as ofile:
	        for app_id in app_ids:
	            category = await get_cat_by_app(app_id)
	            ofile.write(app_id + ";" + category + "\n")


	@click.command()
	@click.argument('input_filepath', type=click.Path(exists=True))
	@click.argument('output_filepath', type=click.Path())
	def main(input_filepath, output_filepath):
	    """ Runs data processing scripts to turn raw data from (../raw) into
	        cleaned data ready to be analyzed (saved in ../processed).

	    INPUT_FILEPATH holds one app id per line.  Ids already present in
	    OUTPUT_FILEPATH are skipped; for the rest the category is looked up and
	    an ``id;category`` row is appended to OUTPUT_FILEPATH.  Finally
	    processing_file() writes the post-processed copy to
	    ../../data/processed/<name of OUTPUT_FILEPATH>.
	    """
	    # NOTE: this callback is deliberately synchronous.  click calls the
	    # callback directly and does not await coroutines, so an ``async def``
	    # here would never execute its body.

	    logger = logging.getLogger(__name__)
	    logger.info('making data from json app_ids')

	    with open(input_filepath) as ifile:
	        app_ids = {line.strip() for line in ifile}
	    app_ids.discard("")  # blank lines are not app ids

	    if os.path.isfile(output_filepath):
	        # Resume support: skip ids whose row was already written.
	        with open(output_filepath) as done_file:
	            done_ids = {line.strip().split(";")[0] for line in done_file}
	        app_ids -= done_ids

	    # Drive the async lookups on a private loop and always close it.
	    loop = asyncio.new_event_loop()
	    try:
	        loop.run_until_complete(_collect_categories(app_ids, output_filepath))
	    finally:
	        loop.close()

	    processing_file(output_filepath, "../../data/processed/" + os.path.basename(output_filepath))



	def processing_file(input_filepath, output_filepath):
	    """Copy ``app_id;category`` rows, relabelling unknown games.

	    Reads *input_filepath*, strips each line, drops duplicate lines (the
	    first occurrence keeps its position) and writes the result to
	    *output_filepath*.  A row whose first ``;``-separated field contains
	    ``"game"`` and whose second field is exactly ``"UNKNOWN"`` is rewritten
	    as ``<first field>;GAME_UNKNOWN``.  Every other row is copied unchanged,
	    including rows that contain no ``;`` at all.

	    The input is read completely before the output is opened, so both
	    arguments may name the same file.
	    """
	    with open(input_filepath) as old_file:
	        # dict.fromkeys de-duplicates like a set but keeps first-seen order,
	        # which makes the output deterministic.
	        rows = list(dict.fromkeys(line.strip() for line in old_file))

	    with open(output_filepath, 'w') as new_file:
	        for row in rows:
	            fields = row.split(";")
	            # len() guard: a row without ";" has no second field to compare.
	            if "game" in fields[0] and len(fields) > 1 and fields[1] == "UNKNOWN":
	                row = fields[0] + ";" + "GAME_UNKNOWN"
	            new_file.write(row + "\n")


	# Works correctly, but is too slow.
	async def get_category_by_app(app_id) -> str:
	    """Return the Google Play category of *app_id* by parsing its store page.

	    Downloads the app's details page with ``requests``, parses it with
	    BeautifulSoup and uses the first link whose ``href`` contains
	    ``/store/apps/category/``.

	    Args:
	        app_id: value substituted into the ``id=`` query parameter.

	    Returns:
	        The link's ``href`` with ``https://play.google.com`` and
	        ``/store/apps/category/`` removed, or ``"UNKNOWN"`` when the page
	        has no such link.

	    Raises:
	        requests.RequestException: if the download fails or times out.
	    """
	    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
	        app_id)
	    # requests is blocking: run it in the default executor so the event loop
	    # is not stalled, and bound the wait so a dead connection cannot hang
	    # this coroutine forever.
	    loop = asyncio.get_event_loop()
	    page = await loop.run_in_executor(
	        None, lambda: requests.get(url, timeout=30))
	    page.encoding = page.apparent_encoding
	    soup = BeautifulSoup(page.text, 'html.parser')
	    category = "UNKNOWN"
	    tag = soup.select_one("a[href*='/store/apps/category/']")
	    if tag and tag.get("href"):
	        category = tag["href"].replace(
	            "https://play.google.com", "").replace("/store/apps/category/", "")
	    return category


	async def get_cat_by_app(app_id) -> str:
	    """Return the Google Play category of *app_id* using a regex search.

	    Downloads the app's details page through a fresh ``aiohttp`` session and
	    searches the HTML for an ``itemprop="genre"`` link pointing at
	    ``/store/apps/category/<NAME>`` (case-insensitive).

	    Args:
	        app_id: value substituted into the ``id=`` query parameter.

	    Returns:
	        The ``<NAME>`` part of the first matching link, or ``"UNKNOWN"``
	        when the page contains no match.
	    """
	    url = "https://play.google.com/store/apps/details?id={}&hl=en&gl=us".format(
	        app_id)
	    async with aiohttp.ClientSession() as session:
	        html = await fetch(session, url)
	    # Raw string: "\w" inside a normal literal is an invalid escape sequence
	    # (a warning today), and "/" never needs escaping in a Python regex.
	    search = re.search(
	        r'itemprop="genre" href="/store/apps/category/(\w+)',
	        html, re.IGNORECASE)
	    return search.group(1) if search else "UNKNOWN"


	async def fetch(session, url):
	    """Request *url* through *session* and return the response body as text.

	    ``session.get(url)`` is entered as an async context manager, so its exit
	    handler runs even if reading the body raises.  get_cat_by_app() passes an
	    ``aiohttp.ClientSession`` as *session*.
	    """
	    async with session.get(url) as response:
	        return await response.text()


	if __name__ == '__main__':
	    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	    logging.basicConfig(level=logging.INFO, format=log_fmt)

	    # not used in this stub but often useful for finding various files
	    project_dir = Path(__file__).resolve().parents[2]

	    # find .env automagically by walking up directories until it's found, then
	    # load up the .env entries as environment variables
	    load_dotenv(find_dotenv())

	    # `main` is a click command.  In click's default standalone mode calling
	    # it ends in sys.exit(), so code wrapped around the call never runs.
	    # With standalone_mode=False click returns the callback's result and
	    # raises its errors instead of exiting, so they are handled here.
	    try:
	        outcome = main(standalone_mode=False)
	    except click.ClickException as error:
	        # Report usage errors the way standalone mode would.
	        error.show()
	        raise SystemExit(error.exit_code)
	    except click.Abort:
	        click.echo("Aborted!", err=True)
	        raise SystemExit(1)

	    # If the command callback is a coroutine function, click hands back the
	    # un-awaited coroutine; drive it to completion on a private event loop.
	    if asyncio.iscoroutine(outcome):
	        loop = asyncio.new_event_loop()
	        try:
	            loop.run_until_complete(outcome)
	        finally:
	            loop.close()