@mdzhang · Created October 12, 2019
Scrape NomadList for data and dump to CSV
"""Scrape data from NomadList into local CSV.
Usage:
python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud 'Buenos Aires' 'Mexico City'
"""
import argparse
import logging
import os
import re
import string
import typing as T

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_to_record(city):
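    # NomadList serves each city at a slugified URL, e.g. "Chiang Mai" -> /chiang-mai.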
    clean_city = re.sub(r"\s+", "-", city.lower())
    url = f"https://nomadlist.com/{clean_city}"

    driver = webdriver.Firefox()
    driver.get(url)
    html_source = driver.page_source
    driver.quit()  # quit() also shuts down geckodriver, unlike close()

    soup = BeautifulSoup(html_source, "html.parser")
    nomad_scores = soup.find_all("div", attrs={"class": "tab-ranking"})[0]

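    # Scores are laid out as <td class="key"> / <td class="value"> pairs.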
    keys = list(
        map(lambda e: e.getText(), nomad_scores.find_all("td", attrs={"class": "key"}))
    )
    values = list(
        map(
            lambda e: e.getText(), nomad_scores.find_all("td", attrs={"class": "value"})
        )
    )

    record = dict(zip(keys, values))
    record["city"] = city
    return record


def load_to_df(cities):
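    # Log and skip any city whose page fails to load or parse,
    # rather than aborting the whole run.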
    def skip_fail(city):
        try:
            return load_to_record(city)
        except Exception as exc:
            logger.exception(f"Failed to fetch city: {city}, {exc}")
            return None

    records = list(filter(None, map(skip_fail, cities)))
    df = pd.DataFrame.from_dict(records)

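    # Headers come back with emoji prefixes; strip non-printable characters
    # so columns can be addressed by plain names below.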
    def strip_emojis(s):
        return "".join(filter(lambda x: x in string.printable, s)).strip()

    df.columns = [strip_emojis(col) for col in df.columns]

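    # Columns to keep; "city" labels each row.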
    top_cols = [
        "LGBT friendly",
        "Female friendly",
        "Safety",
        "Nomad Score",
        "Internet",
        "Walkability",
        "Traffic safety",
        "English speaking",
        "Fun",
        "Happiness",
        "Places to work from",
        "Cost",
        "city",
    ]
    # Work on a copy so the column conversions below don't warn about
    # writing to a slice of df.
    df2 = df[top_cols].copy()
    # pd.set_option('display.max_columns', None)

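    # "Cost" values look like "...: $1,234 / mo"; extract the dollar amount as an int.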
    def extract_cost(df):
        parts = df["Cost"].str.split(":", expand=True)
        cost = (
            parts[1]
            .str.extract(pat=r"\$([\d,]+) \/ mo")[0]
            .str.replace(",", "")
            .astype(int)
        )
        return cost

    df2["Cost"] = extract_cost(df2)

    def extract_internet(df):
        parts = df["Internet"].str.split(":", expand=True)
        speed = (
            parts[1]
            .str.extract(pat=r"([\d,]+)Mbps \(avg\)")[0]
            .str.replace(",", "")
            .astype(int)
        )
        return speed

    df2["Internet"] = extract_internet(df2)

    def extract_nomad_score(df):
        return df["Nomad Score"].str.split("/", expand=True)[0].astype(float)

    df2["Nomad Score"] = extract_nomad_score(df2)

    # The remaining object columns are ordinal ratings; make them ordered
    # categoricals so a descending sort ranks Great > Good > Okay > Bad.
    cat_cols = df2.dtypes[df2.dtypes == "object"].index.drop("city")

    levels = ["Bad", "Okay", "Good", "Great"]
    df2[cat_cols] = df2[cat_cols].apply(
        lambda s: s.astype("category").cat.set_categories(levels, ordered=True)
    )

    return df2.sort_values(
        by=["LGBT friendly", "Female friendly", "Walkability", "Safety"],
        ascending=False,
    )


def get_parser():
    parser = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    parser.add_argument("--cities", nargs="+", help="Cities to fetch data on")
    return parser


def main(cities: T.List[str]):
    cache_file = "nomadlist.csv"

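    # Scrape once and cache to CSV; delete the file to force a refresh.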
    if not os.path.exists(cache_file):
        logger.info(f"Fetching contents for the first time; writing '{cache_file}'")
        df = load_to_df(cities)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)

    print(tabulate(df, headers="keys", tablefmt="psql"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(cities=args.cities)
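
# Note: the cached CSV can be read back for ad-hoc analysis; a minimal
# sketch (column names match top_cols above, the threshold is illustrative):
#
#     df = pd.read_csv("nomadlist.csv")
#     print(df[df["Internet"] >= 20].sort_values("Cost"))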