@mdzhang · Created October 12, 2019
Scrape NomadList for data and dump to CSV
"""Scrape data from NomadList into local CSV.
Usage:
python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud 'Buenos Aires' 'Mexico City'
"""
import argparse
import logging
import os
import re
import string
import typing as T

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_to_record(city):
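    # NomadList serves each city at a slugified URL, e.g. "Chiang Mai" -> /chiang-mai.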
    clean_city = re.sub(r"\s+", "-", city.lower())
    url = f"https://nomadlist.com/{clean_city}"

    driver = webdriver.Firefox()
    driver.get(url)
    html_source = driver.page_source
    driver.quit()  # quit() also shuts down geckodriver, unlike close()

    soup = BeautifulSoup(html_source, "html.parser")
    nomad_scores = soup.find_all("div", attrs={"class": "tab-ranking"})[0]

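    # Scores are laid out as <td class="key"> / <td class="value"> pairs.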
    keys = list(
        map(lambda e: e.getText(), nomad_scores.find_all("td", attrs={"class": "key"}))
    )
    values = list(
        map(
            lambda e: e.getText(), nomad_scores.find_all("td", attrs={"class": "value"})
        )
    )

    record = dict(zip(keys, values))
    record["city"] = city
    return record


def load_to_df(cities):
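    # Log and skip any city whose page fails to load or parse,
    # rather than aborting the whole run.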
    def skip_fail(city):
        try:
            return load_to_record(city)
        except Exception as exc:
            logger.exception(f"Failed to fetch city: {city}, {exc}")
            return None

    records = list(filter(None, map(skip_fail, cities)))
    df = pd.DataFrame.from_dict(records)

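    # Headers come back with emoji prefixes; strip non-printable characters
    # so columns can be addressed by plain names below.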
    def strip_emojis(s):
        return "".join(filter(lambda x: x in string.printable, s)).strip()

    df.columns = [strip_emojis(col) for col in df.columns]

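    # Columns to keep; "city" labels each row.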
    top_cols = [
        "LGBT friendly",
        "Female friendly",
        "Safety",
        "Nomad Score",
        "Internet",
        "Walkability",
        "Traffic safety",
        "English speaking",
        "Fun",
        "Happiness",
        "Places to work from",
        "Cost",
        "city",
    ]
    # Work on a copy so the column conversions below don't warn about
    # writing to a slice of df.
    df2 = df[top_cols].copy()
    # pd.set_option('display.max_columns', None)

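    # "Cost" values look like "...: $1,234 / mo"; extract the dollar amount as an int.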
    def extract_cost(df):
        parts = df["Cost"].str.split(":", expand=True)
        cost = (
            parts[1]
            .str.extract(pat=r"\$([\d,]+) \/ mo")[0]
            .str.replace(",", "")
            .astype(int)
        )
        return cost

    df2["Cost"] = extract_cost(df2)

    def extract_internet(df):
        parts = df["Internet"].str.split(":", expand=True)
        speed = (
            parts[1]
            .str.extract(pat=r"([\d,]+)Mbps \(avg\)")[0]
            .str.replace(",", "")
            .astype(int)
        )
        return speed

    df2["Internet"] = extract_internet(df2)

    def extract_nomad_score(df):
        return df["Nomad Score"].str.split("/", expand=True)[0].astype(float)

    df2["Nomad Score"] = extract_nomad_score(df2)

    # The remaining object columns are ordinal ratings; make them ordered
    # categoricals so a descending sort ranks Great > Good > Okay > Bad.
    cat_cols = df2.dtypes[df2.dtypes == "object"].index.drop("city")

    levels = ["Bad", "Okay", "Good", "Great"]
    df2[cat_cols] = df2[cat_cols].apply(
        lambda s: s.astype("category").cat.set_categories(levels, ordered=True)
    )

    return df2.sort_values(
        by=["LGBT friendly", "Female friendly", "Walkability", "Safety"],
        ascending=False,
    )


def get_parser():
    parser = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    parser.add_argument("--cities", nargs="+", help="Cities to fetch data on")
    return parser


def main(cities: T.List[str]):
    cache_file = "nomadlist.csv"

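    # Scrape once and cache to CSV; delete the file to force a refresh.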
    if not os.path.exists(cache_file):
        logger.info(f"Fetching contents for the first time; writing '{cache_file}'")
        df = load_to_df(cities)
        df.to_csv(cache_file, index=False)
    else:
        logger.info(f"Reusing local '{cache_file}'")
        df = pd.read_csv(cache_file)

    print(tabulate(df, headers="keys", tablefmt="psql"))


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(cities=args.cities)
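
# Note: the cached CSV can be read back for ad-hoc analysis; a minimal
# sketch (column names match top_cols above, the threshold is illustrative):
#
#     df = pd.read_csv("nomadlist.csv")
#     print(df[df["Internet"] >= 20].sort_values("Cost"))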