Skip to content

Instantly share code, notes, and snippets.

@AmauryCarrade
Created April 7, 2019 15:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AmauryCarrade/6cf41bfecb0f352bea3209c1a9e7f874 to your computer and use it in GitHub Desktop.
Save AmauryCarrade/6cf41bfecb0f352bea3209c1a9e7f874 to your computer and use it in GitHub Desktop.
"""
- Classification
- Date
- Location
- Category
- Title
- Price per person
- Review Rating
- Number of reives
- Content type : 1 => une photo, 2 => vidéo, 3 => multiple photos
- Duration => heures
- Offer => “Includes:1 repas”, “Includes:1 repas + boisson”, etc.
- Number of languages
- Languages
- Host photo
- About host
- What we'll do
- What else you should do
- What we'll provide
- Cancellation policy => ignore
- Who can come
- Government ID
- Group size
- Activity level
- Alcohol
- Link
"""
from time import sleep
import csv
import json
import os
from datetime import datetime
import click
import requests
from bs4 import BeautifulSoup
from click.termui import get_terminal_size
from click._termui_impl import BEFORE_BAR as WRITE_OVER_CHAR
from path import Path
# WARNING
# These were valid when this script was executed, but they are very likely to
# have a short TTL. If nothing works anymore, load an AirBNB page with a listing,
# look up the XHR requests, and grab the client session ID & API key in the GET
# parameters.
# Both defaults can be overridden with --client-session-id / --api-key.
CLIENT_SESSION_ID = "0b0ebe63-129a-4342-8ae1-f1f71f42ac9a"
API_KEY = "d306zoyjsyarp7ifhu67rjxn52tv0t20"
# Column names of the output CSV, in writing order. These are also the keys of
# the per-experience dicts filled in cmd().
# NOTE(review): "number_of_reives" is a misspelling of "number_of_reviews",
# kept as-is because it is the actual column name used throughout the script
# and in already-written CSV files.
headers = [
    "id",  # ok
    "classification",  # ok
    "date",  # ok
    "location",  # ok
    "category",  # ok
    "title",  # ok
    "price_per_person",  # ok
    "review_rating",  # ok
    "number_of_reives",  # ok
    "content_type",  # ok
    "duration",  # ok
    "offer",  # ok
    "number_of_languages",  # ok
    "languages",  # ok
    "host_photo",  # ok
    "about_host",  # ok
    "what_well_do",  # ok
    "what_else_you_should_do",  # ok
    "what_well_provide",  # ok
    "cancellation_policy",  # ok
    "who_can_come",  # ok
    "government_id",  # ok
    "group_size",  # ok
    "activity_level",  # ok
    "alcohol",  # ok
    "link",  # ok
]
def save(experiences, h, to):
    """Save `experiences` (a list of dicts) with headers `h` to the CSV file `to`.

    The file is overwritten. A header row is written unless the first row
    already looks like a header (its "link" cell literally contains "link"),
    which happens when the rows were loaded back from a previously saved file
    with `headers` passed as DictReader fieldnames.
    """
    with open(to, "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=h)
        # Only write a header when the data does not already start with one.
        # The `not experiences` guard avoids an IndexError on an empty list
        # (the original crashed when there was nothing to save).
        if not experiences or "link" not in experiences[0] or experiences[0]["link"] != "link":
            writer.writeheader()
        writer.writerows(experiences)
@click.command(options_metavar="[options]")
@click.argument("filename", type=click.Path(dir_okay=False, readable=True, writable=True), metavar="<filename.csv>")
@click.option(
"--dirname",
type=click.Path(dir_okay=True, file_okay=False, writable=True),
help="The directory to write additional data into. If not specified, a "
"directory derived from the file name will be used.",
)
@click.option(
"--query", prompt="Please enter the search query (e.g. “Barcelona, Spain”)", help="The query sent to AirBNB."
)
@click.option("--locale", default="en", help="The localization of the results.", show_default=True)
@click.option(
"--skip-links-collection",
is_flag=True,
default=False,
help="If true, the given CSV will be loaded and only data from links "
"already there will be collected, without re-loading all experiences.",
)
@click.option(
"--client-session-id",
default=CLIENT_SESSION_ID,
help="The session ID to use for requests. If all requests are refused, you "
"may want to extract your session ID from XHR requests executed by "
"AirBNB's listing pages, somewhere in the GET parameters.",
)
@click.option(
"--api-key",
default=API_KEY,
help="The API key to use for requests. Same as before, update this key by "
"your own key found at the same place if the requests fail.",
)
@click.option("--wait-delay", default=4, help="The delay in seconds to wait between two requests.", show_default=True)
@click.option(
"--debug/--no-debug", default=False, help="Prints debug infos after the data collection", show_default=True
)
def cmd(filename, dirname, query, client_session_id, api_key, locale, skip_links_collection, wait_delay, debug):
if not os.path.exists(filename):
with open(filename, "w+"):
pass
with open(filename, newline="") as csv_file:
experiences = [row for row in csv.DictReader(csv_file, headers)]
s = requests.Session()
# Tells AirBNB we're a legitimate browser (+ firefox rpz)
s.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0"
# 1. All links
if not skip_links_collection:
"""
https://www.airbnb.com/api/v2/explore_tabs?locale=en&_format=for_explore_search_web&_intents=p1&adults=1&auto_ib=true&children=0&client_session_id=0b0ebe63-129a-4342-8ae1-f1f71f42ac9a&currency=EUR&experiences_per_grid=20&fetch_filters=true&guidebooks_per_grid=20&has_zero_guest_treatment=true&infants=0&is_guided_search=true&is_new_cards_experiment=true&is_standard_search=true&is_user_submitted_query=true&items_per_grid=18&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&locale=fr&luxury_pre_launch=false&metadata_only=false&query=Barcelone,%20Spain&query_understanding_enabled=true&refinement_paths[]=/experiences&satori_version=1.1.8&screen_size=large&selected_tab_id=experience_tab&show_groupings=true&supports_for_you_v3=true&timezone_offset=60&version=1.4.5
https://www.airbnb.com/api/v2/explore_tabs?locale=en&_format=for_explore_search_web&_intents=p1&adults=1&auto_ib=true&children=0&client_session_id=0b0ebe63-129a-4342-8ae1-f1f71f42ac9a&currency=EUR&experiences_per_grid=20&federated_search_session_id=564ca790-d95e-43e4-a757-752d01d2fa31&fetch_filters=true&guidebooks_per_grid=20&has_zero_guest_treatment=true&infants=0&is_guided_search=true&is_new_cards_experiment=true&is_standard_search=true&is_user_submitted_query=true&items_offset=18&items_per_grid=18&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&locale=fr&luxury_pre_launch=true&metadata_only=false&query=Barcelone, Spain&query_understanding_enabled=true&refinement_paths[]=/experiences&satori_version=1.1.8&screen_size=large&section_offset=2&show_groupings=true&supports_for_you_v3=true&tab_id=experience_tab&timezone_offset=60&version=1.4.
locale=en important
items_offset=18
items_per_grid=18
"""
def get_experiences(refinement_path=None):
if refinement_path is None:
refinement_path = "/experiences"
offset = 0
# It looks like the batch size is capped by airbnb, this is a size below the
# experimental max of 49 just to be sure
batch_size = 32
base_url = (
"https://www.airbnb.com/api/v2/explore_tabs"
"?_format=for_explore_search_web"
"&_intents=p1"
# "&adults=0"
"&auto_ib=true"
# "&children=0"
f"&client_session_id={client_session_id}" # may have to change that for later runs
"&currency=EUR"
"&experiences_per_grid=20"
"&fetch_filters=true"
"&guidebooks_per_grid=20"
"&has_zero_guest_treatment=true"
# "&infants=0"
"&is_guided_search=true"
"&is_new_cards_experiment=true"
"&is_standard_search=true"
f"&items_per_grid={batch_size}"
f"&key={api_key}" # API key - may have to change that for later runs too
f"&locale={locale}" # Locale, important
"&luxury_pre_launch=false"
"&metadata_only=false"
f"&query={query}" # If we want results for another location, change here
f"&refinement_paths[]={refinement_path}"
"&query_understanding_enabled=true"
"&satori_version=1.1.8"
"&screen_size=large"
"&selected_tab_id=experience_tab"
"&show_groupings=true"
"&supports_for_you_v3=true"
"&timezone_offset=60"
"&version=1.4.5"
)
# For some reasons, this must be set for the non-filtered query but not for
# refined ones, else we get no results. Well, don't ask.
base_url += "&is_user_submitted_query=true" if refinement_path is None else "" # yeah totally
refinements = []
experiences = []
click.secho(f"Loading experiences from {refinement_path}…", bold=True, fg="blue")
while True:
offset_query = f"&items_offset={offset}" if offset > 0 else ""
r = s.get(base_url + offset_query)
if not r.ok:
print(f"[!] Request failed: HTTP {r.status_code}")
continue
# print(r.url)
exp_list = r.json()
# print(exp_list)
if "explore_tabs" not in exp_list:
print("Missing explore_tabs")
continue
exp_tab = None
for tab in exp_list["explore_tabs"]:
if "tab_id" in tab and tab["tab_id"] == "experience_tab":
exp_tab = tab
break
if not exp_tab:
print("Missing experience_tab")
continue
if "sections" not in exp_tab:
print("Missing experience_tab.sections")
continue
exp_section = None
end_of_list = False
for section in exp_tab["sections"]:
# is there a “refinements” section?
if "result_type" in section and section["result_type"] == "refinements" and not refinements:
if "refinements" in section:
for refinement in section["refinements"]:
if "search_params" in refinement and "refinement_path" in refinement["search_params"]:
refinements.append(refinement["search_params"]["refinement_path"])
if "result_type" in section and section["result_type"] == "experiences":
exp_section = section
break
# is there a “no result” section?
elif "result_type" in section and section["result_type"] == "messages":
# may break later
if "messages" in section and section["messages"][0]["title"].lower() == "no results":
end_of_list = True
break
if end_of_list:
break
if not exp_section:
print("Missing section of type experiences")
continue
if "trip_templates" not in exp_section:
print("Missing trip templates in this section")
continue
for trip in exp_section["trip_templates"]:
trip_id = trip["id"] if "id" in trip else "__INVALID__"
trip_title = trip["title"] if "title" in trip else "(No Title Found)"
click.echo(WRITE_OVER_CHAR + (" " * get_terminal_size()[0]), nl=False)
click.echo(
WRITE_OVER_CHAR + f"Found trip #{trip_id}: {trip_title} - {len(experiences) + 1} so far…",
nl=False,
)
experiences.append(
{
"title": trip_title,
"link": f"https://www.airbnb.fr/experiences/{trip_id}", # FIXME Magic value
"classification": trip["primary_category_name"] if "primary_category_name" in trip else "",
}
)
offset += batch_size
# Prevents infinite loops
if offset > 2 ** 14:
break
click.echo(WRITE_OVER_CHAR + (" " * get_terminal_size()[0]), nl=False)
click.echo(WRITE_OVER_CHAR + f"{len(experiences)} experiences found for this filter.")
return experiences, refinements
click.secho("Retrieving all events links…\n", bold=True)
experiences, refinements = get_experiences()
experiences_links = set([exp["link"] for exp in experiences])
for refinement in refinements:
exps, _ = get_experiences(refinement_path=refinement)
for exp in exps:
if exp["link"] not in experiences_links:
experiences_links.add(exp["link"])
experiences.append(exp)
click.echo(f"\nNo more results ({len(experiences)} total).")
click.echo("Saving links… ", nl=False)
save(experiences, headers, filename)
click.echo("Done.")
# 2. Experiences data collection
click.secho("\nCollecting data from each AirBNB experience…\n", bold=True)
collections_since_last_save = 0
dirname = Path(dirname or filename.replace(".csv", "").replace(".", "-"))
dirname.makedirs_p()
data_dir = dirname / "data"
photos_dir = dirname / "host-photos"
data_dir.makedirs_p()
photos_dir.makedirs_p()
all_alcohol_req = set()
all_who_can_come = set()
no_json = set()
exp_internal_id = 0
with click.progressbar(
experiences,
item_show_func=lambda item: (item["link"] if item and "link" in item else "")
+ (f" [ ! {len(no_json)}]" if no_json else ""),
) as bar:
for exp in bar:
# If empty line or header
if not exp["link"] or exp["link"] == "link":
continue
r = s.get(exp["link"], params={"locale": locale})
if not r.ok:
print(f" → Cannot get experience {exp['link']}: HTTP {r.status_code} :'(")
continue
soup = BeautifulSoup(r.text, "html.parser")
# AirBNB's experience pages contains a blob of JSON with almost anything
# in it, in a <script type="application/json" data-hypernova-key="experience_pdpbundlejs" …></script>
# tag.
raw_embed_json = None
for script in soup.find_all("script", attrs={"data-hypernova-key": "experience_pdpbundlejs"}):
if script.string:
raw_embed_json = script.string
if not raw_embed_json:
no_json.add(exp["link"])
continue
if raw_embed_json.startswith("<!--"):
raw_embed_json = raw_embed_json[4:]
if raw_embed_json.endswith("-->"):
raw_embed_json = raw_embed_json[:-3]
embed = json.loads(raw_embed_json)
airbnb_id = exp["link"].replace("https://www.airbnb.fr/experiences/", "")
with open(data_dir / f"{airbnb_id}.json", "w+") as f:
json.dump(embed, f, indent=4)
if (
not "bootstrapData" in embed
or not "reduxBootstrap" in embed["bootstrapData"]
or not "trip_template" in embed["bootstrapData"]["reduxBootstrap"]
):
no_json.add(exp["link"])
continue
trip = embed["bootstrapData"]["reduxBootstrap"]["trip_template"]
desc = trip["description_native"] if "description_native" in trip else dict()
with open(data_dir / f"{airbnb_id}-trip.json", "w+") as f:
json.dump(trip, f, indent=4)
exp_internal_id += 1
exp["id"] = exp_internal_id
exp["date"] = datetime.now()
exp["title"] = desc.get("name")
exp["location"] = trip.get("city_native")
exp["category"] = trip.get("action_kicker", "")
exp["about_host"] = trip.get("about_host", "")
exp["price_per_person"] = trip.get("base_price", "")
exp["review_rating"] = trip.get("display_rating", "")
exp["number_of_reives"] = trip.get("review_count", "")
exp["group_size"] = trip.get("max_guests", "")
exp["number_of_languages"] = len(trip.get("offered_languages", []))
exp["languages"] = ", ".join(trip.get("offered_languages", []))
exp["duration"] = sum([e["duration_hours"] for e in trip.get("experiences", [])])
exp["government_id"] = 1 if trip.get("require_id_verification", False) else 0
exp["cancellation_policy"] = 1
# Content type:
# 1 => a single photo
# 2 => at least a video
# 3 => multiple photos
medias = (
trip.get("carousel_collection_multimedia", [])
or trip.get("carousel_collection_multimedia_derived", [])
or trip.get("carousel_collection_multimedia_v2", [])
)
content_type = 0
images_count = 0
if medias:
for media in medias:
if "video" in media:
content_type = 2
break
elif "picture" in media:
images_count += 1
if content_type == 0 and images_count > 0:
content_type = 1 if images_count == 1 else 3
exp["content_type"] = content_type
# Requirements (other than gov ID)
for section in trip.get("guest_requirement_list", dict()).get("sections", []):
req_type = section.get("requirement_type", "")
req_desc = section.get("description", "")
if req_type == "who_can_come":
exp["who_can_come"] = req_desc
if req_desc:
all_who_can_come.add(req_desc.splitlines()[0])
elif req_type == "alcohol_requirement":
all_alcohol_req.add(req_desc)
exp["alcohol"] = 1
if "alcohol" not in exp or not exp["alcohol"]:
exp["alcohol"] = 0
# Offer
for highlight in trip.get("highlights", []):
if highlight.get("airmoji_id", "") == "description_menu":
exp["offer"] = highlight.get("text", "")
# Amnesties
amnesties = []
we_will_do = []
what_else = []
for e in trip.get("experiences", []):
we_will_do.append(e.get("description_native", dict()).get("what_you_will_do", ""))
what_else.append(e.get("description_native", dict()).get("what_is_not_included", ""))
for amnesty in e.get("amenities", e.get("amenities_native", [])):
name = amnesty.get("name", None)
desc = amnesty.get("description", "")
if name:
name = name + "\n"
amnesties.append(f"{name}{desc}")
exp["what_well_do"] = "\n\n".join(we_will_do)
exp["what_else_you_should_do"] = "\n\n".join(what_else)
exp["what_well_provide"] = "\n\n".join(amnesties)
# Activity level
# 1 => all
# 2 => children
# 3 => adolescents
# 4 => adults
# Not displayed explicitly. But the first line of `who_can_come` contains hints.
who = exp["who_can_come"].splitlines()[0] if exp["who_can_come"] else None
lvl = 0
if who:
who = who.lower()
if "guests of all ages" in who:
lvl = 1
elif (
"guests ages 2 and up" in who
or "guests ages 3 and up" in who
or "guests ages 4 and up" in who
or "guests ages 5 and up" in who
or "guests ages 6 and up" in who
or "guests ages 7 and up" in who
or "guests ages 8 and up" in who
or "guests ages 9 and up" in who
or "guests ages 10 and up" in who
or "guests ages 11 and up" in who
):
lvl = 2
elif (
"guests ages 12 and up" in who
or "guests ages 13 and up" in who
or "guests ages 14 and up" in who
or "guests ages 15 and up" in who
or "guests ages 16 and up" in who
or "guests ages 17 and up" in who
):
lvl = 3
elif (
"guests ages 18 and up" in who
or "guests ages 19 and up" in who
or "guests ages 20 and up" in who
or "guests ages 21 and up" in who
or "guests ages 22 and up" in who
or "guests ages 23 and up" in who
):
lvl = 4
if lvl:
exp["activity_level"] = lvl
# Host photo
host_photo_url = (
trip.get("experience_host_profile", dict()).get("host", dict()).get("profile_pic_path", None)
)
if host_photo_url and host_photo_url.startswith("http"):
try:
r_photo = s.get(host_photo_url, stream=True)
if r_photo.ok:
with open(photos_dir / f"{exp_internal_id}.jpg", "wb+") as fd:
for chunk in r_photo.iter_content(chunk_size=128):
fd.write(chunk)
except Exception as e:
print(e)
collections_since_last_save += 1
if collections_since_last_save > 4:
save(experiences, headers, filename)
collections_since_last_save = 0
# break
sleep(wait_delay)
save(experiences, headers, filename)
click.echo("Done.")
if debug:
click.secho("\nDebug infos\n", bold=True)
click.echo("All “who can come” first lines")
for who in all_who_can_come:
click.echo(f" → {who}")
click.echo("\nAll alcohol requirements lines")
for alcohol in all_alcohol_req:
click.echo(f" → {alcohol}")
if no_json:
click.secho("\nAll pages with failed JSON extraction", bold=True, fg="red")
for page in no_json:
click.echo(f" → {page}")
click.echo()
if __name__ == "__main__":
cmd()
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
requests = "*"
beautifulsoup4 = "*"
click = "*"
path-py = "*"
[requires]
python_version = "3.7"
{
"_meta": {
"hash": {
"sha256": "06a1e6e5f039fb67e32f0ad6afa5b8b7c6615a855503a0d6a32e398d9af4efcd"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.7"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"beautifulsoup4": {
"hashes": [
"sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858",
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
],
"index": "pypi",
"version": "==4.7.1"
},
"certifi": {
"hashes": [
"sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5",
"sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae"
],
"version": "==2019.3.9"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"click": {
"hashes": [
"sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
"sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
],
"index": "pypi",
"version": "==7.0"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"importlib-metadata": {
"hashes": [
"sha256:a17ce1a8c7bff1e8674cb12c992375d8d0800c9190177ecf0ad93e0097224095",
"sha256:b50191ead8c70adfa12495fba19ce6d75f2e0275c14c5a7beb653d6799b512bd"
],
"version": "==0.8"
},
"path-py": {
"hashes": [
"sha256:31ea790adf5f606c254599639f216234fb77d61d05b827c88ebe9b71e56266ef"
],
"index": "pypi",
"version": "==11.5.0"
},
"requests": {
"hashes": [
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
"sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
],
"index": "pypi",
"version": "==2.21.0"
},
"soupsieve": {
"hashes": [
"sha256:afa56bf14907bb09403e5d15fbed6275caa4174d36b975226e3b67a3bb6e2c4b",
"sha256:eaed742b48b1f3e2d45ba6f79401b2ed5dc33b2123dfe216adb90d4bfa0ade26"
],
"version": "==1.8"
},
"urllib3": {
"hashes": [
"sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39",
"sha256:de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"
],
"version": "==1.24.1"
},
"zipp": {
"hashes": [
"sha256:55ca87266c38af6658b84db8cfb7343cdb0bf275f93c7afaea0d8e7a209c7478",
"sha256:682b3e1c62b7026afe24eadf6be579fb45fec54c07ea218bded8092af07a68c4"
],
"version": "==0.3.3"
}
},
"develop": {}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment