Skip to content

Instantly share code, notes, and snippets.

@AmauryCarrade
Created April 7, 2019 15:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AmauryCarrade/6cf41bfecb0f352bea3209c1a9e7f874 to your computer and use it in GitHub Desktop.
Save AmauryCarrade/6cf41bfecb0f352bea3209c1a9e7f874 to your computer and use it in GitHub Desktop.
"""
- Classification
- Date
- Location
- Category
- Title
- Price per person
- Review Rating
- Number of reives
- Content type : 1 => une photo, 2 => vidéo, 3 => multiple photos
- Duration => heures
- Offer => “Includes:1 repas”, “Includes:1 repas + boisson”, etc.
- Number of languages
- Languages
- Host photo
- About host
- What we'll do
- What else you should do
- What we'll provide
- Cancellation policy => ignore
- Who can come
- Government ID
- Group size
- Activity level
- Alcohol
- Link
"""
from time import sleep
import csv
import json
import os
from datetime import datetime
import click
import requests
from bs4 import BeautifulSoup
from click.termui import get_terminal_size
from click._termui_impl import BEFORE_BAR as WRITE_OVER_CHAR
from path import Path
# WARNING
# These were valid when this script was executed, but they are very likely to
# have a short TTL. If nothing works anymore, load an AirBNB page with a listing,
# look up the XHR requests, and grab the client session ID & API key in the GET
# parameters.
# Both defaults can be overridden with --client-session-id / --api-key.
CLIENT_SESSION_ID = "0b0ebe63-129a-4342-8ae1-f1f71f42ac9a"
API_KEY = "d306zoyjsyarp7ifhu67rjxn52tv0t20"
# Column names of the output CSV, in writing order. These are also the keys of
# the per-experience dicts filled in cmd().
# NOTE(review): "number_of_reives" is a misspelling of "number_of_reviews",
# kept as-is because it is the actual column name used throughout the script
# and in already-written CSV files.
headers = [
    "id",  # ok
    "classification",  # ok
    "date",  # ok
    "location",  # ok
    "category",  # ok
    "title",  # ok
    "price_per_person",  # ok
    "review_rating",  # ok
    "number_of_reives",  # ok
    "content_type",  # ok
    "duration",  # ok
    "offer",  # ok
    "number_of_languages",  # ok
    "languages",  # ok
    "host_photo",  # ok
    "about_host",  # ok
    "what_well_do",  # ok
    "what_else_you_should_do",  # ok
    "what_well_provide",  # ok
    "cancellation_policy",  # ok
    "who_can_come",  # ok
    "government_id",  # ok
    "group_size",  # ok
    "activity_level",  # ok
    "alcohol",  # ok
    "link",  # ok
]
def save(experiences, h, to):
    """Save `experiences` (a list of dicts) with headers `h` to the CSV file `to`.

    The file is overwritten. A header row is written unless the first row
    already looks like a header (its "link" cell literally contains "link"),
    which happens when the rows were loaded back from a previously saved file
    with `headers` passed as DictReader fieldnames.
    """
    with open(to, "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=h)
        # Only write a header when the data does not already start with one.
        # The `not experiences` guard avoids an IndexError on an empty list
        # (the original crashed when there was nothing to save).
        if not experiences or "link" not in experiences[0] or experiences[0]["link"] != "link":
            writer.writeheader()
        writer.writerows(experiences)
@click.command(options_metavar="[options]")
@click.argument("filename", type=click.Path(dir_okay=False, readable=True, writable=True), metavar="<filename.csv>")
@click.option(
"--dirname",
type=click.Path(dir_okay=True, file_okay=False, writable=True),
help="The directory to write additional data into. If not specified, a "
"directory derived from the file name will be used.",
)
@click.option(
"--query", prompt="Please enter the search query (e.g. “Barcelona, Spain”)", help="The query sent to AirBNB."
)
@click.option("--locale", default="en", help="The localization of the results.", show_default=True)
@click.option(
"--skip-links-collection",
is_flag=True,
default=False,
help="If true, the given CSV will be loaded and only data from links "
"already there will be collected, without re-loading all experiences.",
)
@click.option(
"--client-session-id",
default=CLIENT_SESSION_ID,
help="The session ID to use for requests. If all requests are refused, you "
"may want to extract your session ID from XHR requests executed by "
"AirBNB's listing pages, somewhere in the GET parameters.",
)
@click.option(
"--api-key",
default=API_KEY,
help="The API key to use for requests. Same as before, update this key by "
"your own key found at the same place if the requests fail.",
)
@click.option("--wait-delay", default=4, help="The delay in seconds to wait between two requests.", show_default=True)
@click.option(
"--debug/--no-debug", default=False, help="Prints debug infos after the data collection", show_default=True
)
def cmd(filename, dirname, query, client_session_id, api_key, locale, skip_links_collection, wait_delay, debug):
if not os.path.exists(filename):
with open(filename, "w+"):
pass
with open(filename, newline="") as csv_file:
experiences = [row for row in csv.DictReader(csv_file, headers)]
s = requests.Session()
# Tells AirBNB we're a legitimate browser (+ firefox rpz)
s.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0"
# 1. All links
if not skip_links_collection:
"""
https://www.airbnb.com/api/v2/explore_tabs?locale=en&_format=for_explore_search_web&_intents=p1&adults=1&auto_ib=true&children=0&client_session_id=0b0ebe63-129a-4342-8ae1-f1f71f42ac9a&currency=EUR&experiences_per_grid=20&fetch_filters=true&guidebooks_per_grid=20&has_zero_guest_treatment=true&infants=0&is_guided_search=true&is_new_cards_experiment=true&is_standard_search=true&is_user_submitted_query=true&items_per_grid=18&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&locale=fr&luxury_pre_launch=false&metadata_only=false&query=Barcelone,%20Spain&query_understanding_enabled=true&refinement_paths[]=/experiences&satori_version=1.1.8&screen_size=large&selected_tab_id=experience_tab&show_groupings=true&supports_for_you_v3=true&timezone_offset=60&version=1.4.5
https://www.airbnb.com/api/v2/explore_tabs?locale=en&_format=for_explore_search_web&_intents=p1&adults=1&auto_ib=true&children=0&client_session_id=0b0ebe63-129a-4342-8ae1-f1f71f42ac9a&currency=EUR&experiences_per_grid=20&federated_search_session_id=564ca790-d95e-43e4-a757-752d01d2fa31&fetch_filters=true&guidebooks_per_grid=20&has_zero_guest_treatment=true&infants=0&is_guided_search=true&is_new_cards_experiment=true&is_standard_search=true&is_user_submitted_query=true&items_offset=18&items_per_grid=18&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&locale=fr&luxury_pre_launch=true&metadata_only=false&query=Barcelone, Spain&query_understanding_enabled=true&refinement_paths[]=/experiences&satori_version=1.1.8&screen_size=large&section_offset=2&show_groupings=true&supports_for_you_v3=true&tab_id=experience_tab&timezone_offset=60&version=1.4.
locale=en important
items_offset=18
items_per_grid=18
"""
def get_experiences(refinement_path=None):
if refinement_path is None:
refinement_path = "/experiences"
offset = 0
# It looks like the batch size is capped by airbnb, this is a size below the
# experimental max of 49 just to be sure
batch_size = 32
base_url = (
"https://www.airbnb.com/api/v2/explore_tabs"
"?_format=for_explore_search_web"
"&_intents=p1"
# "&adults=0"
"&auto_ib=true"
# "&children=0"
f"&client_session_id={client_session_id}" # may have to change that for later runs
"&currency=EUR"
"&experiences_per_grid=20"
"&fetch_filters=true"
"&guidebooks_per_grid=20"
"&has_zero_guest_treatment=true"
# "&infants=0"
"&is_guided_search=true"
"&is_new_cards_experiment=true"
"&is_standard_search=true"
f"&items_per_grid={batch_size}"
f"&key={api_key}" # API key - may have to change that for later runs too
f"&locale={locale}" # Locale, important
"&luxury_pre_launch=false"
"&metadata_only=false"
f"&query={query}" # If we want results for another location, change here
f"&refinement_paths[]={refinement_path}"
"&query_understanding_enabled=true"
"&satori_version=1.1.8"
"&screen_size=large"
"&selected_tab_id=experience_tab"
"&show_groupings=true"
"&supports_for_you_v3=true"
"&timezone_offset=60"
"&version=1.4.5"
)
# For some reasons, this must be set for the non-filtered query but not for
# refined ones, else we get no results. Well, don't ask.
base_url += "&is_user_submitted_query=true" if refinement_path is None else "" # yeah totally
refinements = []
experiences = []
click.secho(f"Loading experiences from {refinement_path}…", bold=True, fg="blue")
while True:
offset_query = f"&items_offset={offset}" if offset > 0 else ""
r = s.get(base_url + offset_query)
if not r.ok:
print(f"[!] Request failed: HTTP {r.status_code}")
continue
# print(r.url)
exp_list = r.json()
# print(exp_list)
if "explore_tabs" not in exp_list:
print("Missing explore_tabs")
continue
exp_tab = None
for tab in exp_list["explore_tabs"]:
if "tab_id" in tab and tab["tab_id"] == "experience_tab":
exp_tab = tab
break
if not exp_tab:
print("Missing experience_tab")
continue
if "sections" not in exp_tab:
print("Missing experience_tab.sections")
continue
exp_section = None
end_of_list = False
for section in exp_tab["sections"]:
# is there a “refinements” section?
if "result_type" in section and section["result_type"] == "refinements" and not refinements:
if "refinements" in section:
for refinement in section["refinements"]:
if "search_params" in refinement and "refinement_path" in refinement["search_params"]:
refinements.append(refinement["search_params"]["refinement_path"])
if "result_type" in section and section["result_type"] == "experiences":
exp_section = section
break
# is there a “no result” section?
elif "result_type" in section and section["result_type"] == "messages":
# may break later
if "messages" in section and section["messages"][0]["title"].lower() == "no results":
end_of_list = True
break
if end_of_list:
break
if not exp_section:
print("Missing section of type experiences")
continue
if "trip_templates" not in exp_section:
print("Missing trip templates in this section")
continue
for trip in exp_section["trip_templates"]:
trip_id = trip["id"] if "id" in trip else "__INVALID__"
trip_title = trip["title"] if "title" in trip else "(No Title Found)"
click.echo(WRITE_OVER_CHAR + (" " * get_terminal_size()[0]), nl=False)
click.echo(
WRITE_OVER_CHAR + f"Found trip #{trip_id}: {trip_title} - {len(experiences) + 1} so far…",
nl=False,
)
experiences.append(
{
"title": trip_title,
"link": f"https://www.airbnb.fr/experiences/{trip_id}", # FIXME Magic value
"classification": trip["primary_category_name"] if "primary_category_name" in trip else "",
}
)
offset += batch_size
# Prevents infinite loops
if offset > 2 ** 14:
break
click.echo(WRITE_OVER_CHAR + (" " * get_terminal_size()[0]), nl=False)
click.echo(WRITE_OVER_CHAR + f"{len(experiences)} experiences found for this filter.")
return experiences, refinements
click.secho("Retrieving all events links…\n", bold=True)
experiences, refinements = get_experiences()
experiences_links = set([exp["link"] for exp in experiences])
for refinement in refinements:
exps, _ = get_experiences(refinement_path=refinement)
for exp in exps:
if exp["link"] not in experiences_links:
experiences_links.add(exp["link"])
experiences.append(exp)
click.echo(f"\nNo more results ({len(experiences)} total).")
click.echo("Saving links… ", nl=False)
save(experiences, headers, filename)
click.echo("Done.")
# 2. Experiences data collection
click.secho("\nCollecting data from each AirBNB experience…\n", bold=True)
collections_since_last_save = 0
dirname = Path(dirname or filename.replace(".csv", "").replace(".", "-"))
dirname.makedirs_p()
data_dir = dirname / "data"
photos_dir = dirname / "host-photos"
data_dir.makedirs_p()
photos_dir.makedirs_p()
all_alcohol_req = set()
all_who_can_come = set()
no_json = set()
exp_internal_id = 0
with click.progressbar(
experiences,
item_show_func=lambda item: (item["link"] if item and "link" in item else "")
+ (f" [ ! {len(no_json)}]" if no_json else ""),
) as bar:
for exp in bar:
# If empty line or header
if not exp["link"] or exp["link"] == "link":
continue
r = s.get(exp["link"], params={"locale": locale})
if not r.ok:
print(f" → Cannot get experience {exp['link']}: HTTP {r.status_code} :'(")
continue
soup = BeautifulSoup(r.text, "html.parser")
# AirBNB's experience pages contains a blob of JSON with almost anything
# in it, in a <script type="application/json" data-hypernova-key="experience_pdpbundlejs" …></script>
# tag.
raw_embed_json = None
for script in soup.find_all("script", attrs={"data-hypernova-key": "experience_pdpbundlejs"}):
if script.string:
raw_embed_json = script.string
if not raw_embed_json:
no_json.add(exp["link"])
continue
if raw_embed_json.startswith("<!--"):
raw_embed_json = raw_embed_json[4:]
if raw_embed_json.endswith("-->"):
raw_embed_json = raw_embed_json[:-3]
embed = json.loads(raw_embed_json)
airbnb_id = exp["link"].replace("https://www.airbnb.fr/experiences/", "")
with open(data_dir / f"{airbnb_id}.json", "w+") as f:
json.dump(embed, f, indent=4)
if (
not "bootstrapData" in embed
or not "reduxBootstrap" in embed["bootstrapData"]
or not "trip_template" in embed["bootstrapData"]["reduxBootstrap"]
):
no_json.add(exp["link"])
continue
trip = embed["bootstrapData"]["reduxBootstrap"]["trip_template"]
desc = trip["description_native"] if "description_native" in trip else dict()
with open(data_dir / f"{airbnb_id}-trip.json", "w+") as f:
json.dump(trip, f, indent=4)
exp_internal_id += 1
exp["id"] = exp_internal_id
exp["date"] = datetime.now()
exp["title"] = desc.get("name")
exp["location"] = trip.get("city_native")
exp["category"] = trip.get("action_kicker", "")
exp["about_host"] = trip.get("about_host", "")
exp["price_per_person"] = trip.get("base_price", "")
exp["review_rating"] = trip.get("display_rating", "")
exp["number_of_reives"] = trip.get("review_count", "")
exp["group_size"] = trip.get("max_guests", "")
exp["number_of_languages"] = len(trip.get("offered_languages", []))
exp["languages"] = ", ".join(trip.get("offered_languages", []))
exp["duration"] = sum([e["duration_hours"] for e in trip.get("experiences", [])])
exp["government_id"] = 1 if trip.get("require_id_verification", False) else 0
exp["cancellation_policy"] = 1
# Content type:
# 1 => a single photo
# 2 => at least a video
# 3 => multiple photos
medias = (
trip.get("carousel_collection_multimedia", [])
or trip.get("carousel_collection_multimedia_derived", [])
or trip.get("carousel_collection_multimedia_v2", [])
)
content_type = 0
images_count = 0
if medias:
for media in medias:
if "video" in media:
content_type = 2
break
elif "picture" in media:
images_count += 1
if content_type == 0 and images_count > 0:
content_type = 1 if images_count == 1 else 3
exp["content_type"] = content_type
# Requirements (other than gov ID)
for section in trip.get("guest_requirement_list", dict()).get("sections", []):
req_type = section.get("requirement_type", "")
req_desc = section.get("description", "")
if req_type == "who_can_come":
exp["who_can_come"] = req_desc
if req_desc:
all_who_can_come.add(req_desc.splitlines()[0])
elif req_type == "alcohol_requirement":
all_alcohol_req.add(req_desc)
exp["alcohol"] = 1
if "alcohol" not in exp or not exp["alcohol"]:
exp["alcohol"] = 0
# Offer
for highlight in trip.get("highlights", []):
if highlight.get("airmoji_id", "") == "description_menu":
exp["offer"] = highlight.get("text", "")
# Amnesties
amnesties = []
we_will_do = []
what_else = []
for e in trip.get("experiences", []):
we_will_do.append(e.get("description_native", dict()).get("what_you_will_do", ""))
what_else.append(e.get("description_native", dict()).get("what_is_not_included", ""))
for amnesty in e.get("amenities", e.get("amenities_native", [])):
name = amnesty.get("name", None)
desc = amnesty.get("description", "")
if name:
name = name + "\n"
amnesties.append(f"{name}{desc}")
exp["what_well_do"] = "\n\n".join(we_will_do)
exp["what_else_you_should_do"] = "\n\n".join(what_else)
exp["what_well_provide"] = "\n\n".join(amnesties)
# Activity level
# 1 => all
# 2 => children
# 3 => adolescents
# 4 => adults
# Not displayed explicitly. But the first line of `who_can_come` contains hints.
who = exp["who_can_come"].splitlines()[0] if exp["who_can_come"] else None
lvl = 0
if who:
who = who.lower()
if "guests of all ages" in who:
lvl = 1
elif (
"guests ages 2 and up" in who
or "guests ages 3 and up" in who
or "guests ages 4 and up" in who
or "guests ages 5 and up" in who
or "guests ages 6 and up" in who
or "guests ages 7 and up" in who
or "guests ages 8 and up" in who
or "guests ages 9 and up" in who
or "guests ages 10 and up" in who
or "guests ages 11 and up" in who
):
lvl = 2
elif (
"guests ages 12 and up" in who
or "guests ages 13 and up" in who
or "guests ages 14 and up" in who
or "guests ages 15 and up" in who
or "guests ages 16 and up" in who
or "guests ages 17 and up" in who
):
lvl = 3
elif (
"guests ages 18 and up" in who
or "guests ages 19 and up" in who
or "guests ages 20 and up" in who
or "guests ages 21 and up" in who
or "guests ages 22 and up" in who
or "guests ages 23 and up" in who
):
lvl = 4
if lvl:
exp["activity_level"] = lvl
# Host photo
host_photo_url = (
trip.get("experience_host_profile", dict()).get("host", dict()).get("profile_pic_path", None)
)
if host_photo_url and host_photo_url.startswith("http"):
try:
r_photo = s.get(host_photo_url, stream=True)
if r_photo.ok:
with open(photos_dir / f"{exp_internal_id}.jpg", "wb+") as fd:
for chunk in r_photo.iter_content(chunk_size=128):
fd.write(chunk)
except Exception as e:
print(e)
collections_since_last_save += 1
if collections_since_last_save > 4:
save(experiences, headers, filename)
collections_since_last_save = 0
# break
sleep(wait_delay)
save(experiences, headers, filename)
click.echo("Done.")
if debug:
click.secho("\nDebug infos\n", bold=True)
click.echo("All “who can come” first lines")
for who in all_who_can_come:
click.echo(f" → {who}")
click.echo("\nAll alcohol requirements lines")
for alcohol in all_alcohol_req:
click.echo(f" → {alcohol}")
if no_json:
click.secho("\nAll pages with failed JSON extraction", bold=True, fg="red")
for page in no_json:
click.echo(f" → {page}")
click.echo()
if __name__ == "__main__":
cmd()
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
requests = "*"
beautifulsoup4 = "*"
click = "*"
path-py = "*"
[requires]
python_version = "3.7"
{
"_meta": {
"hash": {
"sha256": "06a1e6e5f039fb67e32f0ad6afa5b8b7c6615a855503a0d6a32e398d9af4efcd"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.7"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"beautifulsoup4": {
"hashes": [
"sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858",
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
],
"index": "pypi",
"version": "==4.7.1"
},
"certifi": {
"hashes": [
"sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5",
"sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae"
],
"version": "==2019.3.9"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"click": {
"hashes": [
"sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
"sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
],
"index": "pypi",
"version": "==7.0"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"importlib-metadata": {
"hashes": [
"sha256:a17ce1a8c7bff1e8674cb12c992375d8d0800c9190177ecf0ad93e0097224095",
"sha256:b50191ead8c70adfa12495fba19ce6d75f2e0275c14c5a7beb653d6799b512bd"
],
"version": "==0.8"
},
"path-py": {
"hashes": [
"sha256:31ea790adf5f606c254599639f216234fb77d61d05b827c88ebe9b71e56266ef"
],
"index": "pypi",
"version": "==11.5.0"
},
"requests": {
"hashes": [
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
"sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
],
"index": "pypi",
"version": "==2.21.0"
},
"soupsieve": {
"hashes": [
"sha256:afa56bf14907bb09403e5d15fbed6275caa4174d36b975226e3b67a3bb6e2c4b",
"sha256:eaed742b48b1f3e2d45ba6f79401b2ed5dc33b2123dfe216adb90d4bfa0ade26"
],
"version": "==1.8"
},
"urllib3": {
"hashes": [
"sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39",
"sha256:de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"
],
"version": "==1.24.1"
},
"zipp": {
"hashes": [
"sha256:55ca87266c38af6658b84db8cfb7343cdb0bf275f93c7afaea0d8e7a209c7478",
"sha256:682b3e1c62b7026afe24eadf6be579fb45fec54c07ea218bded8092af07a68c4"
],
"version": "==0.3.3"
}
},
"develop": {}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment