Skip to content

Instantly share code, notes, and snippets.

@benwtrent
Last active November 11, 2019 17:24
Show Gist options
  • Save benwtrent/a8f83d2ccf01eb515004280d42fe9310 to your computer and use it in GitHub Desktop.
Save benwtrent/a8f83d2ccf01eb515004280d42fe9310 to your computer and use it in GitHub Desktop.
Building out average price prediction for a house, given Asheville Airbnb listing data
{
"listings-ash" : {
"aliases" : { },
"mappings" : {
"_meta" : {
"created_by" : "ml-file-data-visualizer"
},
"properties" : {
"@timestamp" : {
"type" : "date"
},
"access" : {
"type" : "text"
},
"accommodates" : {
"type" : "long"
},
"amenities" : {
"type" : "text"
},
"availability_30" : {
"type" : "long"
},
"availability_365" : {
"type" : "long"
},
"availability_60" : {
"type" : "long"
},
"availability_90" : {
"type" : "long"
},
"bathrooms" : {
"type" : "double"
},
"bed_type" : {
"type" : "keyword"
},
"bedrooms" : {
"type" : "long"
},
"beds" : {
"type" : "long"
},
"calculated_host_listings_count" : {
"type" : "long"
},
"calculated_host_listings_count_entire_homes" : {
"type" : "long"
},
"calculated_host_listings_count_private_rooms" : {
"type" : "long"
},
"calculated_host_listings_count_shared_rooms" : {
"type" : "long"
},
"calendar_last_scraped" : {
"type" : "date",
"format" : "iso8601"
},
"calendar_updated" : {
"type" : "keyword"
},
"cancellation_policy" : {
"type" : "keyword"
},
"city" : {
"type" : "keyword"
},
"cleaning_fee" : {
"type" : "double"
},
"country" : {
"type" : "keyword"
},
"country_code" : {
"type" : "keyword"
},
"description" : {
"type" : "text"
},
"experiences_offered" : {
"type" : "keyword"
},
"extra_people" : {
"type" : "double"
},
"first_review" : {
"type" : "date",
"format" : "iso8601"
},
"guests_included" : {
"type" : "long"
},
"has_availability" : {
"type" : "keyword"
},
"host_about" : {
"type" : "text"
},
"host_acceptance_rate" : {
"type" : "keyword"
},
"host_has_profile_pic" : {
"type" : "keyword"
},
"host_id" : {
"type" : "long"
},
"host_identity_verified" : {
"type" : "keyword"
},
"host_is_superhost" : {
"type" : "keyword"
},
"host_listings_count" : {
"type" : "long"
},
"host_location" : {
"type" : "keyword"
},
"host_name" : {
"type" : "keyword"
},
"host_neighbourhood" : {
"type" : "keyword"
},
"host_picture_url" : {
"type" : "keyword"
},
"host_response_rate" : {
"type" : "keyword"
},
"host_response_time" : {
"type" : "keyword"
},
"host_since" : {
"type" : "date",
"format" : "iso8601"
},
"host_thumbnail_url" : {
"type" : "keyword"
},
"host_total_listings_count" : {
"type" : "long"
},
"host_url" : {
"type" : "keyword"
},
"host_verifications" : {
"type" : "text"
},
"house_rules" : {
"type" : "text"
},
"instant_bookable" : {
"type" : "keyword"
},
"interaction" : {
"type" : "text"
},
"is_business_travel_ready" : {
"type" : "keyword"
},
"is_location_exact" : {
"type" : "keyword"
},
"jurisdiction_names" : {
"type" : "keyword"
},
"last_review" : {
"type" : "date",
"format" : "iso8601"
},
"last_scraped" : {
"type" : "date",
"format" : "iso8601"
},
"latitude" : {
"type" : "double"
},
"listing_id" : {
"type" : "long"
},
"listing_url" : {
"type" : "keyword"
},
"longitude" : {
"type" : "double"
},
"market" : {
"type" : "keyword"
},
"maximum_maximum_nights" : {
"type" : "long"
},
"maximum_minimum_nights" : {
"type" : "long"
},
"maximum_nights" : {
"type" : "long"
},
"maximum_nights_avg_ntm" : {
"type" : "double"
},
"minimum_maximum_nights" : {
"type" : "long"
},
"minimum_minimum_nights" : {
"type" : "long"
},
"minimum_nights" : {
"type" : "long"
},
"minimum_nights_avg_ntm" : {
"type" : "double"
},
"monthly_price" : {
"type" : "double"
},
"name" : {
"type" : "text"
},
"neighborhood_overview" : {
"type" : "text"
},
"neighbourhood_cleansed" : {
"type" : "long"
},
"notes" : {
"type" : "text"
},
"number_of_reviews" : {
"type" : "long"
},
"number_of_reviews_ltm" : {
"type" : "long"
},
"picture_url" : {
"type" : "keyword"
},
"price" : {
"type" : "double"
},
"property_type" : {
"type" : "keyword"
},
"require_guest_phone_verification" : {
"type" : "keyword"
},
"require_guest_profile_picture" : {
"type" : "keyword"
},
"requires_license" : {
"type" : "keyword"
},
"review_scores_accuracy" : {
"type" : "long"
},
"review_scores_checkin" : {
"type" : "long"
},
"review_scores_cleanliness" : {
"type" : "long"
},
"review_scores_communication" : {
"type" : "long"
},
"review_scores_location" : {
"type" : "long"
},
"review_scores_rating" : {
"type" : "long"
},
"review_scores_value" : {
"type" : "long"
},
"reviews_per_month" : {
"type" : "double"
},
"room_type" : {
"type" : "keyword"
},
"scrape_id" : {
"type" : "date",
"format" : "yyyyMMddHHmmss"
},
"security_deposit" : {
"type" : "double"
},
"smart_location" : {
"type" : "keyword"
},
"space" : {
"type" : "text"
},
"square_feet" : {
"type" : "long"
},
"state" : {
"type" : "keyword"
},
"street" : {
"type" : "text"
},
"summary" : {
"type" : "text"
},
"transit" : {
"type" : "text"
},
"weekly_price" : {
"type" : "double"
},
"zipcode" : {
"type" : "long"
}
}
},
"settings" : {
"index" : {
"creation_date" : "1572547796565",
"number_of_shards" : "1",
"number_of_replicas" : "1",
"uuid" : "HplZ3Q8qTai7kGpm0e-C5A",
"version" : {
"created" : "7040199"
},
"provided_name" : "listings-ash"
}
}
},
"listings-ash-calendar" : {
"aliases" : { },
"mappings" : {
"_meta" : {
"created_by" : "ml-file-data-visualizer"
},
"properties" : {
"@timestamp" : {
"type" : "date"
},
"adjusted_price" : {
"type" : "double"
},
"available" : {
"type" : "keyword"
},
"date" : {
"type" : "date",
"format" : "iso8601"
},
"listing_id" : {
"type" : "long"
},
"maximum_nights" : {
"type" : "long"
},
"minimum_nights" : {
"type" : "long"
},
"price" : {
"type" : "double"
}
}
},
"settings" : {
"index" : {
"creation_date" : "1572548275257",
"number_of_shards" : "1",
"number_of_replicas" : "1",
"uuid" : "jrU9jPUARlWFrB0i6YjaTQ",
"version" : {
"created" : "7040199"
},
"provided_name" : "listings-ash-calendar"
}
}
},
"listings-ash-review" : {
"aliases" : { },
"mappings" : {
"_meta" : {
"created_by" : "ml-file-data-visualizer"
},
"properties" : {
"@timestamp" : {
"type" : "date"
},
"comments" : {
"type" : "text"
},
"date" : {
"type" : "date",
"format" : "iso8601"
},
"listing_id" : {
"type" : "long"
},
"review_id" : {
"type" : "long"
},
"reviewer_id" : {
"type" : "long"
},
"reviewer_name" : {
"type" : "keyword"
}
}
},
"settings" : {
"index" : {
"creation_date" : "1572548566924",
"number_of_shards" : "1",
"number_of_replicas" : "1",
"uuid" : "px9n2sDvSPmU0cIDQHmXaA",
"version" : {
"created" : "7040199"
},
"provided_name" : "listings-ash-review"
}
}
}
}
#######
# Preview the pivot transform: the three source indices are merged into one
# row per listing by grouping on their shared `listing_id` field.
# The `description` aggregation keeps a single non-null description per
# listing (the reduce script returns the first non-null shard result).
# NOTE(review): the source index names below, and the `id` field counted by
# `num_reviews`, differ from the index definitions shown earlier in this file
# (listings-ash, listings-ash-calendar, listings-ash-review / `review_id`).
# Confirm which names are current before running.
#######
POST _data_frame/transforms/_preview
{
  "source": {
    "index": [
      "listings-calendars-ash",
      "listings-reviews-ash",
      "listings-ash-verbose-1"
    ]
  },
  "pivot": {
    "group_by": {
      "listing_id": { "terms": { "field": "listing_id" } }
    },
    "aggregations": {
      "max_price": { "max": { "field": "price" } },
      "min_price": { "min": { "field": "price" } },
      "avg_price": { "avg": { "field": "price" } },
      "num_reviews": { "cardinality": { "field": "id" } },
      "num_bedrooms": { "max": { "field": "bedrooms" } },
      "num_beds": { "max": { "field": "beds" } },
      "security_deposit": { "max": { "field": "security_deposit" } },
      "num_bathrooms": { "max": { "field": "bathrooms" } },
      "accommodates": { "max": { "field": "accommodates" } },
      "location_geo": { "geo_centroid": { "field": "location_geo" } },
      "review_score_rating": { "avg": { "field": "review_scores_rating" } },
      "description": {
        "scripted_metric": {
          "init_script": "state.description = null",
          "map_script": "state.description = params._source.description",
          "combine_script": "return state.description",
          "reduce_script": "for (d in states) if (d != null) return d"
        }
      }
    }
  }
}
#######
# Create the `listings-ash-pivot` transform (same pivot as the preview, now
# with a destination index), start it, then check its progress via _stats.
# NOTE(review): the source index names below, and the `id` field counted by
# `num_reviews`, differ from the index definitions shown earlier in this file
# (listings-ash, listings-ash-calendar, listings-ash-review / `review_id`).
# Confirm which names are current before running.
#######
PUT _data_frame/transforms/listings-ash-pivot
{
  "source": {
    "index": [
      "listings-calendars-ash",
      "listings-reviews-ash",
      "listings-ash-verbose-1"
    ]
  },
  "dest": {
    "index": "listings-ash-pivot"
  },
  "pivot": {
    "group_by": {
      "listing_id": { "terms": { "field": "listing_id" } }
    },
    "aggregations": {
      "max_price": { "max": { "field": "price" } },
      "min_price": { "min": { "field": "price" } },
      "avg_price": { "avg": { "field": "price" } },
      "num_reviews": { "cardinality": { "field": "id" } },
      "num_bedrooms": { "max": { "field": "bedrooms" } },
      "num_beds": { "max": { "field": "beds" } },
      "security_deposit": { "max": { "field": "security_deposit" } },
      "num_bathrooms": { "max": { "field": "bathrooms" } },
      "accommodates": { "max": { "field": "accommodates" } },
      "location_geo": { "geo_centroid": { "field": "location_geo" } },
      "review_score_rating": { "avg": { "field": "review_scores_rating" } },
      "description": {
        "scripted_metric": {
          "init_script": "state.description = null",
          "map_script": "state.description = params._source.description",
          "combine_script": "return state.description",
          "reduce_script": "for (d in states) if (d != null) return d"
        }
      }
    }
  }
}
POST _data_frame/transforms/listings-ash-pivot/_start
GET _data_frame/transforms/listings-ash-pivot/_stats
####
# Outlier detection on the pivoted data, used to weed out outliers before
# training the regression model.
# Reads listings-ash-pivot and writes to listings-ash-pivot-outliers, with the
# analysis results stored under the `ml` field (e.g. `ml.outlier_score`).
# `listing_id` is excluded from the analyzed fields.
# PUT only creates the job; it must be started explicitly, and _stats can be
# polled to see when it has finished.
####
PUT _ml/data_frame/analytics/listings-ash-pivot-outliers
{
  "source": {
    "index": [
      "listings-ash-pivot"
    ]
  },
  "dest": {
    "index": "listings-ash-pivot-outliers",
    "results_field": "ml"
  },
  "analysis": {
    "outlier_detection": {}
  },
  "analyzed_fields": {
    "excludes": [
      "listing_id"
    ]
  }
}
POST _ml/data_frame/analytics/listings-ash-pivot-outliers/_start
GET _ml/data_frame/analytics/listings-ash-pivot-outliers/_stats
#######
# Index a hand-made listing describing my desired Airbnb, using the same
# feature names the pivot produces. `avg_price` is deliberately left out.
# `ml.outlier_score` is set to 0.0 so the document passes the
# `ml.outlier_score < 0.6` filter used by the regression job's source query.
#######
POST listings-ash-pivot-outliers/_doc
{
  "num_bathrooms": 2.0,
  "num_beds": 4.0,
  "review_score_rating": 90.0,
  "max_price": 270.0,
  "min_price": 100.0,
  "accommodates": 4.0,
  "num_reviews": 20.0,
  "num_bedrooms": 2.0,
  "ml": { "outlier_score": 0.0 }
}
######
# Regression job: predict the average price from the pivoted features.
# dependent_variable = the field to predict, "avg_price".
# Input is restricted to documents in listings-ash-pivot-outliers whose
# ml.outlier_score is below 0.6; results are written to
# listings-ash-pivot-pred under the `ml_regression` field.
# PUT only creates the job, so it is started explicitly afterwards.
######
PUT _ml/data_frame/analytics/listings-ash-pivot-outliers-pred
{
  "source": {
    "index": [
      "listings-ash-pivot-outliers"
    ],
    "query": {
      "range": {
        "ml.outlier_score": {
          "lt": 0.6
        }
      }
    }
  },
  "dest": {
    "index": "listings-ash-pivot-pred",
    "results_field": "ml_regression"
  },
  "analysis": {
    "regression": {
      "dependent_variable": "avg_price"
    }
  },
  "analyzed_fields": {
    "includes": [
      "num_bathrooms",
      "num_beds",
      "avg_price",
      "review_score_rating",
      "max_price",
      "min_price",
      "accommodates",
      "num_reviews",
      "num_bedrooms"
    ]
  }
}
POST _ml/data_frame/analytics/listings-ash-pivot-outliers-pred/_start
# Fetch the documents that were not used for training
# (ml_regression.is_training = false).
# Presumably this returns the hand-added listing, which has no avg_price,
# along with its prediction. Verify against the results.
GET listings-ash-pivot-pred/_search
{
  "query": {
    "term": {
      "ml_regression.is_training": { "value": false }
    }
  }
}
#######
# How accurate is the regression model?
# Compares the actual `avg_price` with `ml_regression.avg_price_prediction`
# over the documents flagged as training data, and reports R-squared and
# mean squared error.
#######
POST _ml/data_frame/_evaluate
{
  "index": "listings-ash-pivot-pred",
  "query": {
    "term": {
      "ml_regression.is_training": { "value": true }
    }
  },
  "evaluation": {
    "regression": {
      "actual_field": "avg_price",
      "predicted_field": "ml_regression.avg_price_prediction",
      "metrics": {
        "r_squared": {},
        "mean_squared_error": {}
      }
    }
  }
}
#######
# On average, how far off are the predictions?
# Takes the square root of `params.number`, which is presumably the
# mean_squared_error value reported by the evaluation; paste in your own
# result. The output is then a root mean squared error.
#######
POST /_scripts/painless/_execute
{
  "script": {
    "source": "Math.sqrt(params.number)",
    "params": { "number": 348.4186538329173 }
  }
}
@benwtrent
Copy link
Author

The data is available here; it is a combination of the last year's calendar data, listing details, and reviews: http://insideairbnb.com/get-the-data.html

I added all the data via the CSV uploader.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment