Last active
November 11, 2019 17:24
-
-
Save benwtrent/a8f83d2ccf01eb515004280d42fe9310 to your computer and use it in GitHub Desktop.
building out avg price prediction on a house given ashville air bnb listing data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"listings-ash" : { | |
"aliases" : { }, | |
"mappings" : { | |
"_meta" : { | |
"created_by" : "ml-file-data-visualizer" | |
}, | |
"properties" : { | |
"@timestamp" : { | |
"type" : "date" | |
}, | |
"access" : { | |
"type" : "text" | |
}, | |
"accommodates" : { | |
"type" : "long" | |
}, | |
"amenities" : { | |
"type" : "text" | |
}, | |
"availability_30" : { | |
"type" : "long" | |
}, | |
"availability_365" : { | |
"type" : "long" | |
}, | |
"availability_60" : { | |
"type" : "long" | |
}, | |
"availability_90" : { | |
"type" : "long" | |
}, | |
"bathrooms" : { | |
"type" : "double" | |
}, | |
"bed_type" : { | |
"type" : "keyword" | |
}, | |
"bedrooms" : { | |
"type" : "long" | |
}, | |
"beds" : { | |
"type" : "long" | |
}, | |
"calculated_host_listings_count" : { | |
"type" : "long" | |
}, | |
"calculated_host_listings_count_entire_homes" : { | |
"type" : "long" | |
}, | |
"calculated_host_listings_count_private_rooms" : { | |
"type" : "long" | |
}, | |
"calculated_host_listings_count_shared_rooms" : { | |
"type" : "long" | |
}, | |
"calendar_last_scraped" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"calendar_updated" : { | |
"type" : "keyword" | |
}, | |
"cancellation_policy" : { | |
"type" : "keyword" | |
}, | |
"city" : { | |
"type" : "keyword" | |
}, | |
"cleaning_fee" : { | |
"type" : "double" | |
}, | |
"country" : { | |
"type" : "keyword" | |
}, | |
"country_code" : { | |
"type" : "keyword" | |
}, | |
"description" : { | |
"type" : "text" | |
}, | |
"experiences_offered" : { | |
"type" : "keyword" | |
}, | |
"extra_people" : { | |
"type" : "double" | |
}, | |
"first_review" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"guests_included" : { | |
"type" : "long" | |
}, | |
"has_availability" : { | |
"type" : "keyword" | |
}, | |
"host_about" : { | |
"type" : "text" | |
}, | |
"host_acceptance_rate" : { | |
"type" : "keyword" | |
}, | |
"host_has_profile_pic" : { | |
"type" : "keyword" | |
}, | |
"host_id" : { | |
"type" : "long" | |
}, | |
"host_identity_verified" : { | |
"type" : "keyword" | |
}, | |
"host_is_superhost" : { | |
"type" : "keyword" | |
}, | |
"host_listings_count" : { | |
"type" : "long" | |
}, | |
"host_location" : { | |
"type" : "keyword" | |
}, | |
"host_name" : { | |
"type" : "keyword" | |
}, | |
"host_neighbourhood" : { | |
"type" : "keyword" | |
}, | |
"host_picture_url" : { | |
"type" : "keyword" | |
}, | |
"host_response_rate" : { | |
"type" : "keyword" | |
}, | |
"host_response_time" : { | |
"type" : "keyword" | |
}, | |
"host_since" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"host_thumbnail_url" : { | |
"type" : "keyword" | |
}, | |
"host_total_listings_count" : { | |
"type" : "long" | |
}, | |
"host_url" : { | |
"type" : "keyword" | |
}, | |
"host_verifications" : { | |
"type" : "text" | |
}, | |
"house_rules" : { | |
"type" : "text" | |
}, | |
"instant_bookable" : { | |
"type" : "keyword" | |
}, | |
"interaction" : { | |
"type" : "text" | |
}, | |
"is_business_travel_ready" : { | |
"type" : "keyword" | |
}, | |
"is_location_exact" : { | |
"type" : "keyword" | |
}, | |
"jurisdiction_names" : { | |
"type" : "keyword" | |
}, | |
"last_review" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"last_scraped" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"latitude" : { | |
"type" : "double" | |
}, | |
"listing_id" : { | |
"type" : "long" | |
}, | |
"listing_url" : { | |
"type" : "keyword" | |
}, | |
"longitude" : { | |
"type" : "double" | |
}, | |
"market" : { | |
"type" : "keyword" | |
}, | |
"maximum_maximum_nights" : { | |
"type" : "long" | |
}, | |
"maximum_minimum_nights" : { | |
"type" : "long" | |
}, | |
"maximum_nights" : { | |
"type" : "long" | |
}, | |
"maximum_nights_avg_ntm" : { | |
"type" : "double" | |
}, | |
"minimum_maximum_nights" : { | |
"type" : "long" | |
}, | |
"minimum_minimum_nights" : { | |
"type" : "long" | |
}, | |
"minimum_nights" : { | |
"type" : "long" | |
}, | |
"minimum_nights_avg_ntm" : { | |
"type" : "double" | |
}, | |
"monthly_price" : { | |
"type" : "double" | |
}, | |
"name" : { | |
"type" : "text" | |
}, | |
"neighborhood_overview" : { | |
"type" : "text" | |
}, | |
"neighbourhood_cleansed" : { | |
"type" : "long" | |
}, | |
"notes" : { | |
"type" : "text" | |
}, | |
"number_of_reviews" : { | |
"type" : "long" | |
}, | |
"number_of_reviews_ltm" : { | |
"type" : "long" | |
}, | |
"picture_url" : { | |
"type" : "keyword" | |
}, | |
"price" : { | |
"type" : "double" | |
}, | |
"property_type" : { | |
"type" : "keyword" | |
}, | |
"require_guest_phone_verification" : { | |
"type" : "keyword" | |
}, | |
"require_guest_profile_picture" : { | |
"type" : "keyword" | |
}, | |
"requires_license" : { | |
"type" : "keyword" | |
}, | |
"review_scores_accuracy" : { | |
"type" : "long" | |
}, | |
"review_scores_checkin" : { | |
"type" : "long" | |
}, | |
"review_scores_cleanliness" : { | |
"type" : "long" | |
}, | |
"review_scores_communication" : { | |
"type" : "long" | |
}, | |
"review_scores_location" : { | |
"type" : "long" | |
}, | |
"review_scores_rating" : { | |
"type" : "long" | |
}, | |
"review_scores_value" : { | |
"type" : "long" | |
}, | |
"reviews_per_month" : { | |
"type" : "double" | |
}, | |
"room_type" : { | |
"type" : "keyword" | |
}, | |
"scrape_id" : { | |
"type" : "date", | |
"format" : "yyyyMMddHHmmss" | |
}, | |
"security_deposit" : { | |
"type" : "double" | |
}, | |
"smart_location" : { | |
"type" : "keyword" | |
}, | |
"space" : { | |
"type" : "text" | |
}, | |
"square_feet" : { | |
"type" : "long" | |
}, | |
"state" : { | |
"type" : "keyword" | |
}, | |
"street" : { | |
"type" : "text" | |
}, | |
"summary" : { | |
"type" : "text" | |
}, | |
"transit" : { | |
"type" : "text" | |
}, | |
"weekly_price" : { | |
"type" : "double" | |
}, | |
"zipcode" : { | |
"type" : "long" | |
} | |
} | |
}, | |
"settings" : { | |
"index" : { | |
"creation_date" : "1572547796565", | |
"number_of_shards" : "1", | |
"number_of_replicas" : "1", | |
"uuid" : "HplZ3Q8qTai7kGpm0e-C5A", | |
"version" : { | |
"created" : "7040199" | |
}, | |
"provided_name" : "listings-ash" | |
} | |
} | |
}, | |
"listings-ash-calendar" : { | |
"aliases" : { }, | |
"mappings" : { | |
"_meta" : { | |
"created_by" : "ml-file-data-visualizer" | |
}, | |
"properties" : { | |
"@timestamp" : { | |
"type" : "date" | |
}, | |
"adjusted_price" : { | |
"type" : "double" | |
}, | |
"available" : { | |
"type" : "keyword" | |
}, | |
"date" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"listing_id" : { | |
"type" : "long" | |
}, | |
"maximum_nights" : { | |
"type" : "long" | |
}, | |
"minimum_nights" : { | |
"type" : "long" | |
}, | |
"price" : { | |
"type" : "double" | |
} | |
} | |
}, | |
"settings" : { | |
"index" : { | |
"creation_date" : "1572548275257", | |
"number_of_shards" : "1", | |
"number_of_replicas" : "1", | |
"uuid" : "jrU9jPUARlWFrB0i6YjaTQ", | |
"version" : { | |
"created" : "7040199" | |
}, | |
"provided_name" : "listings-ash-calendar" | |
} | |
} | |
}, | |
"listings-ash-review" : { | |
"aliases" : { }, | |
"mappings" : { | |
"_meta" : { | |
"created_by" : "ml-file-data-visualizer" | |
}, | |
"properties" : { | |
"@timestamp" : { | |
"type" : "date" | |
}, | |
"comments" : { | |
"type" : "text" | |
}, | |
"date" : { | |
"type" : "date", | |
"format" : "iso8601" | |
}, | |
"listing_id" : { | |
"type" : "long" | |
}, | |
"review_id" : { | |
"type" : "long" | |
}, | |
"reviewer_id" : { | |
"type" : "long" | |
}, | |
"reviewer_name" : { | |
"type" : "keyword" | |
} | |
} | |
}, | |
"settings" : { | |
"index" : { | |
"creation_date" : "1572548566924", | |
"number_of_shards" : "1", | |
"number_of_replicas" : "1", | |
"uuid" : "px9n2sDvSPmU0cIDQHmXaA", | |
"version" : { | |
"created" : "7040199" | |
}, | |
"provided_name" : "listings-ash-review" | |
} | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####### | |
# Preview transform merge of 3 indices on a common listing_id field | |
####### | |
POST _data_frame/transforms/_preview | |
{ | |
"source": { | |
"index": [ | |
"listings-calendars-ash", | |
"listings-reviews-ash", | |
"listings-ash-verbose-1" | |
] | |
}, | |
"pivot": { | |
"group_by": { | |
"listing_id": { | |
"terms": { | |
"field": "listing_id" | |
} | |
} | |
}, | |
"aggregations": { | |
"max_price": { | |
"max": { | |
"field": "price" | |
} | |
}, | |
"min_price": { | |
"min": { | |
"field": "price" | |
} | |
}, | |
"avg_price": { | |
"avg": { | |
"field": "price" | |
} | |
}, | |
"num_reviews": { | |
"cardinality": { | |
"field": "id" | |
} | |
}, | |
"num_bedrooms": { | |
"max": { | |
"field": "bedrooms" | |
} | |
}, | |
"num_beds": { | |
"max": { | |
"field": "beds" | |
} | |
}, | |
"security_deposit": { | |
"max": { | |
"field": "security_deposit" | |
} | |
}, | |
"num_bathrooms": { | |
"max": { | |
"field": "bathrooms" | |
} | |
}, | |
"accommodates": { | |
"max": { | |
"field": "accommodates" | |
} | |
}, | |
"location_geo": { | |
"geo_centroid": { | |
"field": "location_geo" | |
} | |
}, | |
"review_score_rating": { | |
"avg": { | |
"field": "review_scores_rating" | |
} | |
}, | |
"description": { | |
"scripted_metric": { | |
"init_script": "state.description = null", | |
"map_script": "state.description = params._source.description", | |
"combine_script": "return state.description", | |
"reduce_script": "for (d in states) if (d != null) return d" | |
} | |
} | |
} | |
} | |
} | |
####### | |
# Build and start transform | |
####### | |
PUT _data_frame/transforms/listings-ash-pivot | |
{ | |
"source": { | |
"index": [ | |
"listings-calendars-ash", | |
"listings-reviews-ash", | |
"listings-ash-verbose-1" | |
] | |
}, | |
"dest": { | |
"index": "listings-ash-pivot" | |
}, | |
"pivot": { | |
"group_by": { | |
"listing_id": { | |
"terms": { | |
"field": "listing_id" | |
} | |
} | |
}, | |
"aggregations": { | |
"max_price": { | |
"max": { | |
"field": "price" | |
} | |
}, | |
"min_price": { | |
"min": { | |
"field": "price" | |
} | |
}, | |
"avg_price": { | |
"avg": { | |
"field": "price" | |
} | |
}, | |
"num_reviews": { | |
"cardinality": { | |
"field": "id" | |
} | |
}, | |
"num_bedrooms": { | |
"max": { | |
"field": "bedrooms" | |
} | |
}, | |
"num_beds": { | |
"max": { | |
"field": "beds" | |
} | |
}, | |
"security_deposit": { | |
"max": { | |
"field": "security_deposit" | |
} | |
}, | |
"num_bathrooms": { | |
"max": { | |
"field": "bathrooms" | |
} | |
}, | |
"accommodates": { | |
"max": { | |
"field": "accommodates" | |
} | |
}, | |
"location_geo": { | |
"geo_centroid": { | |
"field": "location_geo" | |
} | |
}, | |
"review_score_rating": { | |
"avg": { | |
"field": "review_scores_rating" | |
} | |
}, | |
"description": { | |
"scripted_metric": { | |
"init_script": "state.description = null", | |
"map_script": "state.description = params._source.description", | |
"combine_script": "return state.description", | |
"reduce_script": "for (d in states) if (d != null) return d" | |
} | |
} | |
} | |
} | |
} | |
POST _data_frame/transforms/listings-ash-pivot/_start | |
GET _data_frame/transforms/listings-ash-pivot/_stats | |
#### | |
# Build outlier detection to weed out outliers on the pivoted data | |
#### | |
PUT _ml/data_frame/analytics/listings-ash-pivot-outliers | |
{ | |
"source": { | |
"index": [ | |
"listings-ash-pivot" | |
] | |
}, | |
"dest": { | |
"index": "listings-ash-pivot-outliers", | |
"results_field": "ml" | |
}, | |
"analysis": { | |
"outlier_detection": {} | |
}, | |
"analyzed_fields": { | |
"excludes": [ | |
"listing_id" | |
] | |
} | |
} | |
####### | |
# Add a new document with my desired Air BnB parameters given our pivoted params | |
####### | |
POST listings-ash-pivot-outliers/_doc | |
{ | |
"num_bathrooms" : 2.0, | |
"num_beds" : 4.0, | |
"review_score_rating" : 90.0, | |
"max_price" : 270.0, | |
"min_price" : 100.0, | |
"accommodates" : 4.0, | |
"num_reviews" : 20.0, | |
"num_bedrooms" : 2.0, | |
"ml": { | |
"outlier_score": 0.0 | |
} | |
} | |
###### | |
# Build regression job to predict the average price given our pivoted features | |
# dependent_variable = predicted field "avg_price" | |
###### | |
PUT _ml/data_frame/analytics/listings-ash-pivot-outliers-pred | |
{ | |
"source": { | |
"index": [ | |
"listings-ash-pivot-outliers" | |
], | |
"query": { | |
"range": { | |
"ml.outlier_score": { | |
"lt": 0.6 | |
} | |
} | |
} | |
}, | |
"dest": { | |
"index": "listings-ash-pivot-pred", | |
"results_field": "ml_regression" | |
}, | |
"analysis": { | |
"regression": { | |
"dependent_variable": "avg_price" | |
} | |
}, | |
"analyzed_fields": { | |
"includes": [ | |
"num_bathrooms", | |
"num_beds", | |
"avg_price", | |
"review_score_rating", | |
"max_price", | |
"min_price", | |
"accommodates", | |
"num_reviews", | |
"num_bedrooms" | |
] | |
} | |
} | |
GET listings-ash-pivot-pred/_search | |
{ | |
"query": { | |
"term": { | |
"ml_regression.is_training": { | |
"value": false | |
} | |
} | |
} | |
} | |
####### | |
# How accurate is our regression model? | |
####### | |
POST _ml/data_frame/_evaluate | |
{ | |
"index": "listings-ash-pivot-pred", | |
"query": { | |
"term": { | |
"ml_regression.is_training": { | |
"value": true | |
} | |
} | |
}, | |
"evaluation": { | |
"regression": { | |
"actual_field": "avg_price", | |
"predicted_field": "ml_regression.avg_price_prediction", | |
"metrics": { | |
"r_squared": {}, | |
"mean_squared_error": {} | |
} | |
} | |
} | |
} | |
####### | |
# On average how far are we off? | |
####### | |
POST /_scripts/painless/_execute | |
{ | |
"script": { | |
"source": "Math.sqrt(params.number)", | |
"params": { | |
"number": 348.4186538329173 | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Data is available here, it is a combination of the last years calendar data, listing details, and reviews: http://insideairbnb.com/get-the-data.html
I added all the data via the CSV uploader.