Created
June 6, 2022 11:17
-
-
Save ku-kim/d3bc8b4b4351dd105e48736bf0a3bded to your computer and use it in GitHub Desktop.
Airbnb - Dummy Data : rooms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from faker import Faker
from scipy.stats import skewnorm
# Shared Faker generator configured with the Korean locale; it is used for
# the generated company names, descriptions, addresses and coordinates.
fake = Faker('ko-KR')
# Empty frame that defines the column layout (and CSV header) of the rooms
# dummy data. Rows are appended positionally, so the order here must match
# the order of the values assigned per row.
# NOTE(review): 'header_count_capacity' looks like a typo for
# 'headcount_capacity'; it is kept as-is because it ends up in the CSV header
# and downstream consumers may depend on it -- confirm before renaming.
df_rooms = pd.DataFrame(columns=[
    'host_id', 'title', 'description', 'address', 'lat', 'lng',
    'bathroom_count', 'bed_count', 'bedroom_count', 'header_count_capacity',
    'cleaning_fee', 'daily_price', 'lodging_tax_ratio', 'sale_ratio', 'service_fee',
    'rating_star_score', 'review_count',
])
def create_rooms(df_rooms: pd.DataFrame, room_size: int, host_id_size: int) -> pd.DataFrame:
    """Fill ``df_rooms`` with ``room_size`` rows of random dummy room data.

    Rows are written in place at index labels ``0 .. room_size - 1`` and the
    same frame is returned. Each row is a 17-element list, so ``df_rooms``
    must have 17 columns in the order used by the assignment at the end of
    the loop.

    Args:
        df_rooms: Frame that receives the generated rows (mutated in place).
        room_size: Number of rooms to generate.
        host_id_size: Upper bound (inclusive) for the random ``host_id``;
            ids are drawn from ``1 .. host_id_size``.

    Returns:
        The same ``df_rooms`` object that was passed in.

    Raises:
        ValueError: From ``random.randint`` if ``host_id_size`` is less
            than 1 while ``room_size`` is positive.
    """
    room_types = ['호텔', '민박', '모텔', '도미토리', '아파트', '멘션', '게스트하우스', '한옥']

    # Prices follow a right-skewed distribution. Only positive samples are
    # kept, so keep drawing until there are enough of them; a fixed surplus
    # of 300 draws is not sufficient once room_size gets large, which would
    # otherwise raise IndexError when indexing the prices below.
    positive_samples = np.empty(0)
    while positive_samples.size < room_size:
        draws = skewnorm.rvs(20, size=room_size + 300)
        positive_samples = np.concatenate((positive_samples, draws[draws > 0]))
    price_distribution = np.round((positive_samples * 300000) + 3000, 1)

    for row_i in range(room_size):
        # e.g. ('37.1759', '128.9889', 'T’aebaek', 'KR', 'Asia/Seoul')
        lat_lng = fake.local_latlng(country_code='KR')

        host_id = random.randint(1, host_id_size)
        title = fake.company() + ' ' + np.random.choice(room_types)  # room name: company name + room type
        description = fake.text()                     # free-text room description
        address = fake.address()                      # postal address
        lat = lat_lng[0]                              # latitude, e.g. 34.8825
        lng = lat_lng[1]                              # longitude, e.g. 128.62667
        bathroom_count = random.randint(1, 3)         # number of bathrooms, e.g. 3
        bed_count = random.randint(1, 5)              # number of beds, e.g. 2
        bedroom_count = random.randint(1, 5)          # number of bedrooms, e.g. 2
        headcount_capacity = random.randint(1, 10)    # maximum number of guests, e.g. 8
        cleaning_fee = random.randint(10, 100) * 100  # cleaning fee, e.g. 5000
        daily_price = int(price_distribution[row_i])  # nightly price, right-skewed, at least 3000
        lodging_tax_ratio = 10                        # lodging tax in percent (fixed 10)
        sale_ratio = random.randint(1, 10)            # discount rate in percent, e.g. 10
        service_fee = random.randint(5, 50) * 100     # service fee, e.g. 5000
        rating_star_score = round(random.uniform(1, 5), 2)  # star rating, e.g. 3.3
        review_count = random.randint(10, 500)        # number of reviews

        # Positional assignment: the order must match the frame's columns.
        df_rooms.loc[row_i] = [
            host_id, title, description, address, lat, lng,
            bathroom_count, bed_count, bedroom_count, headcount_capacity,
            cleaning_fee, daily_price, lodging_tax_ratio, sale_ratio, service_fee,
            rating_star_score, review_count,
        ]
    return df_rooms
# Generate 5000 rooms spread over host ids 1..2000 and write them to a CSV
# file (without the index column) in the current working directory.
df_rooms = create_rooms(df_rooms, 5000, 2000)
df_rooms.to_csv('./rooms_plus.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment