Last active
June 6, 2022 12:31
-
-
Save ku-kim/26669ad5f4a90f4815adc291505f4980 to your computer and use it in GitHub Desktop.
숙소 더미데이터 생성
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import random | |
import numpy as np | |
import pandas as pd | |
from scipy.stats import skewnorm | |
from faker import Faker | |
def _sample_daily_prices(count):
    """Return ``count`` daily prices drawn from a right-skewed distribution.

    Samples come from ``skewnorm`` with shape 20; non-positive samples are
    discarded and the rest are mapped to ``sample * 300000 + 3000`` rounded
    to one decimal. Because a fraction of every draw is discarded, drawing
    continues until at least ``count`` positive samples have been collected,
    so the result never comes up short regardless of ``count``.
    """
    positives = np.empty(0)
    while positives.size < count:
        # Over-draw by a fixed margin to absorb the discarded samples.
        draws = skewnorm.rvs(20, size=count - positives.size + 300)
        positives = np.concatenate((positives, draws[draws > 0]))
    return np.round(positives[:count] * 300000 + 3000, 1)


def create_rooms(room_size, host_id_size):
    """Build a DataFrame of ``room_size`` rows of dummy lodging data.

    Args:
        room_size: Number of room rows to generate. Zero or a negative value
            yields an empty DataFrame that still carries every column.
        host_id_size: Upper bound (inclusive) of the randomly assigned
            ``host_id``; ids are drawn from ``1..host_id_size``, so it must
            be at least 1.

    Returns:
        A ``pandas.DataFrame`` with one row per room and the columns listed
        in ``columns`` below.
    """
    # NOTE(review): 'header_count_capacity' looks like a typo for
    # 'headcount_capacity'. It is kept as-is because it is the emitted CSV
    # header; confirm against the consumer of the file before renaming.
    columns = ['host_id', 'title', 'description', 'address', 'lat', 'lng',
               'bathroom_count', 'bed_count', 'bedroom_count', 'header_count_capacity',
               'cleaning_fee', 'daily_price', 'lodging_tax_ratio', 'sale_ratio', 'service_fee',
               'rating_star_score', 'review_count']
    if room_size <= 0:
        return pd.DataFrame(columns=columns)

    fake = Faker('ko-KR')
    room_types = ['호텔', '민박', '모텔', '도미토리', '아파트', '멘션', '게스트하우스', '한옥']
    price_distribution = _sample_daily_prices(room_size)

    # Collect rows in a list and build the frame once; appending through
    # ``df.loc[i] = ...`` re-allocates the frame on every iteration.
    rows = []
    for row_i in range(room_size):
        # e.g. ('37.1759', '128.9889', 'T’aebaek', 'KR', 'Asia/Seoul')
        lat_lng = fake.local_latlng(country_code='KR')
        host_id = random.randint(1, host_id_size)
        # Lodging name: a fake company name followed by a room type.
        title = fake.company() + ' ' + np.random.choice(room_types)
        description = fake.text()  # lodging description
        address = fake.address()
        lat = lat_lng[0]  # latitude, e.g. 34.8825
        lng = lat_lng[1]  # longitude, e.g. 128.62667
        bathroom_count = random.randint(1, 3)
        bed_count = random.randint(1, 5)
        bedroom_count = random.randint(1, 5)
        headcount_capacity = random.randint(1, 10)  # maximum number of guests
        cleaning_fee = random.randint(10, 100) * 100  # 1000..10000 in steps of 100
        daily_price = int(price_distribution[row_i])  # skewed price, truncated to int
        lodging_tax_ratio = 10  # tax, fixed at 10 (percent)
        sale_ratio = random.randint(1, 10)  # discount rate (percent)
        service_fee = random.randint(5, 50) * 100  # 500..5000 in steps of 100
        rating_star_score = round(random.uniform(1, 5), 2)  # rating, e.g. 3.3
        review_count = random.randint(10, 500)
        rows.append([host_id, title, description, address, lat, lng,
                     bathroom_count, bed_count, bedroom_count, headcount_capacity,
                     cleaning_fee, daily_price, lodging_tax_ratio, sale_ratio, service_fee,
                     rating_star_score, review_count])
    return pd.DataFrame(rows, columns=columns)
def create_room_images(room_size, random_image_size = 100, image_size_row = 400, image_size_col = 400):
    """Build a DataFrame that assigns random image URLs to rooms.

    A pool of ``random_image_size`` image URLs is fetched first: each request
    to picsum.photos is sent with redirects disabled and the ``Location``
    header of the response is stored as the image URL. Every room id in
    ``1..room_size`` then receives between 1 and 7 URLs picked at random
    from that pool (the same URL may be picked more than once).

    Args:
        room_size: Number of rooms; room ids run from 1 to ``room_size``.
        random_image_size: Number of image URLs to fetch into the pool.
        image_size_row: First size component placed in the request URL.
        image_size_col: Second size component placed in the request URL.

    Returns:
        A ``pandas.DataFrame`` with columns ``room_id`` and ``image_url``.

    Raises:
        KeyError: If a response carries no ``Location`` header.
        IndexError: If ``room_size`` is positive but the pool is empty.
    """
    image_url = 'https://picsum.photos/' + str(image_size_row) + '/' + str(image_size_col) + '?random=1'

    room_image_list = []
    # range(random_image_size) fetches exactly the requested number of URLs;
    # the previous range(1, n) produced one fewer.
    for _ in range(random_image_size):
        # The timeout keeps a stalled connection from hanging the script.
        r = requests.get(image_url, allow_redirects=False, timeout=10)
        room_image_list.append(str(r.headers['Location']))

    # Collect rows first and build the frame once instead of growing it
    # row by row through ``df.loc``.
    rows = []
    for room_i in range(1, room_size + 1):
        for _ in range(random.randint(1, 7)):
            rows.append([room_i, random.choice(room_image_list)])
    return pd.DataFrame(rows, columns=['room_id', 'image_url'])
# Generate 5000 rooms; the host_id foreign key is assumed to range up to 2000.
df_rooms = create_rooms(room_size=5000, host_id_size=2000)

# Assign images to room ids 1..500. The image pool size and image dimensions
# are adjustable, e.g. create_room_images(500, 100, 400, 400) for a pool of
# 100 images at 400x400.
df_room_image = create_room_images(room_size=500)

# Write both tables as CSV files without the DataFrame index column.
df_rooms.to_csv('./dummy_rooms.csv', index=False)
df_room_image.to_csv('./dummy_room_images.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment