Skip to content

Instantly share code, notes, and snippets.

@dq-hustlecoding
Last active December 2, 2021 08:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dq-hustlecoding/6132f159a47db4c36f978d2ba886c80a to your computer and use it in GitHub Desktop.
Save dq-hustlecoding/6132f159a47db4c36f978d2ba886c80a to your computer and use it in GitHub Desktop.
Create a dataset group, schemas, and datasets for AWS Personalize
def create_dataset() -> None:
    """Create an AWS Personalize dataset group with its schemas and datasets.

    Steps, in order:

    1. Create a dataset group named ``dsg-<unix time>`` and print its name,
       ARN and status.
    2. Create three schemas from ``event_schema.json``, ``user_schema.json``
       and ``item_schema.json``.
    3. Poll the dataset group every 30 seconds until its status is
       ``ACTIVE``.
    4. Create the ``interactions``, ``users`` and ``items`` datasets in the
       group, each bound to the matching schema, and print their ARNs.

    The function relies on ``boto3`` and ``time`` being available at module
    level. The schema files are opened by relative path, so they are
    resolved against the current working directory.

    Raises:
        RuntimeError: If the dataset group reports status ``CREATE FAILED``
            while waiting for it to become ``ACTIVE`` (previously this case
            made the polling loop spin forever).
    """
    personalize = boto3.client('personalize', region_name='your region')

    # Dataset group names must be unique, so the creation timestamp is
    # embedded in the name to get a different name on every run.
    response = personalize.create_dataset_group(name=f'dsg-{int(time.time())}')
    dsg_arn = response['datasetGroupArn']

    description = personalize.describe_dataset_group(datasetGroupArn=dsg_arn)['datasetGroup']
    print('1. Name: ' + description['name'])
    print('1. ARN: ' + description['datasetGroupArn'])
    print('1. Status: ' + description['status'])

    # Short pause to avoid timing problems between the creation request and
    # the calls that follow.
    time.sleep(5)

    # Register each schema definition file as a schema inside Personalize.
    # Each entry is (file/name prefix, label used in the printed message).
    schema_arns = {}
    for kind, label in (('event', 'EVENT'), ('user', 'USER'), ('item', 'item')):
        with open(f'{kind}_schema.json', encoding='utf-8') as f:
            schema_json = f.read()
        schema_response = personalize.create_schema(
            name=f'{kind}_schema_{int(time.time())}',
            schema=schema_json,
        )
        schema_arns[kind] = schema_response['schemaArn']
        print(f"1-1. {label} SCHEMA :: ", schema_arns[kind])

    # Creating the dataset group takes a while, so check its status once
    # every 30 seconds and do not proceed to dataset creation until it is
    # ACTIVE.
    dsg_status = ''
    while dsg_status != 'ACTIVE':
        print("wait DSG to created... ::", dsg_status)
        time.sleep(30)
        dataset_group = personalize.describe_dataset_group(
            datasetGroupArn=dsg_arn
        )['datasetGroup']
        dsg_status = dataset_group['status']
        # CREATE FAILED is terminal: the group will never become ACTIVE, so
        # stop waiting instead of looping forever.
        if dsg_status == 'CREATE FAILED':
            reason = dataset_group.get('failureReason', 'no failure reason reported')
            raise RuntimeError(
                f'Dataset group {dsg_arn} failed to create: {reason}'
            )

    # Create one dataset per schema inside the now-active dataset group.
    # Each entry is (name/schema prefix, label used in the message, type).
    for kind, label, dataset_type in (
        ('event', 'Event', 'interactions'),
        ('user', 'User', 'users'),
        ('item', 'Item', 'items'),
    ):
        dataset_response = personalize.create_dataset(
            name=f'{kind}-{int(time.time())}',
            schemaArn=schema_arns[kind],
            datasetGroupArn=dsg_arn,
            datasetType=dataset_type,
        )
        print(f'2. {label} Dataset Arn: ' + dataset_response['datasetArn'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment