Last active
December 8, 2023 09:06
-
-
Save JacoCheung/f1aef93bded502de9c9e3d29b9cca683 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python ./din_seq3.py --model_name din_1k_seq3_v3_modify_v2 --train_keyset_num 1 --keyset_dir './data_parquet/keyset ' --batch_size 36000 --batchsize_eval 36000 --max_eval_batches 20 --gpus '0' --train_dir '/data' --start_date '20231012' --end_date '20231013' --datePath '20231107' --workspace_size_per_gpu_in_mb 1200 | |
# Standard library
import argparse
import datetime
import json
import os
import shutil
import sys
import time

# Third-party
import hugectr
import hugectr2onnx
# from mpi4py import MPI
# logger = Log(__name__).getlog()
# Command-line configuration for offline model training.
arg_parser = argparse.ArgumentParser(description="模型离线训练")
arg_parser.add_argument("--model_name", type=str, default="din_1k_seq_v1")
arg_parser.add_argument("--features_num", type=int, required=False, default=0)
arg_parser.add_argument("--train_keyset_num", type=int, default=3)
arg_parser.add_argument("--keyset_dir", type=str, required=True)
arg_parser.add_argument("--batch_size", type=int, default=36000)
arg_parser.add_argument("--batchsize_eval", type=int, default=36000)
arg_parser.add_argument("--max_eval_batches", type=int, default=5000)
arg_parser.add_argument("--lr", type=float, default=0.0001)
arg_parser.add_argument("--gpus", type=str, default="0")  # comma-separated device ids, e.g. "0,1"
arg_parser.add_argument("--num_workers", type=int, default=30)
arg_parser.add_argument("--slice", type=int, default=10)
arg_parser.add_argument("--label_name", type=str, default="label")
arg_parser.add_argument("--sparse_embedding_name", type=str, default="sparse_embedding1")
# NOTE(review): `default` is dead when required=True — kept for interface stability.
arg_parser.add_argument("--train_dir", type=str, required=True, default="/data")
arg_parser.add_argument("--json_dir", type=str, default="/json")
arg_parser.add_argument("--embedding_vec_size", type=int, default=25)
arg_parser.add_argument("--workspace_size_per_gpu_in_mb", type=int, default=10000)  # 40000
arg_parser.add_argument("--workspace_size_per_gpu_in_mb_null", type=int, default=5000)  # 8700
arg_parser.add_argument("--start_date", type=str, required=True)
arg_parser.add_argument("--end_date", type=str, required=True)
arg_parser.add_argument("--datePath", type=str, required=True)
args = arg_parser.parse_args()

# Recreate the per-model working directory under /root.
# Bug fix: the previous `os.system(f"rm -rf /root/{...}")` was vulnerable to
# shell injection through --model_name and silently ignored failures; use
# shutil/os primitives instead (no shell involved).
model_dir = f"/root/{args.model_name}"
shutil.rmtree(model_dir, ignore_errors=True)
os.makedirs(model_dir, exist_ok=True)

# Parse the GPU list, e.g. "0,1" -> [0, 1].
gpus = [int(g) for g in args.gpus.split(",")]
# Vocabulary size (cardinality) of each categorical slot, in the order the
# Parquet reader emits them; consumed by DataReaderParams below and must match
# the training data exactly.
# NOTE(review): the 110000000 / 1000000 entries presumably are hashed ID
# spaces, and the long runs of 7 / 100 / 10 look like bucketized or
# sequence-expanded features — confirm against the feature-generation
# pipeline before changing any value.
slot_size_array=[7, 10, 5, 14, 2, 32565, 195, 1, 1, 51, 20, 3, 8, 32, 9, 224, 7, 93, 3, 1, 7, 7, 110000000, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 110000000, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 110000000, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 110000000, 1, 4, 1, 402, 20770, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 
100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 
100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1000000, 1000000, 1000000, 1000000, 1000000, 1000000, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 53, 31, 7, 21, 34, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 4, 500, 1, 1, 27, 1, 1, 482]
# Training solver: single node, configured GPU set, fp32, 64-bit keys.
# repeat_dataset=False (epoch mode) is what the embedding training cache expects.
solver = hugectr.CreateSolver(
    model_name=args.model_name,
    batchsize=args.batch_size,
    batchsize_eval=args.batchsize_eval,
    max_eval_batches=args.max_eval_batches,
    lr=args.lr,
    vvgpu=[gpus],
    i64_input_key=True,
    use_mixed_precision=False,
    repeat_dataset=False,
    use_cuda_graph=True,
)

# Parquet data reader; the keyset directory drives which embedding keys are
# staged per training pass.
reader = hugectr.DataReaderParams(
    data_reader_type=hugectr.DataReaderType_t.Parquet,
    source=["./data_parquet/file_list.txt"],
    keyset=["./data_parquet/keyset"],
    eval_source="./data_parquet/file_list_test.txt",
    num_workers=args.num_workers,
    slot_size_array=slot_size_array,
    check_type=hugectr.Check_t.Sum,
)
# Adam optimizer with globally-synchronized updates.
optimizer = hugectr.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam,
    update_type=hugectr.Update_t.Global,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-9,
)

# Embedding training cache (ETC): one staged parameter server per sparse
# embedding table. Must match the number of SparseEmbedding layers added to
# the model below. (A stale commented-out 2-table variant was removed.)
NUM_SPARSE_TABLES = 4
etc = hugectr.CreateETC(
    ps_types=[hugectr.TrainPSType_t.Staged] * NUM_SPARSE_TABLES,
    sparse_models=[
        f"/root/{args.model_name}/sparse_file{i}" for i in range(NUM_SPARSE_TABLES)
    ],
    local_paths=["/root/"],
)

model = hugectr.Model(solver, reader, optimizer, etc)
# Input layer: one binary label, no dense features, four sparse feature groups.
# Slot counts per group: data0=910, seq=3, cate=3, data_other=90
# (these must agree with the Reshape leading_dim values below).
sparse_param_array = [
    hugectr.DataReaderSparseParam("data0", 1, True, 910),
    hugectr.DataReaderSparseParam("seq", 1, False, 3),
    hugectr.DataReaderSparseParam("cate", 1, False, 3),
    hugectr.DataReaderSparseParam("data_other", 1, False, 90),
]
model.add(
    hugectr.Input(
        label_dim=1,
        label_name=args.label_name,
        dense_dim=0,
        dense_name="dense",
        data_reader_sparse_param_array=sparse_param_array,
    )
)
# One distributed hash-table embedding per sparse input group.
# Spec: (embedding output name, bottom input name, workspace size in MB).
# data0 keeps its original hard-coded 20000 MB budget; the other three use
# the --workspace_size_per_gpu_in_mb argument, as before.
_embedding_specs = [
    ("sparse_embedding_name0", "data0", 20000),
    ("sparse_embedding_seq", "seq", args.workspace_size_per_gpu_in_mb),
    ("sparse_embedding_cate", "cate", args.workspace_size_per_gpu_in_mb),
    ("sparse_embedding_data2", "data_other", args.workspace_size_per_gpu_in_mb),
]
for _top_name, _bottom_name, _workspace_mb in _embedding_specs:
    model.add(
        hugectr.SparseEmbedding(
            embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
            workspace_size_per_gpu_in_mb=_workspace_mb,
            embedding_vec_size=args.embedding_vec_size,
            combiner="sum",
            sparse_embedding_name=_top_name,
            bottom_name=_bottom_name,
            optimizer=optimizer,
        )
    )
# Flatten each embedding output to 2-D: leading_dim = num_slots * vec_size.
# Spec: (bottom embedding name, reshape output name, slot count).
_reshape_specs = [
    ("sparse_embedding_name0", "reshape0", 910),
    ("sparse_embedding_seq", "reshape1", 3),
    ("sparse_embedding_cate", "reshape2", 3),
    ("sparse_embedding_data2", "reshape3", 90),
]
for _bottom, _top, _num_slots in _reshape_specs:
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Reshape,
            bottom_names=[_bottom],
            top_names=[_top],
            leading_dim=_num_slots * args.embedding_vec_size,
        )
    )
# Concatenate all flattened embeddings into one feature vector.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Concat,
        bottom_names=[
            "reshape0",
            "reshape1",
            "reshape2",
            "reshape3",
        ],
        top_names=["concat"],
    )
)
# Single-logit output head.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["concat"],
        top_names=["fc3"],
        num_output=1,
    )
)
# Sigmoid cross-entropy loss against the label tensor.
# Bug fix: the label tensor name was hard-coded as "label"; it must track the
# configurable --label_name used by the Input layer, otherwise any non-default
# --label_name breaks the graph.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
        bottom_names=["fc3", args.label_name],
        top_names=["loss"],
    )
)
# Build, inspect, export, and train.
# NOTE(review): model_reader is unused by training; kept because it was in the
# original (presumably for interactive data-reading debugging — a large
# commented-out benchmark loop that used it was removed as dead code).
model_reader = model.get_data_reader_train()
model.compile()
model.summary()
# Dump the network topology JSON (needed later for hugectr2onnx conversion
# and inference deployment).
model.graph_to_json(graph_config_file=f"/root/{args.model_name}.json")
# One full pass over the dataset; evaluate every 10000 iterations.
model.fit(num_epochs=1, display=50, eval_interval=10000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment