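# DIN-style sequence model (sequence length 3) trained with HugeCTR's Embedding
# Training Cache (ETC). Example invocation: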
# python ./din_seq3.py --model_name din_1k_seq3_v3_modify_v2 --train_keyset_num 1 --keyset_dir './data_parquet/keyset' --batch_size 36000 --batchsize_eval 36000 --max_eval_batches 20 --gpus '0' --train_dir '/data' --start_date '20231012' --end_date '20231013' --datePath '20231107' --workspace_size_per_gpu_in_mb 1200
import hugectr
# from mpi4py import MPI
import time
import os
import datetime
import hugectr2onnx
import json
import sys
# logger = Log(__name__).getlog()
import argparse
arg_parser = argparse.ArgumentParser(description="Offline model training")
arg_parser.add_argument("--model_name", type=str, default="din_1k_seq_v1")
arg_parser.add_argument("--features_num", type=int, required=False,default=0)
arg_parser.add_argument("--train_keyset_num", type=int, default=3)
arg_parser.add_argument("--keyset_dir", type=str, required=True)
arg_parser.add_argument('--batch_size', type=int, default=36000)
arg_parser.add_argument('--batchsize_eval', type=int, default=36000)
arg_parser.add_argument('--max_eval_batches', type=int, default=5000)
arg_parser.add_argument('--lr', type=float, default=0.0001)
arg_parser.add_argument('--gpus', type=str, default='0')
arg_parser.add_argument('--num_workers', type=int, default=30)
arg_parser.add_argument('--slice', type=int, default=10)
arg_parser.add_argument('--label_name', type=str, default='label')
arg_parser.add_argument('--sparse_embedding_name', type=str, default='sparse_embedding1')
arg_parser.add_argument("--train_dir", type=str, required=True, default='/data')
arg_parser.add_argument('--json_dir', type=str, default='/json')
arg_parser.add_argument('--embedding_vec_size', type=int, default=25)
arg_parser.add_argument('--workspace_size_per_gpu_in_mb', type=int, default=10000) # 40000
arg_parser.add_argument('--workspace_size_per_gpu_in_mb_null', type=int, default=5000) # 8700
arg_parser.add_argument('--start_date', type=str, required=True)
arg_parser.add_argument('--end_date', type=str, required=True)
arg_parser.add_argument('--datePath', type=str, required=True)
args = arg_parser.parse_args()
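# Start each run from a clean per-model working directory; the ETC sparse model
# files are written under /root/<model_name> below.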
os.system(f"rm -rf /root/{args.model_name}")
os.system(f"mkdir /root/{args.model_name}")
gpus = list(map(int, args.gpus.split(',')))
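# Cardinality of each categorical slot, in slot order across all four sparse inputs.
# The Parquet reader uses these sizes to offset per-slot keys into a single global
# embedding index space.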
slot_size_array=[7, 10, 5, 14, 2, 32565, 195, 1, 1, 51, 20, 3, 8, 32, 9, 224, 7, 93, 3, 1, 7, 7, 110000000, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 110000000, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 110000000, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 110000000, 1, 4, 1, 402, 20770, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 
100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1000000, 1000000, 1000000, 1000000, 1000000, 1000000, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 53, 31, 7, 21, 34, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 4, 500, 1, 1, 27, 1, 1, 482]
solver = hugectr.CreateSolver(model_name = args.model_name,
                              max_eval_batches = args.max_eval_batches,
                              batchsize_eval = args.batchsize_eval,
                              batchsize = args.batch_size,
                              lr = args.lr,
                              vvgpu = [gpus],
                              i64_input_key = True,
                              use_mixed_precision = False,
                              repeat_dataset = False,
                              use_cuda_graph = True)
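# Parquet reader; the keyset files enumerate the embedding keys of each training
# pass so the ETC can stage just those rows into GPU memory.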
reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                  source = ["./data_parquet/file_list.txt"],
                                  keyset = ["./data_parquet/keyset"],
                                  eval_source = "./data_parquet/file_list_test.txt",
                                  num_workers = args.num_workers,
                                  slot_size_array = slot_size_array,
                                  check_type = hugectr.Check_t.Sum)
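# Adam optimizer; also passed to each SparseEmbedding below so the embedding
# tables are trained with the same settings as the dense layers.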
optimizer = hugectr.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam,
    update_type=hugectr.Update_t.Global,
    beta1=0.9,
    beta2=0.999,
    epsilon=0.000000001,
)
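# Embedding Training Cache: one Staged parameter server per embedding table
# (four tables are defined below), with host staging under local_paths.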
etc = hugectr.CreateETC(ps_types = [hugectr.TrainPSType_t.Staged for _ in range(4)],
                        sparse_models = [f"/root/{args.model_name}/sparse_file{i}" for i in range(4)],
                        local_paths = ["/root/"])
# etc = hugectr.CreateETC(ps_types = [hugectr.TrainPSType_t.Staged for _ in range(2)],
#                         sparse_models = [f"/root/{args.model_name}/sparse_file{i}" for i in range(2)],
#                         local_paths = ["/root/"])
model = hugectr.Model(solver, reader, optimizer, etc)
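# One binary label, no dense features, and four sparse inputs:
# data0 (910 slots), seq (3 slots), cate (3 slots), data_other (90 slots).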
model.add(hugectr.Input(label_dim = 1, label_name = args.label_name,
                        dense_dim = 0, dense_name = "dense",
                        data_reader_sparse_param_array =
                        [hugectr.DataReaderSparseParam("data0", 1, True, 910),
                         hugectr.DataReaderSparseParam("seq", 1, False, 3),
                         hugectr.DataReaderSparseParam("cate", 1, False, 3),
                         hugectr.DataReaderSparseParam("data_other", 1, False, 90)]))
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=20000,
        embedding_vec_size=args.embedding_vec_size,
        combiner="sum",
        sparse_embedding_name="sparse_embedding_name0",
        bottom_name="data0",
        optimizer=optimizer,
    )
)
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=args.workspace_size_per_gpu_in_mb,
        embedding_vec_size=args.embedding_vec_size,
        combiner="sum",
        sparse_embedding_name="sparse_embedding_seq",
        bottom_name="seq",
        optimizer=optimizer,
    )
)
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=args.workspace_size_per_gpu_in_mb,
        embedding_vec_size=args.embedding_vec_size,
        combiner="sum",
        sparse_embedding_name="sparse_embedding_cate",
        bottom_name="cate",
        optimizer=optimizer,
    )
)
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=args.workspace_size_per_gpu_in_mb,
        embedding_vec_size=args.embedding_vec_size,
        combiner="sum",
        sparse_embedding_name="sparse_embedding_data2",
        bottom_name="data_other",
        optimizer=optimizer,
    )
)
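# Flatten each (batch, slots, vec_size) embedding output to (batch, slots * vec_size)
# so the four branches can be concatenated.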
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Reshape,
        bottom_names=["sparse_embedding_name0"],
        top_names=["reshape0"],
        leading_dim=910 * args.embedding_vec_size,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Reshape,
        bottom_names=["sparse_embedding_seq"],
        top_names=["reshape1"],
        leading_dim=3 * args.embedding_vec_size,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Reshape,
        bottom_names=["sparse_embedding_cate"],
        top_names=["reshape2"],
        leading_dim=3 * args.embedding_vec_size,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Reshape,
        bottom_names=["sparse_embedding_data2"],
        top_names=["reshape3"],
        leading_dim=90 * args.embedding_vec_size,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Concat,
        bottom_names=["reshape0", "reshape1", "reshape2", "reshape3"],
        top_names=["concat"],
    )
)
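# A single fully connected unit produces the logit for the binary cross-entropy loss.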
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["concat"],
        top_names=["fc3"],
        num_output=1,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
        bottom_names=["fc3", args.label_name],
        top_names=["loss"],
    )
)
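# Optional data-reader throughput check (the loop below is kept commented out):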
model_reader = model.get_data_reader_train()
# iters = 1000
# print("start reading")
# t_start = time.time()
# model.start_data_reading()
# for i in range(iters):
#     start = time.time()
#     batchsize = model_reader.read_a_batch_to_device()
#     # lens = model.check_out_tensor("length0", hugectr.Tensor_t.Train)
#     print("batchsize is ", batchsize)
#     if batchsize == 0:
#         break
#     end = time.time()
model.compile()
model.summary()
model.graph_to_json(graph_config_file = "/root/" + args.model_name + ".json")
model.fit(num_epochs = 1, display = 50, eval_interval = 10000)
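# `hugectr2onnx` is imported above but unused. A minimal sketch of an ONNX export,
# assuming the dense model file has first been dumped; the prefix and iteration
# suffix below are hypothetical and depend on how/when the dump is done.
# model.save_params_to_files("/root/" + args.model_name + "/", 0)
# hugectr2onnx.converter.convert(
#     onnx_model_path="/root/" + args.model_name + ".onnx",
#     graph_config="/root/" + args.model_name + ".json",
#     dense_model="/root/" + args.model_name + "/_dense_0.model",
#     convert_embedding=False,
# )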