Skip to content

Instantly share code, notes, and snippets.

@legatoo
Created February 13, 2019 04:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save legatoo/3f7e91a9c79d9171401c481a957b2d8f to your computer and use it in GitHub Desktop.
Save legatoo/3f7e91a9c79d9171401c481a957b2d8f to your computer and use it in GitHub Desktop.
#!/bin/bash
####################################################################################################################
#
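# Description:
#   Use this script to launch your program, for both the standalone and the distributed version.
#   The script assumes the project has a main entry point named run.py.
#   The command-line options decide whether the local or the AFO version is executed.
#   Configuration is loaded from the conf directory.
#   * local mode (default): the runtime parameters are collected from conf/local_settings and from
#     the selected file in conf/hyper_params, then passed to run.py.
#   * afo mode: conf/afo_settings.xml is loaded, the hyper parameters from the selected file in
#     conf/hyper_params are added to it, and the result is handed to AFO for execution.
#   * (Note) Configuration files may contain comments; a comment line starts with '#' in the first column.
#
# Usage:
#   bash run.sh [--mode local|afo] [--hparam FILE] [--tag TAG]
#   (the script relies on bash features, so run it with bash rather than a plain POSIX sh)
#
# Options:
#   mode    Optional, defaults to local. Either local (standalone) or afo (distributed).
#   hparam  Name of a file under conf/hyper_params. To make hyper-parameter tuning easier, each round's
#           hyper parameters live in their own file and the file name is passed in for the script to load.
#           Intended to be passed on every run; the script itself does not enforce this.
#   tag     Optional. To make it easy to compare several training rounds (for example as multiple curves
#           in tensorboard), the results of each round are kept under the user's checkpoint directory
#           in a sub-directory named after the tag. Defaults to the current time in yyyyMMddHHmm format;
#           it can also be set explicitly with this option.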
#
####################################################################################################################
# Parse the script options.
# Defaults: local mode, no hyper-param file, tag derived from the current time.
MODE="local"
HYPER_PARAM=""
TAG_ID=$(/bin/date +"%Y%m%d%H%M")
TIME_STAMP=$(/bin/date +"%Y%m%d%H%M%S")

#######################################
# Parse command-line options into the global settings.
# Globals:   MODE, HYPER_PARAM, TAG_ID (written)
# Arguments: the script's command-line arguments
# Outputs:   an error message on stderr for invalid input
# Exits:     3 on an unknown option, an option without a value,
#            or a mode other than local/afo
#######################################
parse_options() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --mode | --hparam | --tag)
        # Every supported option takes exactly one value; without this check
        # a trailing "--mode" would silently set an empty value.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value. Supported options are [--mode local|afo] [--hparam] [--tag]" >&2
          exit 3
        fi
        case "$1" in
          --mode) MODE="$2" ;;
          --hparam) HYPER_PARAM="$2" ;;
          --tag) TAG_ID="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown option $1. Supported options are [--mode local|afo] [--hparam] [--tag]" >&2
        exit 3
        ;;
    esac
  done

  # Reject unsupported modes up front instead of letting the script
  # fall through every branch and finish without doing anything.
  case "$MODE" in
    local | afo) ;;
    *)
      echo "Unsupported mode [$MODE]. Supported modes are local and afo." >&2
      exit 3
      ;;
  esac
}

parse_options "$@"
echo "Running in [$MODE] mode with hyper param file [$HYPER_PARAM] and tag [${TAG_ID}] by [$MIS_ID]"
# Configuration files may contain comments; a comment must start with '#'
# in the very first column.
#######################################
# Tell whether the given text is a comment line.
# Arguments: $1 - text to inspect
# Outputs:   "skip annotation <text>" on stdout when it is a comment
# Returns:   0 when $1 begins with '#', 1 otherwise
#######################################
skip_notation() {
  case "$1" in
    '#'*)
      echo "skip annotation $1"
      return 0
      ;;
  esac
  return 1
}
# local_settings和hyper_param的参数使用 name=value的格式,一行一个
function format_check(){
if [[ ! $1 =~ .*=.* ]];then
echo "illegal param format $1"
echo "param has to defined in format name=value. one param one line."
return 1
fi
#true
return 0
}
#######################################
# Join elements with a separator and store the result in a variable.
# The separator is inserted literally between elements: it is appended by
# plain string concatenation, so characters such as '&' or '\' are never
# interpreted as pattern-substitution replacement syntax.
# Arguments: $1    - name of the variable that receives the joined string
#            $2    - separator
#            $3... - elements to join
# Returns:   status of the final assignment (non-zero if $1 is not a valid
#            variable name)
#######################################
join() {
  # Prefixed local names keep the function from shadowing the caller's
  # output variable (e.g. a caller asking for the result in "sep" or "ret").
  local _join_out=$1 _join_sep=$2 _join_acc=$3 _join_item
  # With fewer than three arguments "shift 3" fails; drop whatever is there.
  shift 3 || shift "$#"
  for _join_item in "$@"; do
    _join_acc+="${_join_sep}${_join_item}"
  done
  printf -v "$_join_out" '%s' "$_join_acc"
}
#######################################
# Run the project locally: collect name=value parameters from
# conf/local_settings and from the selected hyper-param file, turn each into
# a --name=value argument and pass them all to run.py.
# Blank lines and lines starting with '#' are ignored in both files.
# The first "@run_id@" placeholder in a local_settings line is replaced
# with the tag.
# Globals:   HYPER_PARAM, TAG_ID (read)
# Outputs:   the collected parameters on stdout; a warning on stderr when
#            the requested hyper-param file does not exist
# Returns:   exit status of "python run.py"
# Exits:     3 when a line is not in name=value format
#######################################
run_local() {
  local -a params=("--script_mode=local")
  local line
  local hparam_file="conf/hyper_params/${HYPER_PARAM}"

  while read -r line || [[ -n "$line" ]]; do
    # A blank line carries no parameter.
    if [[ -z "$line" ]]; then
      continue
    fi
    # Skip comments. The line is quoted so that the whole line, not just
    # its first word, is inspected and no glob expansion can occur.
    if skip_notation "$line"; then
      continue
    fi
    # Enforce the name=value format.
    if ! format_check "$line"; then
      exit 3
    fi
    # Quoting the replacement keeps characters in the tag literal.
    line=${line/@run_id@/"$TAG_ID"}
    params+=("--$line")
  done < conf/local_settings

  # -f (rather than -e) so that an empty HYPER_PARAM does not match the
  # conf/hyper_params directory itself.
  if [[ -n "$HYPER_PARAM" && -f "$hparam_file" ]]; then
    while read -r line || [[ -n "$line" ]]; do
      if [[ -z "$line" ]]; then
        continue
      fi
      if skip_notation "$line"; then
        continue
      fi
      if ! format_check "$line"; then
        exit 3
      fi
      params+=("--$line")
    done < "$hparam_file"
  elif [[ -n "$HYPER_PARAM" ]]; then
    echo "warning: hyper param file $hparam_file not found; no hyper params added." >&2
  fi

  echo "collected ${#params[@]} params."
  printf '%s\n' "${params[@]}"
  python run.py "${params[@]}"
}
#######################################
# Prepare conf/afo_settings.xml for an AFO run: build one <property> element
# per hyper parameter (plus args.script_mode=afo), wrap them between two
# <!--TIME_STAMP--> marker comments and insert them immediately before
# </configuration>.
# Blank lines and lines starting with '#' in the hyper-param file are
# ignored. A line is split at its first '=', so values may themselves contain
# '=', '/', '&' or '\'. Values are inserted as written (no XML escaping).
# Globals:   HYPER_PARAM, TIME_STAMP (read)
# Outputs:   the inserted lines on stdout; warnings/errors on stderr
# Returns:   0 on success, 1 when the settings file could not be rewritten
# Exits:     3 when a hyper-param line is not in name=value format
#######################################
run_afo() {
  local -a params=()
  local line name value xml_line prefix suffix modified
  local inserted=0
  local closing='</configuration>'
  local settings=conf/afo_settings.xml
  local hparam_file="conf/hyper_params/${HYPER_PARAM}"

  params+=("<!--${TIME_STAMP}-->")
  params+=("<property><name>args.script_mode</name><value>afo</value></property>")
  # -f (rather than -e) so that an empty HYPER_PARAM does not match the
  # conf/hyper_params directory itself.
  if [[ -n "$HYPER_PARAM" && -f "$hparam_file" ]]; then
    while read -r line || [[ -n "$line" ]]; do
      if [[ -z "$line" ]] || skip_notation "$line"; then
        continue
      fi
      if ! format_check "$line"; then
        exit 3
      fi
      # Split at the first '=' only, so the value keeps any later '='.
      name=${line%%=*}
      value=${line#*=}
      # Build one XML property per parameter.
      params+=("<property><name>args.${name}</name><value>${value}</value></property>")
    done < "$hparam_file"
  elif [[ -n "$HYPER_PARAM" ]]; then
    echo "warning: hyper param file $hparam_file not found; no hyper params added." >&2
  fi
  params+=("<!--${TIME_STAMP}-->")

  # Write the new content to a private temp file first; the settings file is
  # only overwritten once the complete content has been produced.
  modified=$(mktemp "${TMPDIR:-/tmp}/afo-settings.xml.XXXXXX") || {
    echo "Unable to create a temporary file for $settings." >&2
    return 1
  }

  # Copy the settings line by line and put the params in front of the closing
  # tag. Plain string handling keeps every parameter character literal.
  while IFS= read -r xml_line || [[ -n "$xml_line" ]]; do
    if [[ "$xml_line" == *"$closing"* ]]; then
      prefix=${xml_line%%"$closing"*}
      suffix=${xml_line#*"$closing"}
      printf '%s' "$prefix"
      printf '%s\n' "${params[@]}"
      printf '%s%s\n' "$closing" "$suffix"
      inserted=1
    else
      printf '%s\n' "$xml_line"
    fi
  done < "$settings" > "$modified" || {
    echo "Unable to read $settings." >&2
    rm -f -- "$modified"
    return 1
  }

  if (( ! inserted )); then
    echo "warning: no $closing tag found in $settings; params were not inserted." >&2
  fi

  # cat (not mv) keeps the original file's inode and permissions.
  if ! cat "$modified" > "$settings"; then
    echo "Unable to update $settings." >&2
    rm -f -- "$modified"
    return 1
  fi
  rm -f -- "$modified"

  echo "Params about to pass to AFO:"
  printf '%s\n' "${params[@]}" "$closing"
  #/opt/meituan/tensorflow-release/bin/tensorflow-submit.sh -conf ctr_hw_submit.xml -files ctr_main.py,ctr_estimator.py,run_setting.py,ctr_dataset.py,analysis.txt
}
#######################################
# Restore conf/afo_settings.xml by deleting every line from the first
# <!--TIME_STAMP--> marker through the next one (both inclusive).
# Does nothing when the settings file does not exist, so calling it from
# local mode never creates an empty settings file.
# Globals:   TIME_STAMP (read)
# Outputs:   error messages on stderr
# Returns:   0 on success or when there is nothing to restore, 1 on failure
#######################################
recovery_afo_settings() {
  local settings=conf/afo_settings.xml
  local recovery

  if [[ ! -f "$settings" ]]; then
    return 0
  fi

  recovery=$(mktemp "${TMPDIR:-/tmp}/afo-settings.xml.recovery.XXXXXX") || {
    echo "Unable to create a temporary file for $settings." >&2
    return 1
  }

  # Only overwrite the settings once sed has produced the cleaned content;
  # otherwise a failed sed would leave the file truncated.
  if ! sed "/<!--${TIME_STAMP}-->/,/<!--${TIME_STAMP}-->/d" "$settings" > "$recovery"; then
    echo "Unable to strip the generated params from $settings." >&2
    rm -f -- "$recovery"
    return 1
  fi

  # cat (not mv) keeps the original file's inode and permissions.
  if ! cat "$recovery" > "$settings"; then
    echo "Unable to restore $settings." >&2
    rm -f -- "$recovery"
    return 1
  fi
  rm -f -- "$recovery"
}
#######################################
# Interrupt handler: restore the AFO settings, say goodbye and terminate.
# Outputs:   whatever recovery_afo_settings prints, then "bye." on stdout
# Exits:     1, always
#######################################
finish() {
  recovery_afo_settings
  printf '%s\n' "bye."
  exit 1
}
# Catch the user's Ctrl-C so the settings file can be restored.
trap finish SIGINT

#######################################
# Dispatch to the runner selected by MODE after checking that the
# configuration file the mode needs is present.
# In afo mode the settings file is restored after the run.
# Any other MODE value is ignored here.
# Globals:   MODE (read)
# Outputs:   an error line on stdout when a configuration file is missing
# Returns:   the runner's exit status, so a failing run is not reported as
#            success (in afo mode, 1 if only the restore step failed)
# Exits:     3 when the required configuration file is missing
#######################################
main() {
  local rc=0
  case "$MODE" in
    local)
      if [[ ! -e conf/local_settings ]]; then
        echo "Unable to find file local_settings under conf directory. "
        exit 3
      fi
      run_local || rc=$?
      ;;
    afo)
      if [[ ! -e conf/afo_settings.xml ]]; then
        echo "Unable to find file afo_settings.xml under conf directory. "
        exit 3
      fi
      run_afo || rc=$?
      # Always restore the settings, even when the run itself failed.
      if ! recovery_afo_settings && (( rc == 0 )); then
        rc=1
      fi
      ;;
  esac
  return "$rc"
}

main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment