# lunik1/tZq_bdt_example.yaml

## tZq_bdt_example.yaml
# Directory containing the input files for the classifier.
# The directory is expected to contain one ROOT file per process, named
# histofile_$PROCESS.root. Each file should contain a TTree named
# Ttree_$PROCESS. Additional trees named following the schema
# Ttree_$PROCESS__$SYSTEMATIC__(plus/minus) should be included for each
# relevant shape nuisance. Every tree should contain one branch per
# observable and a branch containing the event weights.
input_dir: /scratch/data/TopPhysics/mvaDirs/inputs/2016/all/mz50mw50/

# Seed for random number generation
seed: 52

# ROOT selection string specifying the cuts that should be made before
# classifier training takes place.
# The folded block scalar (>-) joins the lines below with single spaces and
# strips the trailing newline, so the value is one single-line expression.
selection: >-
    zMass > 71.2 && zMass < 111.2 &&
    wPairMass > 60.4 && wPairMass < 100.4 &&
    chi2 < 40 &&
    Channel == 1

# Name of the channel. Only used to name the output; it does not apply any
# selection.
channel: ee

# List of channels to be treated as signal
signals:
    - tZq

# List of channels to be treated as background
backgrounds:
    - DYToLL_M10To50_aMCatNLO
    - DYToLL_M50_aMCatNLO
    - FakeEG
    - FakeMu
    - TbartChan
    - TbartW
    - THQ
    - TsChan
    - TT
    - TtChan
    - ttH
    - TTW
    - TtW
    - TTZ
    - TWZ
    - Wjets
    - WW
    - WWW
    - WWZ
    - WZ
    - WZZ
    - ZZ
    - ZZZ

# Name of the process containing the collision data.
# The location of the real data needs to be known when the combine/THETA
# output is generated.
data_process: DataEG

# Output directories for plots, ROOT files, and trained classifiers
plot_dir: plots/
root_dir: root/
mva_dir: mva/

# Fraction of the data to be reserved for the test sample
test_fraction: 0.25

# If true, the weights of the signal channels are linearly scaled so that the
# overall normalisation of the signal and background channels is the same
equalise_signal: true

# How negative event weights should be treated. One of:
#   passthrough: negative weights are left unaltered
#   abs: the absolute value of all negative weights is taken
#   reweight: the absolute value of all negative weights is taken. The original
#             normalisation of each process is then restored by linearly
#             scaling the resulting weights down. This will fail if any
#             process has an overall negative weight.
#   zero: negative weights are set to 0
negative_weight_treatment: passthrough

# Classifier selection. One of:
#   bdt_grad: Gradient Boosted Decision Tree (scikit-learn)
#   bdt_xgb: Gradient Boosted Decision Tree (XGBoost)
#   bdt_lgbm: Gradient Boosted Decision Tree (LightGBM)
#   random_forest: Random Forest
#   mlp: Multi-Layer Perceptron (Keras)
#   load: load the classifier specified by the classifer_path option
# NOTE(review): "classifer_path" may be a misspelling of "classifier_path" --
# confirm against the option name the loader actually reads.
classifier: bdt_grad

# Configuration used when the bdt_grad classifier is selected. These options
# are passed to scikit-learn's GradientBoostingClassifier(); see the
# scikit-learn documentation for more information.
bdt_grad:
    n_estimators: 100
    verbose: 1
    # A float here is a fraction of the samples rather than an absolute count
    # (per the scikit-learn documentation)
    min_samples_split: 0.1
    subsample: 0.75
    learning_rate: 0.02
    max_depth: 5

# Options governing the ROOT file output
root_out:
    # Whether output should be in the format for combine (true) or THETA (false)
    combine: true

    # What form the (pseudo-)data in the files should take
    #   empty: empty histograms
    #   poisson: sum the Monte Carlo histograms, and perform a Poisson jump on
    #            each bin
    #   real: use the real data
    data: empty

    # The strategy used to bin the MVA response in the resulting ROOT files
    #   equal: specified number of equal-width bins in the (0, 1) range
    #          (default).
    #   quantile: specified number of equally-populated bins, achieved by
    #             placing bin edges at quantiles. Bin population does not take
    #             event weight into account.
    #   recursive_median: response is recursively bisected at the median
    strategy: equal

    # Number of bins for the equal or quantile binning strategies
    bins: 20

    # The recursive binning strategies will stop splitting once these limits
    # are reached
    min_signal_events: 0
    min_background_events: 1
    max_signal_error: 0.3
    max_background_error: 0.3

# Features to be included in the classifier training
features:
    - bTagDisc
    - fourthJetPt
    - jetMass
    - jjdelR
    - leadJetEta