ronggong/boundaryPatternClassification.sh

## boundaryPatternClassification.sh
#!/bin/sh
# SGE array-job script: every task of the array runs svm_cv.py once and hands
# it the task index, which svm_cv.py uses to select one parameter setting.
# Lines starting with "#$" are scheduler directives and are kept verbatim.
#
# Name the process
# ----------------
#$ -N compIden_$JOB_ID
#
# Call from the current working directory; no need to cd; using default.q configuration, HPC will distribute the best choice of the nodes
# NOTE(review): the "-q default.q" line below has no "$" after "#", so it looks like a plain comment rather than an active directive - confirm this is intended
# ------------------------------------------------------
#$ -cwd
# -q default.q
#
# Max time limits, s_rt soft running time, h_rt hard running time
# ---------------
#$ -l s_rt=5:00:00
#$ -l h_rt=10:00:00
#
# Output/Error Text
# ----------------
#$ -o ./python.$JOB_ID.out
#$ -e ./python.$JOB_ID.err
#
# Create an array job with 49 tasks; every task corresponds to one parameter setting for the SVM
# (keep the range in sync with the number of settings defined in svm_cv.py)
# ----------------
#$ -t 1-49:1
#
# Send me a mail when processed and when finished:
# ------------------------------------------------
#$ -M yourname@upf.edu
#$ -m bea
#
# Start script
# --------------------------------


# Values are passed as printf arguments, never inside the format string, so a
# stray "%" or backslash in them cannot corrupt the output.
printf 'Starting execution of job %s from user %s at %s\n' \
  "$JOB_ID" "$SGE_O_LOGNAME" "$(date)"

# force UTF 8
export LANG="en_US.utf8"

# change python version
module load python/2.7.5

# enter virtualenv, change this to your virtualenv name, yourenv folder is supposed to be at the previous level of the working folder
venv_activate=../yourenv/bin/activate
if [ ! -f "$venv_activate" ]; then
  printf 'virtualenv activation script not found: %s\n' "$venv_activate" >&2
  exit 1
fi
# "." is the POSIX spelling of "source"; plain "source" is not guaranteed to
# exist when the script really runs under /bin/sh.
. "$venv_activate"

# run the cross validation for the parameter setting selected by this task's index
python svm_cv.py "${SGE_TASK_ID}"
status=$?

# Print job Done
printf 'Job %s done at %s\n' "$JOB_ID" "$(date)"

# Report the Python exit status as the job's exit status instead of always 0.
exit "$status"

## svm_cv.py
# python code for parameter search

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,StratifiedKFold
import numpy as np
import sys

def svm_cv(fv_train,target_train,C,gamma):
	"""Cross-validate an RBF-kernel SVC and print the mean score and its spread.

	fv_train     -- passed to cross_val_score as the feature data
	target_train -- passed to cross_val_score as the targets; also drives the
	                stratification of the 5 folds
	C, gamma     -- forwarded unchanged to SVC

	Nothing is returned; the result is only printed. The printed label says
	"Accuracy" while the scorer requested from scikit-learn is
	'recall_weighted'.
	"""
	folds = StratifiedKFold(n_splits=5)
	classifier = SVC(C=C, gamma=gamma, kernel='rbf')
	fold_scores = cross_val_score(
		classifier,
		fv_train,
		target_train,
		scoring='recall_weighted',
		cv=folds,
	)
	# the reported spread is twice the standard deviation over the folds
	mean_score = fold_scores.mean()
	spread = fold_scores.std() * 2
	print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))

if __name__ == '__main__':
	# Entry point: select one [C, gamma] setting by its 1-based index (given as
	# the first command-line argument), load the training data from the current
	# directory and run the cross validation for that setting.

	# 49 parameter settings: every [C, gamma] pair over the same 7 values,
	# C varying slowest, so index k maps to the same pair as in a flat list
	grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
	parameters = [[c_value, gamma_value] for c_value in grid for gamma_value in grid]

	# pass the first arg as the parameter setting index
	if len(sys.argv) < 2:
		sys.stderr.write("usage: python svm_cv.py <parameter index 1-%d>\n" % len(parameters))
		sys.exit(1)

	# the argument is 1-based, the list is 0-based
	index = int(sys.argv[1]) - 1
	if index < 0 or index >= len(parameters):
		# Stop here: continuing would either raise IndexError or, for index -1,
		# silently wrap around to the last setting.
		sys.stderr.write("this parameter doesn't exist\n")
		sys.exit(1)

	C, gamma = parameters[index]
	# single-argument print() produces the same text under Python 2 and 3
	print('C: %s gamma: %s' % (C, gamma))

	fv_train = np.load('fv_train.npy')
	target_train = np.load('target_train.npy')

	# start to run cross validation with parameter C and gamma
	svm_cv(fv_train,target_train,C,gamma)

## svm_dtic_hpc.md

      
    Raw
  

              svm_dtic_hpc.md
            
          
We want to run the cross-validation of a binary classification task using the scikit-learn SVM.


First, log into the HPC cluster:

ssh user_name@hpc.dtic.upf.edu


Switch the Python environment to PYTHON/2.7.5. (For a reason I could not determine, Python packages cannot be installed in the default PYTHON 2.7.3 environment.)

module load PYTHON/2.7.5


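Create a virtual environment, activate it, and install scikit-learn. The job script activates `../yourenv/bin/activate`, so the virtual environment must be located in the parent directory of your working folder (or adjust that path in the script):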

virtualenv yourenv

source yourenv/bin/activate

pip install --upgrade pip

pip install numpy scipy scikit-learn


There should be four files in your working folder:


- boundaryPatternClassification.sh: shell script that runs the jobs on the HPC
- svm_cv.py: Python script that runs the cross-validation of the scikit-learn SVM
- fv_train.npy: feature vectors for training the SVM model (download it)
- target_train.npy: target vector for training the SVM model (download it)


The explanations of the .sh and .py files are in the comments inside the scripts.


to run this recipe:

qsub boundaryPatternClassification.sh


To view the status of the running jobs, use qstat. The error log and output log will appear in your working directory as python.<JOB_ID>.err and python.<JOB_ID>.out.
	#!/bin/sh
	# SGE array-job script: every task of the array runs svm_cv.py once and hands
	# it the task index, which svm_cv.py uses to select one parameter setting.
	# Lines starting with "#$" are scheduler directives and are kept verbatim.
	#
	# Name the process
	# ----------------
	#$ -N compIden_$JOB_ID
	#
	# Call from the current working directory; no need to cd; using default.q configuration, HPC will distribute the best choice of the nodes
	# NOTE(review): the "-q default.q" line below has no "$" after "#", so it looks like a plain comment rather than an active directive - confirm this is intended
	# ------------------------------------------------------
	#$ -cwd
	# -q default.q
	#
	# Max time limits, s_rt soft running time, h_rt hard running time
	# ---------------
	#$ -l s_rt=5:00:00
	#$ -l h_rt=10:00:00
	#
	# Output/Error Text
	# ----------------
	#$ -o ./python.$JOB_ID.out
	#$ -e ./python.$JOB_ID.err
	#
	# Create an array job with 49 tasks; every task corresponds to one parameter setting for the SVM
	# (keep the range in sync with the number of settings defined in svm_cv.py)
	# ----------------
	#$ -t 1-49:1
	#
	# Send me a mail when processed and when finished:
	# ------------------------------------------------
	#$ -M yourname@upf.edu
	#$ -m bea
	#
	# Start script
	# --------------------------------


	# Values are passed as printf arguments, never inside the format string, so a
	# stray "%" or backslash in them cannot corrupt the output.
	printf 'Starting execution of job %s from user %s at %s\n' \
	  "$JOB_ID" "$SGE_O_LOGNAME" "$(date)"

	# force UTF 8
	export LANG="en_US.utf8"

	# change python version
	module load python/2.7.5

	# enter virtualenv, change this to your virtualenv name, yourenv folder is supposed to be at the previous level of the working folder
	venv_activate=../yourenv/bin/activate
	if [ ! -f "$venv_activate" ]; then
	  printf 'virtualenv activation script not found: %s\n' "$venv_activate" >&2
	  exit 1
	fi
	# "." is the POSIX spelling of "source"; plain "source" is not guaranteed to
	# exist when the script really runs under /bin/sh.
	. "$venv_activate"

	# run the cross validation for the parameter setting selected by this task's index
	python svm_cv.py "${SGE_TASK_ID}"
	status=$?

	# Print job Done
	printf 'Job %s done at %s\n' "$JOB_ID" "$(date)"

	# Report the Python exit status as the job's exit status instead of always 0.
	exit "$status"
	# python code for parameter search

	from sklearn.svm import SVC
	from sklearn.model_selection import cross_val_score,StratifiedKFold
	import numpy as np
	import sys

	def svm_cv(fv_train,target_train,C,gamma):
		"""Cross-validate an RBF-kernel SVC and print the mean score and its spread.

		fv_train and target_train are handed to cross_val_score as the feature
		data and the targets; C and gamma are forwarded unchanged to SVC.
		Nothing is returned; the result is only printed. The printed label says
		"Accuracy" while the scorer requested is 'recall_weighted'.
		"""
		clf = SVC(kernel='rbf', gamma=gamma, C=C)
		# 5 stratified folds
		mycv = StratifiedKFold(n_splits=5)
		scores = cross_val_score(clf,fv_train,target_train,cv=mycv,scoring='recall_weighted')
		# the reported spread is twice the standard deviation over the folds
		print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

	if __name__ == '__main__':
		# Entry point: select one [C, gamma] setting by its 1-based index (given as
		# the first command-line argument), load the training data from the current
		# directory and run the cross validation for that setting.

		# 49 parameter settings: every [C, gamma] pair over the same 7 values,
		# C varying slowest, so index k maps to the same pair as in a flat list
		grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
		parameters = [[c_value, gamma_value] for c_value in grid for gamma_value in grid]

		# pass the first arg as the parameter setting index
		if len(sys.argv) < 2:
			sys.stderr.write("usage: python svm_cv.py <parameter index 1-%d>\n" % len(parameters))
			sys.exit(1)

		# the argument is 1-based, the list is 0-based
		index = int(sys.argv[1]) - 1
		if index < 0 or index >= len(parameters):
			# Stop here: continuing would either raise IndexError or, for index -1,
			# silently wrap around to the last setting.
			sys.stderr.write("this parameter doesn't exist\n")
			sys.exit(1)

		C, gamma = parameters[index]
		# single-argument print() produces the same text under Python 2 and 3
		print('C: %s gamma: %s' % (C, gamma))

		fv_train = np.load('fv_train.npy')
		target_train = np.load('target_train.npy')

		# start to run cross validation with parameter C and gamma
		svm_cv(fv_train,target_train,C,gamma)