Created
October 8, 2014 20:48
-
-
Save kdiverson/be67509ac57d6479d1a2 to your computer and use it in GitHub Desktop.
do.science notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"do.science() #A function to help you spend more time on twitter" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* awk and pipes\n", | |
"* GNU parallel\n", | |
"* job arrays\n", | |
"* job dependency\n", | |
"* quicksubmit" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"awk and pipes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"awk '{ sum += $1 } END { print sum }' counts.txt" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"samtools view -F4 derep_aligned2_16s.bam | awk -F\"\\t\" '{print $1}' | uniq | tee mapped216s.txt | cut -f2 -d\"-\" | awk '{ sum += $1 } END { print sum }' " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"comm -12 10f.depth.gt5.sorted 34f.depth.gt5.sorted | comm -12 - 35m.depth.gt5.sorted | comm -12 - 4m.depth.gt5.sorted | comm -12 - 6m.depth.gt5.sorted | comm -12 - 8m.depth.gt5.sorted > all.gt5" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#get depth and save all genes with coverage > 5\n", | |
"samtools depth 4m.bam | awk '{a[$1]+=$3;count[$1]+=1;}END{for(i in a)print i\"\\t\"a[i]/count[i];}' | tee 4m.depth.summary | awk '{if ($2 > 5) {a+=1;}}END{print a\"\\t\"FNR;}'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# replaces ALL instances in a line\n", | |
"awk '{gsub(/foo/,\"bar\");print}' \n", | |
"# substitute \"foo\" with \"bar\" ONLY for lines which contain \"baz\"\n", | |
"awk '/baz/{gsub(/foo/, \"bar\")};{print}'\n", | |
"# substitute \"foo\" with \"bar\" EXCEPT for lines which contain \"baz\"\n", | |
"awk '!/baz/{gsub(/foo/, \"bar\")};{print}'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#get match and next line, same as grep -A1\n", | |
"awk '/regex/ { print $0; getline; print $0 }'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"alias fq2fa=\"awk '{print \\\">\\\" substr(\\$0,2);getline;print;getline;getline}'\" \n", | |
"fq2fa seqs.fq > seqs.fa" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# qdel every job with jobname \"cat\" (can also use ~)\n", | |
"qstat -u $USER | awk '{if($4==\"cat\") {split($1,a,\".\"); cmd=\"qdel \" a[1]; system(cmd); close(cmd)}}'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#get average seq length fasta\n", | |
"awk '{/>/&&++a||b+=length()}END{print b/a}' file.fa" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#get lines > 100 (single line fasta only) sed -i.bak 's/foo/bar/g' myfile.\n", | |
"awk '!/^>/ {next} {getline s} length(s) >= 100 { print $0 \"\\n\" s }'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"GNU parallel" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for i in *gz; \n", | |
"do\n", | |
" gunzip $i\n", | |
"done" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"parallel 'gunzip {} ' ::: *.gz" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"Count number of reads that were an exact match" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"samtools view 10f.bam.sorted.bam | awk '$6 == \"101M\" ' | grep -o \"NM:i:0\" | wc -l" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"parallel \"samtools view {} | awk '\\$6 == \\\"101M\\\" ' | grep -o \\\"NM:i:0\\\" | wc -l; echo {}\" ::: *.bam.sorted.bam" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"cat 1gb.fasta | parallel --block 100k --recstart '>' --pipe blastp -evalue 0.01 -outfmt 6 -db db.fa -query - > results" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"cat 1gb.fasta | parallel -S :,server1,server2 --block 100k --recstart '>' --pipe blastp -evalue 0.01 -outfmt 6 -db db.fa -query - > result" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"job arrays" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"#PBS -t 1-10\n", | |
"qdel 5432[4]\n", | |
"qdel 5432[]\n", | |
"qstat -t 5432[]" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"#!/bin/sh\n", | |
"#PBS -N yourjobname\n", | |
"#PBS -l nodes=1,walltime=05:00:00 \n", | |
"#PBS -S /bin/sh\n", | |
"#PBS -M uniqname@umich.edu\n", | |
"#PBS -t 1-10\n", | |
"#PBS -m abe\n", | |
"#PBS -A example_flux\n", | |
"#PBS -l qos=flux\n", | |
"#PBS -q flux\n", | |
"#PBS -j oe\n", | |
"#PBS -V\n", | |
"cd /path/to/my/program./myprogram -input=file-${PBS_ARRAYID}" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"myscript < input_${PBS_ARRAY_INDEX}.tsv" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"Nth filename in a directory\n", | |
"ls -1 | tail -n +N | head -1" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"# get the filename in the inputs folder at position PBS_ARRAY_INDEX\n", | |
"filename=`ls -1 inputs/ | tail -n +${PBS_ARRAY_INDEX} | head -1`\n", | |
"\n", | |
"# use that file as an input to myscript\n", | |
"myscript < inputs/$filename" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"job dependency" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"qsub -W depend=afterok:jobid myscript.pbs\n", | |
"qsub -W depend=afterokarray:arrayid[count] myscript.pbs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"quicksubmit" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"/mnt/EXT/Schloss-data/bin/quicksubmit\n", | |
"\n", | |
"usage: quicksubmit [-h] [--walltime WALLTIME] [--cput CPUT] [--pm PM]\n", | |
" [--mailto MAILTO] [--afterok AFTEROK] [--jobname JOBNAME]\n", | |
" cmd [cmd ...]\n", | |
"\n", | |
"Quickly submit oneliners to pbs.\n", | |
"\n", | |
"positional arguments:\n", | |
" cmd\n", | |
"\n", | |
"optional arguments:\n", | |
" -h, --help show this help message and exit\n", | |
" --walltime WALLTIME how long the job should run in wall time\n", | |
" --cput CPUT how long the job should run in cpu time\n", | |
" --pm PM processors and memory limits in the form of\n", | |
" nodes=1:ppn=1,mem=30gb\n", | |
" --mailto MAILTO email address to which job status should be sent\n", | |
" --afterok AFTEROK run this job after JOBID has completed successfully\n", | |
" --jobname JOBNAME name of your job\n", | |
"\n", | |
"quicksubmit \"sh align2contigs.sh 6m.seqprep.txt.gz\" --walltime 500:00:00 --cput 500:00:00 --pm nodes=1:ppn=4,mem=20gb\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#!/usr/bin/python2.7\n", | |
"'''USAGE: quicksubmit.py \"command -l parameter\" [options]\n", | |
"This script submits your one line command as a job to the pbs job scheduler. \n", | |
"'''\n", | |
"import subprocess as s\n", | |
"import sys\n", | |
"import argparse\n", | |
"\n", | |
"def submitjob(job_name, command, subtype = \"qsub\", walltime = \"10:00:00\", cput = \"10:00:00\", pm = \"nodes=1:ppn=1:mem=46gb\", mailto = \"\", afterok = \"\", verbose = False):\n", | |
" '''USAGE: submitjob('my_job', 'echo \"hello world!\"') \n", | |
" This function submits a job to the pbs job scheduler. You may need to alter the job_string as your pbs implementation requires.\n", | |
" \n", | |
" '''\n", | |
" job_string = \"\"\"#!/bin/bash\n", | |
" #PBS -N {0}\n", | |
" #PBS -l walltime={1}\n", | |
" #PBS -l {2}\n", | |
" #PBS -l cput={4}\n", | |
" #PBS -j oe\n", | |
" #PBS -o ./{0}.$PBS_JOBID.pbsout\n", | |
" #PBS -V\n", | |
" #PBS -t 1-10\n", | |
" #PBS -q first\n", | |
" #PBS -m abe {5}\n", | |
" {aok}\n", | |
" \n", | |
" echo \"ncpus-2.pbs\"\n", | |
" cat $PBS_NODEFILE\n", | |
" qstat -f $PBS_JOBID\n", | |
"\n", | |
" cd $PBS_O_WORKDIR\n", | |
" \n", | |
" NCPUS=`wc -l $PBS_NODEFILE | awk '{{print $1}}'`\n", | |
"\n", | |
" {3}\n", | |
" echo \"qsub working directory absolute is\"\n", | |
" echo $PBS_O_WORKDIR\n", | |
" exit\"\"\".format(job_name, walltime, pm, command, cput, mailto, aok=\"#PBS -W depend=afterok:%s\" %(afterok) if afterok != \"\" else \"\")\n", | |
" \n", | |
" # Send job_string to qsub\n", | |
" \n", | |
" p = s.Popen(subtype, stdin=s.PIPE, stdout=s.PIPE, stderr=s.STDOUT)\n", | |
" p.stdin.write(job_string)\n", | |
" out, err = p.communicate()\n", | |
" \n", | |
" # Print your job and the response to the screen\n", | |
" if verbose:\n", | |
" print job_string\n", | |
" print out\n", | |
" \n", | |
" #returns job ID\n", | |
" return out.split(\".\")[0]\n", | |
"\n", | |
"#class MyParser(argparse.ArgumentParser):\n", | |
"# def error(self, message):\n", | |
"# sys.stderr.write('error: %s\\n' % message)\n", | |
"# self.print_help()\n", | |
"# sys.exit(2)\n", | |
"\n", | |
"parser = argparse.ArgumentParser(description='Quickly submit oneliners to pbs.')\n", | |
"#parser = MyParser()\n", | |
"parser.add_argument('cmd', nargs='+')\n", | |
"parser.add_argument('--walltime', default=\"1:00:00\", help='how long the job should run in wall time')\n", | |
"parser.add_argument('--cput', default=\"1:00:00\", help='how long the job should run in cpu time')\n", | |
"parser.add_argument('--pm', default=\"nodes=1:ppn=1,mem=2gb\", help='processors and memory limits in the form of nodes=1:ppn=1,mem=30gb')\n", | |
"parser.add_argument('--mailto', default=\"\", help='email address to which job status should be sent')\n", | |
"parser.add_argument('--afterok', default=\"\", help='run this job after JOBID has completed successfully')\n", | |
"parser.add_argument('--jobname', default=\"\", help='name of your job')\n", | |
"\n", | |
"if len(sys.argv)==1:\n", | |
" parser.print_help()\n", | |
" sys.exit(1)\n", | |
"\n", | |
"\n", | |
"args = parser.parse_args()\n", | |
" \n", | |
"joblist = []\n", | |
"\n", | |
"#print args\n", | |
"cmd = \" \".join(args.cmd)\n", | |
"print \"submitting command: \", cmd\n", | |
" \n", | |
"joblist.append(submitjob(job_name = args.jobname if args.jobname !=\"\" else cmd.split()[0], command = cmd, walltime = args.walltime, cput = args.cput, pm = args.pm, mailto = args.m\n", | |
"ailto, afterok = args.afterok))\n", | |
"\n", | |
"print \"Job id(s) \", joblist" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"incorporating job submission into your scripts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import subprocess as s\n", | |
"import sys\n", | |
"\n", | |
"def submitjob(job_name, command, subtype = \"qsub\", walltime = \"10:00:00\", cput = \"10:00:00\", pm = \"nodes=1:ppn=1\", afterok = \"\", verbose = False):\n", | |
" '''USAGE: submitjob('my_job', 'echo \"hello world!\"') \n", | |
" This function submits a job to the pbs job scheduler. You may need to alter the job_string as your pbs implementation requires.\n", | |
" \n", | |
" '''\n", | |
" job_string = \"\"\"#!/bin/bash\n", | |
" #PBS -N {0}\n", | |
" #PBS -l walltime={1}\n", | |
" #PBS -l {2}\n", | |
" #PBS -l cput={4}\n", | |
" #PBS -j oe\n", | |
" #PBS -o ./{0}.$PBS_JOBID.pbsout\n", | |
" #PBS -V\n", | |
" #PBS -q first\n", | |
" {aok}\n", | |
" \n", | |
" \n", | |
" echo \"ncpus-2.pbs\"\n", | |
" cat $PBS_NODEFILE\n", | |
" qstat -f $PBS_JOBID\n", | |
"\n", | |
" cd $PBS_O_WORKDIR\n", | |
" \n", | |
" NCPUS=`wc -l $PBS_NODEFILE | awk '{{print $1}}'`\n", | |
"\n", | |
" {3}\n", | |
" echo \"qsub working directory absolute is\"\n", | |
" echo $PBS_O_WORKDIR\n", | |
" exit\"\"\".format(job_name, walltime, pm, command, cput, aok=\"#PBS -W depend=afterok:%s\" %(afterok) if afterok != \"\" else \"\")\n", | |
" \n", | |
" # Send job_string to qsub\n", | |
"\n", | |
" p = s.Popen(subtype, stdin=s.PIPE, stdout=s.PIPE, stderr=s.STDOUT)\n", | |
" p.stdin.write(job_string)\n", | |
" out, err = p.communicate()\n", | |
" \n", | |
" # Print your job and the response to the screen\n", | |
" if verbose:\n", | |
" print job_string\n", | |
" print out\n", | |
" \n", | |
" #returns job ID\n", | |
" return out.split(\".\")[0]\n", | |
" \n", | |
"joblist = []\n", | |
"\n", | |
"#after = sys.argv[1]\n", | |
"infile = sys.argv[1]\n", | |
"base = infile.split(\".\")[0]\n", | |
"\n", | |
"for k in sys.argv[2:]:\n", | |
" cmd = \"~/bin/velveth {0}{1} {1} -fasta -short {0}.seqprep.txt.fq.keep.abundfilt; ~/bin/velvetg {0}{1} -exp_cov auto -cov_cutoff 0 -scaffolding no\".format(base, k)\n", | |
" jobname = base+k+\".velvet\"\n", | |
" #joblist.append(submitjob(jobname, cmd, walltime=\"476:00:00\", cput=\"9999:00:00\", pm=\"nodes=1:ppn=8,mem=46gb\", afterok=after))\n", | |
" joblist.append(submitjob(jobname, cmd, walltime=\"476:00:00\", cput=\"9999:00:00\", pm=\"nodes=1:ppn=8,mem=46gb\"))\n", | |
"print joblist\n", | |
"#~/bin/velveth all27 27 -fasta -short all.seqprep.fq.keep.abundfilt; ~/bin/velvetg all27 -exp_cov auto -cov_cutoff 0 -scaffolding no\n", | |
"#~/bin/velveth all31 31 -fasta -short all.seqprep.fq.keep.abundfilt; ~/bin/velvetg all31 -exp_cov auto -cov_cutoff 0 -scaffolding no\n", | |
"#~/bin/velveth all35 35 -fasta -short all.seqprep.fq.keep.abundfilt; ~/bin/velvetg all35 -exp_cov auto -cov_cutoff 0 -scaffolding no" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"python runvelvet.py 6m 35 31" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment