Skip to content

Instantly share code, notes, and snippets.

@kdiverson
Created October 8, 2014 20:48
Show Gist options
  • Save kdiverson/be67509ac57d6479d1a2 to your computer and use it in GitHub Desktop.
Save kdiverson/be67509ac57d6479d1a2 to your computer and use it in GitHub Desktop.
do.science notebook
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"do.science() #A function to help you spend more time on twitter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* awk and pipes\n",
"* GNU parallel\n",
"* job arrays\n",
"* job dependency\n",
"* quicksubmit"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"awk and pipes"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"awk '{ sum += $1 } END { print sum }' counts.txt"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"samtools view -F4 derep_aligned2_16s.bam | awk -F\"\\t\" '{print $1}' | uniq | tee mapped216s.txt | cut -f2 -d\"-\" | awk '{ sum += $1 } END { print sum }' "
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"comm -12 10f.depth.gt5.sorted 34f.depth.gt5.sorted | comm -12 - 35m.depth.gt5.sorted | comm -12 - 4m.depth.gt5.sorted | comm -12 - 6m.depth.gt5.sorted | comm -12 - 8m.depth.gt5.sorted > all.gt5"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#get depth and save all genes with coverage > 5\n",
"samtools depth 4m.bam | awk '{a[$1]+=$3;count[$1]+=1;}END{for(i in a)print i\"\\t\"a[i]/count[i];}' | tee 4m.depth.summary | awk '{if ($2 > 5) {a+=1;}}END{print a\"\\t\"FNR;}'"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# replaces ALL instances in a line\n",
"awk '{gsub(/foo/,\"bar\");print}' \n",
"# substitute \"foo\" with \"bar\" ONLY for lines which contain \"baz\"\n",
"awk '/baz/{gsub(/foo/, \"bar\")};{print}'\n",
"# substitute \"foo\" with \"bar\" EXCEPT for lines which contain \"baz\"\n",
"awk '!/baz/{gsub(/foo/, \"bar\")};{print}'"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#get match and next line, same as grep -A1\n",
"awk '/regex/ { print $0; getline; print $0 }'"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"alias fq2fa=\"awk '{print \\\">\\\" substr(\\$0,2);getline;print;getline;getline}'\" \n",
"fq2fa seqs.fq > seqs.fa"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# qdel every job with jobname \"cat\" (can also use ~)\n",
"qstat -u $USER | awk '{if($4==\"cat\") {split($1,a,\".\"); cmd=\"qdel \" a[1]; system(cmd); close(cmd)}}'"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#get average seq length fasta\n",
"awk '{/>/&&++a||b+=length()}END{print b/a}' file.fa"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#get lines > 100 (single line fasta only) sed -i.bak 's/foo/bar/g' myfile.\n",
"awk '!/^>/ {next} {getline s} length(s) >= 100 { print $0 \"\\n\" s }'"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"GNU parallel"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for i in *gz; \n",
"do\n",
" gunzip $i\n",
"done"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parallel 'gunzip {} ' ::: *.gz"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Count number of reads that were an exact match"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"samtools view 10f.bam.sorted.bam | awk '$6 == \"101M\" ' | grep -o \"NM:i:0\" | wc -l"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"parallel \"samtools view {} | awk '\\$6 == \\\"101M\\\" ' | grep -o \\\"NM:i:0\\\" | wc -l; echo {}\" ::: *.bam.sorted.bam"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cat 1gb.fasta | parallel --block 100k --recstart '>' --pipe blastp -evalue 0.01 -outfmt 6 -db db.fa -query - > results"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cat 1gb.fasta | parallel -S :,server1,server2 --block 100k --recstart '>' --pipe blastp -evalue 0.01 -outfmt 6 -db db.fa -query - > result"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"job arrays"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"#PBS -t 1-10\n",
"qdel 5432[4]\n",
"qdel 5432[]\n",
"qstat -t 5432[]"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"#!/bin/sh\n",
"#PBS -N yourjobname\n",
"#PBS -l nodes=1,walltime=05:00:00 \n",
"#PBS -S /bin/sh\n",
"#PBS -M uniqname@umich.edu\n",
"#PBS -t 1-10\n",
"#PBS -m abe\n",
"#PBS -A example_flux\n",
"#PBS -l qos=flux\n",
"#PBS -q flux\n",
"#PBS -j oe\n",
"#PBS -V\n",
"cd /path/to/my/program./myprogram -input=file-${PBS_ARRAYID}"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"myscript < input_${PBS_ARRAY_INDEX}.tsv"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"Nth filename in a directory\n",
"ls -1 | tail -n +N | head -1"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"# get the filename in the inputs folder at position PBS_ARRAY_INDEX\n",
"filename=`ls -1 inputs/ | tail -n +${PBS_ARRAY_INDEX} | head -1`\n",
"\n",
"# use that file as an input to myscript\n",
"myscript < inputs/$filename"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"job dependency"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"qsub -W depend=afterok:jobid myscript.pbs\n",
"qsub -W depend=afterokarray:arrayid[count] myscript.pbs"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"quicksubmit"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"/mnt/EXT/Schloss-data/bin/quicksubmit\n",
"\n",
"usage: quicksubmit [-h] [--walltime WALLTIME] [--cput CPUT] [--pm PM]\n",
" [--mailto MAILTO] [--afterok AFTEROK] [--jobname JOBNAME]\n",
" cmd [cmd ...]\n",
"\n",
"Quickly submit oneliners to pbs.\n",
"\n",
"positional arguments:\n",
" cmd\n",
"\n",
"optional arguments:\n",
" -h, --help show this help message and exit\n",
" --walltime WALLTIME how long the job should run in wall time\n",
" --cput CPUT how long the job should run in cpu time\n",
" --pm PM processors and memory limits in the form of\n",
" nodes=1:ppn=1,mem=30gb\n",
" --mailto MAILTO email address to which job status should be sent\n",
" --afterok AFTEROK run this job after JOBID has completed successfully\n",
" --jobname JOBNAME name of your job\n",
"\n",
"quicksubmit \"sh align2contigs.sh 6m.seqprep.txt.gz\" --walltime 500:00:00 --cput 500:00:00 --pm nodes=1:ppn=4,mem=20gb\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#!/usr/bin/python2.7\n",
"'''USAGE: quicksubmit.py \"command -l parameter\" [options]\n",
"This script submits your one line command as a job to the pbs job scheduler. \n",
"'''\n",
"import subprocess as s\n",
"import sys\n",
"import argparse\n",
"\n",
"def submitjob(job_name, command, subtype = \"qsub\", walltime = \"10:00:00\", cput = \"10:00:00\", pm = \"nodes=1:ppn=1:mem=46gb\", mailto = \"\", afterok = \"\", verbose = False):\n",
" '''USAGE: submitjob('my_job', 'echo \"hello world!\"') \n",
" This function submits a job to the pbs job scheduler. You may need to alter the job_string as your pbs implementation requires.\n",
" \n",
" '''\n",
" job_string = \"\"\"#!/bin/bash\n",
" #PBS -N {0}\n",
" #PBS -l walltime={1}\n",
" #PBS -l {2}\n",
" #PBS -l cput={4}\n",
" #PBS -j oe\n",
" #PBS -o ./{0}.$PBS_JOBID.pbsout\n",
" #PBS -V\n",
" #PBS -t 1-10\n",
" #PBS -q first\n",
" #PBS -m abe {5}\n",
" {aok}\n",
" \n",
" echo \"ncpus-2.pbs\"\n",
" cat $PBS_NODEFILE\n",
" qstat -f $PBS_JOBID\n",
"\n",
" cd $PBS_O_WORKDIR\n",
" \n",
" NCPUS=`wc -l $PBS_NODEFILE | awk '{{print $1}}'`\n",
"\n",
" {3}\n",
" echo \"qsub working directory absolute is\"\n",
" echo $PBS_O_WORKDIR\n",
" exit\"\"\".format(job_name, walltime, pm, command, cput, mailto, aok=\"#PBS -W depend=afterok:%s\" %(afterok) if afterok != \"\" else \"\")\n",
" \n",
" # Send job_string to qsub\n",
" \n",
" p = s.Popen(subtype, stdin=s.PIPE, stdout=s.PIPE, stderr=s.STDOUT)\n",
" p.stdin.write(job_string)\n",
" out, err = p.communicate()\n",
" \n",
" # Print your job and the response to the screen\n",
" if verbose:\n",
" print job_string\n",
" print out\n",
" \n",
" #returns job ID\n",
" return out.split(\".\")[0]\n",
"\n",
"#class MyParser(argparse.ArgumentParser):\n",
"# def error(self, message):\n",
"# sys.stderr.write('error: %s\\n' % message)\n",
"# self.print_help()\n",
"# sys.exit(2)\n",
"\n",
"parser = argparse.ArgumentParser(description='Quickly submit oneliners to pbs.')\n",
"#parser = MyParser()\n",
"parser.add_argument('cmd', nargs='+')\n",
"parser.add_argument('--walltime', default=\"1:00:00\", help='how long the job should run in wall time')\n",
"parser.add_argument('--cput', default=\"1:00:00\", help='how long the job should run in cpu time')\n",
"parser.add_argument('--pm', default=\"nodes=1:ppn=1,mem=2gb\", help='processors and memory limits in the form of nodes=1:ppn=1,mem=30gb')\n",
"parser.add_argument('--mailto', default=\"\", help='email address to which job status should be sent')\n",
"parser.add_argument('--afterok', default=\"\", help='run this job after JOBID has completed successfully')\n",
"parser.add_argument('--jobname', default=\"\", help='name of your job')\n",
"\n",
"if len(sys.argv)==1:\n",
" parser.print_help()\n",
" sys.exit(1)\n",
"\n",
"\n",
"args = parser.parse_args()\n",
" \n",
"joblist = []\n",
"\n",
"#print args\n",
"cmd = \" \".join(args.cmd)\n",
"print \"submitting command: \", cmd\n",
" \n",
"joblist.append(submitjob(job_name = args.jobname if args.jobname !=\"\" else cmd.split()[0], command = cmd, walltime = args.walltime, cput = args.cput, pm = args.pm, mailto = args.m\n",
"ailto, afterok = args.afterok))\n",
"\n",
"print \"Job id(s) \", joblist"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"incorporating job submission into your scripts"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import subprocess as s\n",
"import sys\n",
"\n",
"def submitjob(job_name, command, subtype = \"qsub\", walltime = \"10:00:00\", cput = \"10:00:00\", pm = \"nodes=1:ppn=1\", afterok = \"\", verbose = False):\n",
" '''USAGE: submitjob('my_job', 'echo \"hello world!\"') \n",
" This function submits a job to the pbs job scheduler. You may need to alter the job_string as your pbs implementation requires.\n",
" \n",
" '''\n",
" job_string = \"\"\"#!/bin/bash\n",
" #PBS -N {0}\n",
" #PBS -l walltime={1}\n",
" #PBS -l {2}\n",
" #PBS -l cput={4}\n",
" #PBS -j oe\n",
" #PBS -o ./{0}.$PBS_JOBID.pbsout\n",
" #PBS -V\n",
" #PBS -q first\n",
" {aok}\n",
" \n",
" \n",
" echo \"ncpus-2.pbs\"\n",
" cat $PBS_NODEFILE\n",
" qstat -f $PBS_JOBID\n",
"\n",
" cd $PBS_O_WORKDIR\n",
" \n",
" NCPUS=`wc -l $PBS_NODEFILE | awk '{{print $1}}'`\n",
"\n",
" {3}\n",
" echo \"qsub working directory absolute is\"\n",
" echo $PBS_O_WORKDIR\n",
" exit\"\"\".format(job_name, walltime, pm, command, cput, aok=\"#PBS -W depend=afterok:%s\" %(afterok) if afterok != \"\" else \"\")\n",
" \n",
" # Send job_string to qsub\n",
"\n",
" p = s.Popen(subtype, stdin=s.PIPE, stdout=s.PIPE, stderr=s.STDOUT)\n",
" p.stdin.write(job_string)\n",
" out, err = p.communicate()\n",
" \n",
" # Print your job and the response to the screen\n",
" if verbose:\n",
" print job_string\n",
" print out\n",
" \n",
" #returns job ID\n",
" return out.split(\".\")[0]\n",
" \n",
"joblist = []\n",
"\n",
"#after = sys.argv[1]\n",
"infile = sys.argv[1]\n",
"base = infile.split(\".\")[0]\n",
"\n",
"for k in sys.argv[2:]:\n",
" cmd = \"~/bin/velveth {0}{1} {1} -fasta -short {0}.seqprep.txt.fq.keep.abundfilt; ~/bin/velvetg {0}{1} -exp_cov auto -cov_cutoff 0 -scaffolding no\".format(base, k)\n",
" jobname = base+k+\".velvet\"\n",
" #joblist.append(submitjob(jobname, cmd, walltime=\"476:00:00\", cput=\"9999:00:00\", pm=\"nodes=1:ppn=8,mem=46gb\", afterok=after))\n",
" joblist.append(submitjob(jobname, cmd, walltime=\"476:00:00\", cput=\"9999:00:00\", pm=\"nodes=1:ppn=8,mem=46gb\"))\n",
"print joblist\n",
"#~/bin/velveth all27 27 -fasta -short all.seqprep.fq.keep.abundfilt; ~/bin/velvetg all27 -exp_cov auto -cov_cutoff 0 -scaffolding no\n",
"#~/bin/velveth all31 31 -fasta -short all.seqprep.fq.keep.abundfilt; ~/bin/velvetg all31 -exp_cov auto -cov_cutoff 0 -scaffolding no\n",
"#~/bin/velveth all35 35 -fasta -short all.seqprep.fq.keep.abundfilt; ~/bin/velvetg all35 -exp_cov auto -cov_cutoff 0 -scaffolding no"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"python runvelvet.py 6m 35 31"
]
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment