This document aggregates snippets that I found useful when working in bioinfromatics. It is work in progress and I will extend it as needed. It is mainly in form of toy examples or pseudcode. Please adjust it accodring to your needs.
mysql --user=root -p
/*get preview */
show databases;
USE <some_database>;
show tables;
describe <some_table>;
/* qouoting just one column */
SELECT t1.<field>, t2.<field>, CONCAT('"',t1.<long-text-field>,'"')
INTO outfile '/tmp/outfile.tsv'
FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
FROM <table1> t1
JOIN <table2> t2 ON t1.<t1_id>=t2.<t2t1_id>
JOIN <table3> t3 ON t3.<t3_id>=t2.<t2t3_id>;
mongo
show databases
use some_db
/* look what is inside the database */
show collections
db.some_collection.findOne()
/*get just some fileds, possibly nested */
db.some_collection.findOne({}, {<field1>:1, comments:1})
db.some_collection.findOne({}, {<field1>:1, "comments.comment":1})
/* pull out some fileds and simplify*/
db.some_collection.aggregate([
{$project: {_id: "$<field1>", comments: "$comments.comment"}},
{$out: "results"}
])
Export and clean up
mongoexport --db some_db -c results --out /tmp/output.json # from shell
/* back in mongo */
db.results.drop()
comm -23 <(sort file1.txt) <(sort file2.txt) # show just unique to file1
comm -13 <(sort file1.txt) <(sort file2.txt) # show just unique to file2
comm -12 <(sort file1.txt) <(sort file2.txt) # show lines appearing in both
# drop lines that have *rna* in some column and reduce output to other columns
awk -F"\t" -v OFS="\t" 'tolower($3) !~ /rna/ {print $4,$6}' file1.tsv | sort | uniq
# append column to file and concatatenate along row-dimension
v1=$(awk -F"\t" -v OFS="\t" '{print $0, "1"}' positive.txt)
v2=$(awk -F"\t" -v OFS="\t" '{print $0, "0"}' negative.txt)
echo "$v2" "$v1" | sort | uniq > training_labels.tsv
# Count number of columns in file:
head -n 1 FILE | awk '{print NF}'
cat file_to_be_filtered.txt | grep -Fwf strings_to_be_matched.txt > output.txt # keep just lines that match string
cat strings_to_be_matched.txt | grep -Fwf - file_to_be_filtered.txt > output.txt # this is the same
cat file_to_be_filtered.txt | grep -v -i -P "(string1|string2|...)" > output.txt # drop lines that match string
chmod -R --reference=folderA/ folderB/ # recursively apply permissions as per mask
chown -R username:group directory # recursively set owner and group for a folder
# copy a list of folders (or files) from remote to local
scp -r user@server:/location/{"$(cat folder_list.txt | tr '\n' ',' | sed 's/,$//')"} .
scp -r $(cut -f1 -d" " ../folder_list.txt | tr '\n' ' ' ) user@server:/location # do it the other way around
Count number of lines in file:
wc -l <file>`
Get length of a string:
expr length "CTGGAAAAGCAAAATTTGGATTTGTGGTTCAATCCACCATCTTTACTCAG"
Redirecting output:
2> <file>
stderr, 1> <file>
stdout, 2> <file> 1>&2
both
Get list of installed packages:
dpkg --list | grep compiler
Add executable in to your PATH (if 'permanent' <- change ~/.bashrc
or ~/.profile
)
export PATH=$PATH:/path/to/folder-with-executable && source ~/.bashrc && source ~/.profile
See what is on your path:
echo $PATH
Remove Entry from Path:
export PATH=${PATH%:/home/user/Software/some_tool}
def rev_complement_seq(seq):
""" reverse complements a sequence
For ANY probe to hybridise it has to be the RC of cDNA.
Does not work with any special characters in the sequence.
"""
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
# alphabet = set(["C","T","A","G"])
assert all([i in complement.keys() for i in set(seq)])
revseq = reversed(seq)
revcompseq = "".join(complement.get(base) for base in revseq)
return revcompseq
Right-adjust to common length:
r_adjusted = "".join([x.rjust(4,"0") for x in coords.split(":")])
Example execution wrapper for python scripts exectued by Snakemake.
def main(snakemake):
"""Execution wrapper"""
logfile = snakemake.log[0]
if logfile:
orig_stdout = sys.stdout
lf = open (logfile, mode = "w")
sys.stdout = lf
start_time = timeit.default_timer()
# do stuff
end_time = timeit.default_timer()
print("Finished in {:.3f} s".format(end_time - start_time))
if logfile:
sys_stdout = orig_stdout
lf.close()
if __name__ == "__main__":
main(snakemake)
rule bowtie_build:
"""
Build bowtie index from unzipped reference fasta
input: ancient(rules.merge_ref.output)
shell: "bowtie-build -f --threads {threads} --seed 0 {input} {params.base_name} &> {log}"
References:
# Option 1:
# Input to bowtie-build must be COMMA SEPARATED list of UNZIPPED files.
input: ancient(rules.unzip_data.output.ref)
shell: "bowtie-build -f --threads {threads} --seed 0 $(echo {input} | sed 's/ /,/g') {params.base_name} &> {log}"
# Option 2: (preferred)
# Input can be also a single file that was already combined
bowtie-build: http://bowtie-bio.sourceforge.net/manual.shtml#the-bowtie-build-indexer
"""
input:
ancient(rules.merge_ref.output)
output:
index = expand("{ind_dir}/{prefix}.{suffix}.ebwt",
suffix = suffix, prefix = prefix, ind_dir = ind_dir)
params:
base_name = join(ind_dir, prefix),
log:
"logs/bowtie_build/{}.log".format(align_prefix)
threads: 2
shell:
"bowtie-build -f --threads {threads} --seed 0 {input} "
"{params.base_name} &> {log}"
rule samtools_sort:
"""
Sorts alignments and converts to binary format
References:
samtools: http://www.htslib.org/doc/samtools.html
"""
input:
rules.bowtie_align.output.sam
# rules.clean_alignments.output.filt # for BLAST
output:
expand( "{ali_dir}/{align_prefix}.bam", ali_dir = ali_dir,
align_prefix = align_prefix)
threads: 2
log:
"logs/samtools/{}.log".format(align_prefix)
params:
tmp_dir = join(ali_dir, "tmp")
shell:
"samtools view -bS {input} | samtools sort -T {params.tmp_dir} "
"-o {output} -@ {threads} &> {log}"
Detach all explicitly loaded packages:
lapply(paste('package:',names(sessionInfo()$otherPkgs),sep=""),detach,character.only=TRUE,unload=TRUE)
List all loaded packages:
pkg_list <- c(search()[(grepl("package:*", search()) & !grepl("package:base", search()))])
Install packages from local file:
install.packages("/path/to/pckg/pckg.tar.gz", source = TRUE, repos = NULL)
Inspecting objects:
# To see how many parameters you can change try the args function:
args(align)
# To see source code of any function:
package:::function
# Inspecting R objects:
attributes(object)
str(object)
Loading and saving variables to/from .RData file
save.image("file.RData")
e1 <- new.env()
load("file.RData", e1) # Load the workspace image to a new environment
myvar <- get("myvar", e1) # Get the desired object from the environment
rm(e1)
To obtain a valid binary CDF file from either binary or ASCII cdf file:
#' Get a valid CDF annotation file from ASCII cdf file
#'
.make_binary_cdf_from_file <- function(annodb, annodata_path, chiptype){
assertthat::assert_that(file.exists(annodb))
out_path <- file.path(annodata_path, paste0(chiptype, ".cdf"))
if (file.exists(out_path)){
print(paste("Using file at:", out_path))
} else {
print("Converting ASCII CDF to binary....")
affxparser::convertCdf(annodb, out_path, force = TRUE)
}
return(tools::file_path_sans_ext(basename(out_path)))
}
To pull out a CDF from existing package:
#' Get a valid CDF annotation file from existing R package
#'
#' @details
#' package must be installed (e.g. from bioconductor)
.get_cdf_from_package <- function(annodb, annodata_path, rawdata_path){
suppressPackageStartupMessages(library(annodb, character.only = T))
fnames <- list.files(rawdata_path, full.names = T)
targetf <- file.path(annodata_path, paste0(gsub("cdf$", "",annodb), ".cdf"))
if (!(file.exists(targetf))){
print("Pulling out CDF from package....")
pathname <- aroma.affymetrix::env2Cdf(annodb, fnames[[1]])
system(paste0("mv -v ", pathname, " ", annodata_path))
} else{
print(paste0("Using existing cdf at: ",targetf," ...."))
pathname <- targetf
}
return(tools::file_path_sans_ext(basename(pathname)))
}
To make a CDF package from binary CDF:
#' @title make_cdf_package: Create R package from binary CDF
#'
#' @description ....
#'
#' @param cdf_name path to cdf file
#' @param species_name caption for the species
#' @param verbose bool, controls vebrosity
#'
#' @return TRUE, invisibly
#'
#' @importFrom makecdfenv make.cdf.package
#'
#' @export
make_cdf_package <- function(cdf_file, species_name, verbose = TRUE){
require(makecdfenv)
filename <- cdf_file
cdf_name <- tools::file_path_sans_ext(basename(cdf_file))
if (!grepl(".cdf$", filename)){ # append extension if needed
filename <- paste0(filename,".cdf")
}
cdf_path <- dirname(filename)
makecdfenv::make.cdf.package(basename(filename), cdf_name, cdf_path, cdf_path,
author = "Me", unlink = TRUE, verbose = verbose,
species = species_name)
invisible(TRUE)
}
tba
cp $(git ls-files -m) ~/some/folder # copy all modified files
Undo changes to all files but some
git add file-or-folder/to/keep
git checkout . # throws away all changes
git reset # unstages the added files