martinholub/bioinformatics.md

## bioinformatics.md

      
    Raw
  

              bioinformatics.md
            
          
    Bioinformatician's Toolbox

This document aggregates snippets that I found useful when working in bioinfromatics. It is work in progress and I will extend it as needed. It is mainly in form of toy examples or pseudcode. Please adjust it accodring to your needs.
MariaDB

mysql --user=root -p
/*get preview */
show databases;
USE <some_database>;
show tables;
describe <some_table>;

/* qouoting just one column */
SELECT t1.<field>, t2.<field>, CONCAT('"',t1.<long-text-field>,'"')
INTO outfile '/tmp/outfile.tsv'
FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'
FROM <table1> t1
JOIN <table2> t2 ON t1.<t1_id>=t2.<t2t1_id>
JOIN <table3> t3 ON t3.<t3_id>=t2.<t2t3_id>;
MongoDB

Aggregation pipeline
mongo
show databases
use some_db
/* look what is inside the database */
show collections
db.some_collection.findOne()
/*get just some fileds, possibly nested */
db.some_collection.findOne({}, {<field1>:1, comments:1})
db.some_collection.findOne({}, {<field1>:1, "comments.comment":1})
/* pull out some fileds and simplify*/
db.some_collection.aggregate([
  {$project: {_id: "$<field1>", comments: "$comments.comment"}},
  {$out: "results"}
])
Export and clean up
mongoexport --db some_db -c results --out /tmp/output.json # from shell
/* back in mongo */
db.results.drop()
Bash

comm

comm -23 <(sort file1.txt) <(sort file2.txt) # show just unique to file1
comm -13 <(sort file1.txt) <(sort file2.txt) # show just unique to file2
comm -12 <(sort file1.txt) <(sort file2.txt) # show lines appearing in both
awk

# drop lines that have *rna* in some column and reduce output to other columns
awk -F"\t" -v OFS="\t" 'tolower($3) !~ /rna/ {print $4,$6}' file1.tsv | sort | uniq

# append column to file and concatatenate along row-dimension
v1=$(awk -F"\t" -v OFS="\t" '{print $0, "1"}' positive.txt)
v2=$(awk -F"\t" -v OFS="\t" '{print $0, "0"}' negative.txt)
echo "$v2" "$v1" | sort | uniq > training_labels.tsv

# Count number of columns in file:
head -n 1 FILE | awk '{print NF}'
grep

cat file_to_be_filtered.txt | grep -Fwf strings_to_be_matched.txt > output.txt # keep just lines that match string
cat strings_to_be_matched.txt | grep -Fwf - file_to_be_filtered.txt > output.txt # this is the same
cat file_to_be_filtered.txt | grep -v -i -P "(string1|string2|...)" > output.txt # drop lines  that match string
permissions

chmod -R --reference=folderA/ folderB/ # recursively apply permissions as per mask
chown -R username:group directory # recursively set owner and group for a folder
scp

# copy a list of folders (or files) from remote to local
scp -r user@server:/location/{"$(cat folder_list.txt | tr '\n' ',' | sed 's/,$//')"} .
scp -r $(cut -f1 -d" " ../folder_list.txt | tr '\n' ' ' ) user@server:/location # do it the other way around
other

Count number of lines in file:
wc -l <file>`
Get length of a string:
expr length "CTGGAAAAGCAAAATTTGGATTTGTGGTTCAATCCACCATCTTTACTCAG"
Redirecting output:
2> <file> stderr, 1> <file> stdout, 2> <file> 1>&2 both
Get list of installed packages:
dpkg --list | grep compiler
path manipulations

Add executable in to your PATH (if 'permanent' <- change ~/.bashrc or ~/.profile)
export PATH=$PATH:/path/to/folder-with-executable && source ~/.bashrc && source ~/.profile
See what is on your path:
echo $PATH
Remove Entry from Path:
export PATH=${PATH%:/home/user/Software/some_tool}
Python

def rev_complement_seq(seq):
    """ reverse complements a sequence

    For ANY probe to hybridise it has to be the RC of cDNA.
    Does not work with any special characters in the sequence.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # alphabet = set(["C","T","A","G"])
    assert all([i in complement.keys() for i in set(seq)])
    revseq = reversed(seq)
    revcompseq = "".join(complement.get(base) for base in revseq)
    return revcompseq
Right-adjust to common length:
r_adjusted = "".join([x.rjust(4,"0") for x in coords.split(":")])
Snakemake

Example execution wrapper for python scripts exectued by Snakemake.
def main(snakemake):
    """Execution wrapper"""

    logfile = snakemake.log[0]
    if logfile:
        orig_stdout = sys.stdout
        lf = open (logfile, mode = "w")
        sys.stdout = lf

    start_time = timeit.default_timer()
    
    # do stuff
    
    end_time = timeit.default_timer()
    print("Finished in {:.3f} s".format(end_time - start_time))

    if logfile:
        sys_stdout = orig_stdout
        lf.close()

if __name__ == "__main__":
    main(snakemake)
Bowtie build

rule bowtie_build:
    """
    Build bowtie index from unzipped reference fasta
        input: ancient(rules.merge_ref.output)
        shell: "bowtie-build -f --threads {threads} --seed 0 {input} {params.base_name} &> {log}"

    References:

    # Option 1:
        # Input to bowtie-build must be COMMA SEPARATED list of UNZIPPED files.
        input: ancient(rules.unzip_data.output.ref)
        shell: "bowtie-build -f --threads {threads} --seed 0 $(echo {input} | sed 's/ /,/g') {params.base_name} &> {log}"
    # Option 2: (preferred)
        # Input can be also a single file that was already combined
        bowtie-build: http://bowtie-bio.sourceforge.net/manual.shtml#the-bowtie-build-indexer
    """
    input:
        ancient(rules.merge_ref.output)
    output:
        index = expand("{ind_dir}/{prefix}.{suffix}.ebwt",
                        suffix = suffix, prefix = prefix, ind_dir = ind_dir)
    params:
        base_name = join(ind_dir, prefix),
    log:
        "logs/bowtie_build/{}.log".format(align_prefix)
    threads: 2
    shell:
        "bowtie-build -f --threads {threads} --seed 0 {input} "
        "{params.base_name} &> {log}"
Samtools sort

rule samtools_sort:
    """
    Sorts alignments and converts to binary format

    References:
        samtools: http://www.htslib.org/doc/samtools.html
    """
    input:
        rules.bowtie_align.output.sam
        # rules.clean_alignments.output.filt # for BLAST
    output:
        expand( "{ali_dir}/{align_prefix}.bam", ali_dir = ali_dir,
                align_prefix = align_prefix)
    threads: 2
    log:
        "logs/samtools/{}.log".format(align_prefix)
    params:
        tmp_dir = join(ali_dir, "tmp")
    shell:
        "samtools view -bS {input} | samtools sort -T {params.tmp_dir} "
        "-o {output} -@ {threads} &> {log}"
R

Detach all explicitly loaded packages:
lapply(paste('package:',names(sessionInfo()$otherPkgs),sep=""),detach,character.only=TRUE,unload=TRUE)
List all loaded packages:
pkg_list <- c(search()[(grepl("package:*", search()) & !grepl("package:base", search()))])
Install packages from local file:
install.packages("/path/to/pckg/pckg.tar.gz", source = TRUE, repos = NULL)
Inspecting objects:
# To see how many parameters you can change try the args function:
args(align)
# To see source code of any function:
package:::function
# Inspecting R objects:
attributes(object)
str(object)
Loading and saving variables to/from .RData file
save.image("file.RData")
e1 <- new.env()
load("file.RData", e1) # Load the workspace image to a new environment
myvar <- get("myvar", e1) # Get the desired object from the environment
rm(e1)
Making a CDF package

To obtain a valid binary CDF file  from either binary or ASCII cdf file:
#' Get a valid CDF annotation file from ASCII cdf file
#'
.make_binary_cdf_from_file <- function(annodb, annodata_path, chiptype){

  assertthat::assert_that(file.exists(annodb))

  out_path <- file.path(annodata_path, paste0(chiptype, ".cdf"))
  if (file.exists(out_path)){
    print(paste("Using file at:", out_path))
  } else {
    print("Converting ASCII CDF to binary....")
    affxparser::convertCdf(annodb, out_path, force = TRUE)
  }

  return(tools::file_path_sans_ext(basename(out_path)))
}
To pull out a CDF from existing package:
#' Get a valid CDF annotation file from existing R package
#'
#' @details
#' package must be installed (e.g. from bioconductor)
.get_cdf_from_package <- function(annodb, annodata_path, rawdata_path){
  suppressPackageStartupMessages(library(annodb, character.only = T))
  fnames <- list.files(rawdata_path, full.names = T)
  targetf <- file.path(annodata_path, paste0(gsub("cdf$", "",annodb), ".cdf"))

  if (!(file.exists(targetf))){
    print("Pulling out CDF from package....")
    pathname <- aroma.affymetrix::env2Cdf(annodb, fnames[[1]])
    system(paste0("mv -v ", pathname, " ", annodata_path))
  } else{
    print(paste0("Using existing cdf at: ",targetf," ...."))
    pathname <- targetf
  }

  return(tools::file_path_sans_ext(basename(pathname)))
}
To make a CDF package from binary CDF:
#' @title make_cdf_package: Create R package from binary CDF
#'
#' @description ....
#'
#' @param cdf_name path to cdf file
#' @param species_name caption for the species
#' @param verbose bool, controls vebrosity
#'
#' @return TRUE, invisibly
#'
#' @importFrom makecdfenv make.cdf.package
#'
#' @export
make_cdf_package <- function(cdf_file, species_name, verbose = TRUE){
  require(makecdfenv)

  filename <- cdf_file
  cdf_name <- tools::file_path_sans_ext(basename(cdf_file))
  if (!grepl(".cdf$", filename)){ # append extension if needed
    filename <- paste0(filename,".cdf")
  }

  cdf_path <- dirname(filename)
  makecdfenv::make.cdf.package(basename(filename), cdf_name, cdf_path, cdf_path,
                               author = "Me", unlink = TRUE, verbose = verbose,
                               species = species_name)
  invisible(TRUE)
}
Perl

tba
Git

cp $(git ls-files -m) ~/some/folder # copy all modified files
Undo changes to all files but some
git add file-or-folder/to/keep
git checkout . # throws away all changes
git reset # unstages the added files