Thomas Girke tgirke

## processFileInChunks.R
################################################
## Streaming through large tabular files in R ##
################################################
## Author: Thomas Girke
## Last update: 21-Dec-2018

## Utility: the function 'processFileInLineChunks' streams through a file in
## batches of lines and applies to each imported line batch a function assigned
## to the 'myFct' argument. The number of lines processed in each iteration can
## be defined under the 'n_rows' argument. As importer functions, both 'fread'

## PeakSeq.sh
##################################################
## Create mappability file for reference genome ##
##################################################
## Thomas Girke
## Date: May 16, 2018
## PeakSeq docs:
##   https://github.com/gersteinlab/PeakSeq
##   http://gensoft.pasteur.fr/docs/PeakSeq/1.1/PeakSeq.readme

## Note: mappability step should be skipped in newer version of PeakSeq

## hmmscan.R
##############################################
## Map Pfam Domains to Proteins with HMMER3 ##
##############################################
## Author: Thomas Girke
## Date: May 11, 2018

## Utility: mapping of Pfam domains to protein sequences.
## The module load and Pfam database paths given below are specific to the HPCC/biocluster system.
## For details consult the man page for hmmscan from the command-line with 'hmmscan -h'

## kegg_cmp_download.R
####################################################
## Import of KEGG Compounds into SDFset Container ##
####################################################
## Date: March 07, 2018
## Motivation: response to request on Bioc support list: https://support.bioconductor.org/p/106712/

################################
## Download KEGG CMPs via URL ##
################################

## alignStats.R
###################################################################
## Alignment Stats with Support for FASTQ Files with >500M Reads ##
###################################################################
alignStats <- function(args) {
    fqpaths <- infile1(args)
    bampaths <- outpaths(args)
    bamexists <- file.exists(bampaths)
    fqpaths <- fqpaths[bamexists]
    bampaths <- bampaths[bamexists]
    ## Obtain total read number from FASTQ files

## test.R
# Integer sequence 1, 2, ..., 12.
x <- seq_len(12L)

## qualityTrimming
#####################################
## Quality Trimming of FASTQ Reads ##
#####################################
## Author: Thomas Girke
## Last update: May 30, 2016
## Usage of the function below combined with preprocessReads from systemPipeR:
# qcTrim <- "qualityTrimming(fq, phred_cutoff=20, cutoff_occurrences=1, N_cutoff=1, minreadlength=100)"
# preprocessReads(args=args, Fct=qcTrim, batchsize=100000, overwrite=TRUE, compress=TRUE)

## Arguments:

## appendCounter.R
############################
## appendCounter Function ##
############################
## Author: Thomas Girke
## Last update: 04-Oct-15

## Function to append occurrence counter to entries in character
## vector and return the results as named vector where the
## original data are in the same order in the data slot
## and the counting result in the name slot.

## test.R
# Row-subsetting one-liners. 'my_frame' is not defined in this snippet and must already exist in the session (the second example reads its columns 'y2' and 'y3').
my_frame[!duplicated(my_frame[,2]),] # Returns 'my_frame' without the rows whose value in column 2 duplicates that of an earlier row.
my_frame[my_frame$y2 > my_frame$y3,] # Prints all rows of the data frame where the value in column 'y2' is greater than the value in column 'y3'. Comparison operators are: == (equal), != (not equal), >= (greater than or equal), etc. Logical operators are & (and), | (or) and ! (not).
x <- 0.5:10; x[x<1.0] <- -1/x[x<1.0] # Replaces all values in the vector that are below 1 with their negative reciprocal (here 0.5 becomes -2).
x <-data.frame(month=month.abb[1:12], AB=LETTERS[1:2], no1=1:48, no2=1:24); x[x$month == "Apr" & (x$no1 == x$no2 | x$no1 > x$no2),] # Prints all records of frame 'x' that contain 'Apr' AND have equal values in columns 'no1' and 'no2' OR have greater values in column 'no1'.
x[x[,1] %in% c("Jun", "Aug"),] # Retrieves the rows whose first-column value matches an entry of the query vector.
x[c(grep("\\d{2}", as.character(x$no1), perl = TRUE)),] # Prints all rows of a data frame where a regular expression matches (here: all values in column 'no1' with at least two digits).
x[c(g

## gist:4101283
#####################################################
## (A) Add fingerprint folding argument to desc2fp ##
##########################################
## Intersect and Venn Diagram Functions ##
##########################################
## Author: Thomas Girke
## Last update: March 24, 2012
## Utilities:
## (1) Venn Intersects
##     Computation of Venn intersects among 2-20 or more sample sets using the typical
	################################################
	## Streaming through large tabular files in R ##
	################################################
	## Author: Thomas Girke
	## Last update: 21-Dec-2018

	## Utility: the function 'processFileInLineChunks' streams through a file in
	## batches of lines and applies to each imported line batch a function assigned
	## to the 'myFct' argument. The number of lines processed in each iteration can
	## be defined under the 'n_rows' argument. As importer functions, both 'fread'
	##################################################
	## Create mappability file for reference genome ##
	##################################################
	## Thomas Girke
	## Date: May 16, 2018
	## PeakSeq docs:
	## https://github.com/gersteinlab/PeakSeq
	## http://gensoft.pasteur.fr/docs/PeakSeq/1.1/PeakSeq.readme

	## Note: mappability step should be skipped in newer version of PeakSeq
	##############################################
	## Map Pfam Domains to Proteins with HMMER3 ##
	##############################################
	## Author: Thomas Girke
	## Date: May 11, 2018

	## Utility: mapping of Pfam domains to protein sequences.
	## The module load and Pfam database paths given below are specific to the HPCC/biocluster system.
	## For details consult the man page for hmmscan from the command-line with 'hmmscan -h'
	####################################################
	## Import of KEGG Compounds into SDFset Container ##
	####################################################
	## Date: March 07, 2018
	## Motivation: response to request on Bioc support list: https://support.bioconductor.org/p/106712/

	################################
	## Download KEGG CMPs via URL ##
	################################
	###################################################################
	## Alignment Stats with Support for FASTQ Files with >500M Reads ##
	###################################################################
	alignStats <- function(args) {
	fqpaths <- infile1(args)
	bampaths <- outpaths(args)
	bamexists <- file.exists(bampaths)
	fqpaths <- fqpaths[bamexists]
	bampaths <- bampaths[bamexists]
	## Obtain total read number from FASTQ files
	#####################################
	## Quality Trimming of FASTQ Reads ##
	#####################################
	## Author: Thomas Girke
	## Last update: May 30, 2016
	## Usage of the function below combined with preprocessReads from systemPipeR:
	# qcTrim <- "qualityTrimming(fq, phred_cutoff=20, cutoff_occurrences=1, N_cutoff=1, minreadlength=100)"
	# preprocessReads(args=args, Fct=qcTrim, batchsize=100000, overwrite=TRUE, compress=TRUE)

	## Arguments:
	############################
	## appendCounter Function ##
	############################
	## Author: Thomas Girke
	## Last update: 04-Oct-15

	## Function to append occurrence counter to entries in character
	## vector and return the results as named vector where the
	## original data are in the same order in the data slot
	## and the counting result in the name slot.
	# Row-subsetting one-liners. 'my_frame' is not defined in this snippet and must already exist in the session (the second example reads its columns 'y2' and 'y3').
	my_frame[!duplicated(my_frame[,2]),] # Returns 'my_frame' without the rows whose value in column 2 duplicates that of an earlier row.
	my_frame[my_frame$y2 > my_frame$y3,] # Prints all rows of the data frame where the value in column 'y2' is greater than the value in column 'y3'. Comparison operators are: == (equal), != (not equal), >= (greater than or equal), etc. Logical operators are & (and), | (or) and ! (not).
	x <- 0.5:10; x[x<1.0] <- -1/x[x<1.0] # Replaces all values in the vector that are below 1 with their negative reciprocal (here 0.5 becomes -2).
	# The OR operator below is a plain '|'; the escaped form '\|' is a markdown artifact and does not parse as R.
	x <-data.frame(month=month.abb[1:12], AB=LETTERS[1:2], no1=1:48, no2=1:24); x[x$month == "Apr" & (x$no1 == x$no2 | x$no1 > x$no2),] # Prints all records of frame 'x' that contain 'Apr' AND have equal values in columns 'no1' and 'no2' OR have greater values in column 'no1'.
	x[x[,1] %in% c("Jun", "Aug"),] # Retrieves the rows whose first-column value matches an entry of the query vector.
	x[c(grep("\\d{2}", as.character(x$no1), perl = TRUE)),] # Prints all rows of a data frame where a regular expression matches (here: all values in column 'no1' with at least two digits).
	x[c(g
	#####################################################
	## (A) Add fingerprint folding argument to desc2fp ##
	##########################################
	## Intersect and Venn Diagram Functions ##
	##########################################
	## Author: Thomas Girke
	## Last update: March 24, 2012
	## Utilities:
	## (1) Venn Intersects
	## Computation of Venn intersects among 2-20 or more sample sets using the typical