Philipp Münch philippmuench
@philippmuench
philippmuench / check_fasta.py
Created July 13, 2023 14:06
check for malformed FASTA files
import os
import argparse
import random

def is_fasta(filename):
    try:
        with open(filename, 'r') as f:
            first_line = f.readline().strip()
            if not first_line:
                return 'empty'
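The gist preview above is cut off after the empty-file check. A self-contained sketch of what such a checker might look like in full (the `missing_header` and `unreadable` return values are assumptions, not part of the original gist):

```python
import os
import tempfile

def is_fasta(filename):
    """Classify a file by its first line: a FASTA record must start with '>'."""
    try:
        with open(filename, 'r') as f:
            first_line = f.readline().strip()
            if not first_line:
                return 'empty'
            if not first_line.startswith('>'):
                return 'missing_header'
            return 'ok'
    except (OSError, UnicodeDecodeError):
        return 'unreadable'

# Exercise the check on three throwaway files.
cases = {'ok': '>seq1\nACGT\n', 'empty': '', 'missing_header': 'ACGT\n'}
for expected, content in cases.items():
    with tempfile.NamedTemporaryFile('w', suffix='.fasta', delete=False) as tmp:
        tmp.write(content)
        path = tmp.name
    print(expected, is_fasta(path))
    os.unlink(path)
```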
## app.R ##
library(shinydashboard)
library(shiny)
library(keras)
library(deepG)
library(ggplot2)
library(dplyr)
library(DT)
library(hdf5r)
library(plotly)
---
title: "GenomeNet Viewer"
output:
  flexdashboard::flex_dashboard:
    orientation: rows
    social: menu
    theme: united #cerulean
    source_code: embed
runtime: shiny
---
@philippmuench
philippmuench / gist:6b9bbb9f9f987ab22efb573f9f19160f
Created August 3, 2020 13:39
train for wavenet binary target
#' @title Trains a (mostly) LSTM model on genomic data. Designed for developing genome based language models (GenomeNet)
#'
#' @description
#' Depth and number of neurons per layer of the network can be specified. The first layer can be a Convolutional Neural Network (CNN) designed to capture codons.
#' If a path to a folder containing FASTA files is provided, batches will be generated by an external generator, which
#' is recommended for big training sets. Alternatively, a dataset can be supplied that holds the preprocessed batches (generated by \code{preprocessSemiRedundant()})
#' and keeps them in RAM. Also supports training on instances with multiple GPUs and scales linearly with the number of GPUs present.
#' @param train_type Either "lm" for language model, "label_header" or "label_folder". A language model is trained to predict the next character in a sequence.
#' label_header/label_folder are trained to predict a corresponding class, given a sequence as input. If "label_header", the class will be read from f
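The semi-redundant preprocessing the docstring refers to can be sketched as follows. This is a Python illustration of the general technique, not deepG's actual `preprocessSemiRedundant()` implementation; the window length and step size are arbitrary:

```python
def semi_redundant_chunks(sequence, maxlen=10, step=3):
    """Slice a sequence into overlapping windows; each window is a
    language-model input and the character right after it is the target."""
    inputs, targets = [], []
    for i in range(0, len(sequence) - maxlen, step):
        inputs.append(sequence[i:i + maxlen])
        targets.append(sequence[i + maxlen])
    return inputs, targets

X, y = semi_redundant_chunks("ACGTACGTACGTACGTACGT", maxlen=10, step=3)
print(X[0], y[0])  # → ACGTACGTAC G
```

Because consecutive windows overlap by `maxlen - step` characters, the batches are "semi-redundant": the model sees each position several times in slightly shifted contexts.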
trainMinimalFunctionalAPI <- function(path = "example_files/fasta") {
  library(wavenet)
  message("Initialize model! This can take a few minutes.")
  maxlen <- 1000
  input <- keras::layer_input(batch_shape = c(64, maxlen, 6))
  # https://github.com/ibab/tensorflow-wavenet/blob/master/wavenet/ops.py#L46
  first <- keras::layer_conv_1d(
trainMinimalFunctionalAPI <- function(path = "example_files/fasta") {
  message("Initialize model! This can take a few minutes.")
  input <- keras::layer_input(batch_shape = c(256, 50, 6))
  cnn <-
    keras::layer_conv_1d(
      object = input,
      kernel_size = 3,
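Both snippets declare an input of shape (batch, maxlen, 6), i.e. a one-hot encoding over a 6-character vocabulary. The preview does not show which six characters deepG maps to those channels; assuming the four nucleotides plus an ambiguity code and a padding symbol, the encoding step can be sketched in Python as:

```python
import numpy as np

# Assumed 6-character vocabulary: 4 nucleotides, ambiguous base, padding.
# (Hypothetical mapping for illustration; not taken from deepG's source.)
VOCAB = ['A', 'C', 'G', 'T', 'N', '_']
CHAR_TO_IDX = {c: i for i, c in enumerate(VOCAB)}

def one_hot_encode(seq, maxlen=50):
    """Right-pad seq to maxlen and one-hot encode it to shape (maxlen, 6)."""
    seq = seq[:maxlen].ljust(maxlen, '_')
    out = np.zeros((maxlen, len(VOCAB)), dtype=np.float32)
    for pos, char in enumerate(seq):
        # Unknown characters fall back to the ambiguous-base channel.
        out[pos, CHAR_TO_IDX.get(char, CHAR_TO_IDX['N'])] = 1.0
    return out

x = one_hot_encode("ACGTN")
print(x.shape)  # → (50, 6)
```

Stacking 256 such arrays yields exactly the `batch_shape = c(256, 50, 6)` tensor the second snippet declares.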
The Human Microbiome Project Consortium
Curtis Huttenhower, Dirk Gevers, Rob Knight, Sahar Abubucker, Jonathan H. Badger, Asif T. Chinwalla, Heather H. Creasy, Ashlee M. Earl, Michael G. FitzGerald, Robert S. Fulton, Michelle G. Giglio, Kymberlie Hallsworth-Pepin, Elizabeth A. Lobos, Ramana Madupu, Vincent Magrini, John C. Martin, Makedonka Mitreva, Donna M. Muzny, Erica J. Sodergren, James Versalovic, Aye M. Wollam, Kim C. Worley, Jennifer R. Wortman, Sarah K. Young, Qiandong Zeng, Kjersti M. Aagaard, Olukemi O. Abolude, Emma Allen-Vercoe, Eric J. Alm, Lucia Alvarado, Gary L. Andersen, Scott Anderson, Elizabeth Appelbaum, Harindra M. Arachchi, Gary Armitage, Cesar A. Arze, Tulin Ayvaz, Carl C. Baker, Lisa Begg, Tsegahiwot Belachew, Veena Bhonagiri, Monika Bihan, Martin J. Blaser, Toby Bloom, Vivien Bonazzi, J. Paul Brooks, Gregory A. Buck, Christian J. Buhay, Dana A. Busam, Joseph L. Campbell, Shane R. Canon, Brandi L. Cantarel, Patrick S. G. Chain, I-Min A. Chen, Lei Chen, Shaila Chhibba, Ken Chu, Dawn M. C
remove.packages("deepG")
devtools::install_github("hiddengenome/deepG@prepro")
tensorflow::install_tensorflow(version="1.12.0-gpu", method = "conda", conda_python_version = "2.7")
library(deepG)
hist <- trainNetwork(
  path = "/scratch/pmuench/crispr_refseq/with_crispr",
  use.cudnn = FALSE,
  use.codon.cnn = FALSE,
  maxlen = 80,
  batch.size = 500,
  run.name = "CrisprNet_v1_no_cnn",
  epochs = 50,
  steps.per.epoch = 1000,
  layers.lstm = 3,
  max.queue.size = 100,
  dropout.rate = 0.15,
  layer.size = 5,
  vocabulary.size = 5)
# ValueError: No data provided for "lstm_input". Need data for each key in: ['lstm_input']
# requirement: anaconda
conda install virtualenv
python3 -m venv ~/deepG_env
source ~/deepG_env/bin/activate
# deepG is tested with tensorflow 1.14
pip3 install https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.14.0-cp37-cp37m-linux_x86_64.whl
pip3 install keras
# setup cuda, required is cuda10 and cudnn 7.4
conda install -c anaconda cudatoolkit
@philippmuench
philippmuench / install.sh
Last active August 12, 2019 18:21
installation of CRISPRCasFinder
wget "https://crisprcas.i2bc.paris-saclay.fr/Home/DownloadFile?filename=CRISPRCasFinder.zip" -O CRISPRCasFinder.zip
unzip CRISPRCasFinder.zip
cd CRISPRCasFinder
# install perl modules
curl -L http://cpanmin.us | perl - App::cpanminus
cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
cpanm JSON::Parse
# install prodigal