Instantly share code, notes, and snippets.

View SRNN-vs-LSTM
import matplotlib.pyplot as plt
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense, SimpleRNN
# Experiment-wide settings, bound in one statement.
# NOTE(review): the roles of these values (sample count, repetitions, training
# epochs) are inferred from their names only — confirm against the full script.
N, num_repeats, num_epochs = 10000, 30, 5
# sequence length options
View spark-custom-aggregator
import scala.collection.mutable.Map
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.Encoders
import spark.implicits._
import org.apache.spark.sql.types._
View grid-cv
# related SO question: https://stackoverflow.com/questions/46351157/why-gridsearchcv-in-scikit-learn-spawn-so-many-threads
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
# Log-spaced grid of values: base-10 exponents run from -2 up to (but not
# including) 2 in steps of 0.1.
_exponents = np.arange(-2, 2, 0.1)
Cs = np.power(10, _exponents)
View eda_imports.py
import os
import io
import gzip
import time
import sys
import glob
import json
import re
import csv
import datetime
View gene_char_count.py
import pandas as pd
# Path to a local download of
# https://github.com/bcgsc/KLEAT/blob/master/ensembl.fixed.sorted.gz;
# the empty string is a placeholder and must be replaced before running.
gtf = ''

# The file is read as gzip-compressed, tab-separated text with no header row,
# so the nine column names are supplied explicitly.
gtf_columns = [
    'seqid', 'source', 'type', 'start', 'end',
    'score', 'strand', 'phase', 'attributes',
]
df_gtf = pd.read_csv(
    gtf,
    compression='gzip',
    sep='\t',
    header=None,
    names=gtf_columns,
)

# The attribute key is called gene_id in this gtf, but (per the original
# author) it was misnamed at creation time and actually holds the gene name,
# hence the capture group is named gene_name.
attributes = df_gtf['attributes']
gene_names = attributes.str.extract(r'gene_id\ \"(?P<gene_name>.+?)\";', expand=False)
View execute.py
def execute(cmd, flag_file=None, msg_id='', debug=False):
"""
# http://stackoverflow.com/questions/1606795/catching-stdout-in-realtime-from-subprocess
:param cmd: should never include a pipe or redirection, which would require
a new shell process
This execute logs all stdout and stderr, which could look funny, especially
when it comes to tools like aspc and wget
"""
logger.info('executing: {0}'.format(cmd))
# todo: should check whether cmdsp includes pipe or redirection here