Will Townes willtownes

## refgene2bed.py
"""
Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
All output files have the structure expected by bedtools, namely,
CHROM POSITION1 POSITION2 REFSEQ_ID
Possible output files include:
1. distal promoter (transcription start [-5KB,-1KB]); KB means kilobase pairs, not kilobytes
2. proximal promoter (transcription start [-1KB,1KB])
3. gene body (anywhere between transcription start and transcription end)
4. transcript (anywhere in an exon)- outputs each exon as a separate line
5. first 1/3 transcript- outputs each exon as a separate line

## mp_benchmark.py
"""
Testing python multiprocessing speed. Based on
http://blogs.warwick.ac.uk/dwatkins/entry/benchmarking_parallel_python_1_2/
"""

import time
import math
import multiprocessing as mp

def isprime(n):

## poisson_prediction_interval.Rmd
---
title: "Poisson prediction interval"
author: "Will Townes"
output: html_document
---

Poisson prediction interval based on [Kim et al 2022](https://doi.org/10.1002/wics.1568)

```{r}
n<-100

## loess_vs_gp.R
library(pdist)
n<-200 #number of simulated observations
#n x 1 matrix of inputs; runif(n) draws from [0,1], so entries lie in [0,10]
X<-matrix(10*runif(n),ncol=1)
#noise-free sine of the inputs; the commented-out rnorm term would add Gaussian noise (sd=.2)
y<-sin(X[,1])#+rnorm(n,sd=.2)
#plot(X[,1],y)
#xnew<-3

#span<-1

my_loess<-function(xnew,X,y,span=.75){

## monotone_spline.R
library(mgcv)
#library(modules) #devtools::install_github(klmr/modules)
#mgcv<-import_package("mgcv")

mspline<-function(x,y,k=10,lower=NA,upper=NA){
  #fits a monotonic spline to data
  #small values of k= more smoothing (flatter curves)
  #large values of k= more flexible (wiggly curves)
  #k is related to effective degrees of freedom and number of knots
  #use unconstrained gam to get rough parameter estimates

## grigorev_twitter_question.py
"""
From https://twitter.com/Al_Grigor/status/1357028887209902088
Most candidates cannot solve this interview problem:
* Input: "aaaabbbcca"
* Output: [("a", 4), ("b", 3), ("c", 2), ("a", 1)]
Write a function that converts the input to the output
I ask it in the screening interview and give it 25 minutes
How would you solve it?
"""

## gpflow_multioutput.py
from gpflow.conditionals import conditional
from gpflow.inducing_variables import SeparateIndependentInducingVariables
from gpflow.kernels import SeparateIndependent

#note: object 'm' is of type gpflow.models.svgp.SVGP

ind_conditional = conditional.dispatch(
    object, SeparateIndependentInducingVariables, SeparateIndependent, object)
gmu, gvar = ind_conditional(
    X,

## pythonprimes
def listprimes2(m):
    '''another attempt to list all primes below m'''
    values = range(m+1) #note that in this list the key and the value are the same.
    primes = values[:]
    primes[1] = 0 #1 doesn't count as a prime
    for i in values:
        if primes[i] == 0:
            pass
        else:
            for j in values[i+1:]:

## omptest.cpp
#include <omp.h>
#include <stdio.h>
//clang++ -Xpreprocessor -fopenmp -lomp omptest.cpp -o omptest

// Prints one greeting line per OpenMP thread, reporting that thread's id and
// the size of the thread team.
int main() {
    // Each thread of the parallel region runs the braced block once.
    #pragma omp parallel
    {
        const int thread_id = omp_get_thread_num();
        const int team_size = omp_get_num_threads();
        printf("Hello from thread %d, nthreads %d\n", thread_id, team_size);
    }
    return 0;
}

## fuzzy_clustering_eval.R
#Fuzzy version of Jaccard and Rand Indices
#based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures"

#Simulation setup: a random soft clustering X and two reference clusterings
#built alongside it (hard assignment Y and maximally uncertain Z), all NxL.
L<-4 #number of clusters
N<-50 #number of objects
#NxL soft random clustering; each row is a Dirichlet draw with concentration .01
X<-gtools::rdirichlet(N,rep(.01,L))
y<-apply(X,1,which.max) #hard cluster labels: most probable cluster per object
table(y) #cluster sizes (clusters that received no objects are not listed)
#NxL one-hot indicator matrix of y. Pinning the factor levels to 1..L keeps an
#all-zero column for any cluster that received no objects, so Y always has the
#same shape as X and Z.
Y<-model.matrix(~factor(y,levels=seq_len(L))-1)
Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering
	"""
	Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
	All output files have the structure expected by bedtools, namely,
	CHROM POSITION1 POSITION2 REFSEQ_ID
	Possible output files include:
	1. distal promoter (transcription start [-5KB,-1KB]); KB means kilobase pairs, not kilobytes
	2. proximal promoter (transcription start [-1KB,1KB])
	3. gene body (anywhere between transcription start and transcription end)
	4. transcript (anywhere in an exon)- outputs each exon as a separate line
	5. first 1/3 transcript- outputs each exon as a separate line
	"""
	Testing python multiprocessing speed. Based on
	http://blogs.warwick.ac.uk/dwatkins/entry/benchmarking_parallel_python_1_2/
	"""

	import time
	import math
	import multiprocessing as mp

	def isprime(n):
	---
	title: "Poisson prediction interval"
	author: "Will Townes"
	output: html_document
	---

	Poisson prediction interval based on [Kim et al 2022](https://doi.org/10.1002/wics.1568)

	```{r}
	n<-100
	library(pdist)
	n<-200 #number of simulated observations
	#n x 1 matrix of inputs; runif(n) draws from [0,1], so entries lie in [0,10]
	X<-matrix(10*runif(n),ncol=1)
	#noise-free sine of the inputs; the commented-out rnorm term would add Gaussian noise (sd=.2)
	y<-sin(X[,1])#+rnorm(n,sd=.2)
	#plot(X[,1],y)
	#xnew<-3

	#span<-1

	my_loess<-function(xnew,X,y,span=.75){
	library(mgcv)
	#library(modules) #devtools::install_github(klmr/modules)
	#mgcv<-import_package("mgcv")

	mspline<-function(x,y,k=10,lower=NA,upper=NA){
	#fits a monotonic spline to data
	#small values of k= more smoothing (flatter curves)
	#large values of k= more flexible (wiggly curves)
	#k is related to effective degrees of freedom and number of knots
	#use unconstrained gam to get rough parameter estimates
	"""
	From https://twitter.com/Al_Grigor/status/1357028887209902088
	Most candidates cannot solve this interview problem:
	* Input: "aaaabbbcca"
	* Output: [("a", 4), ("b", 3), ("c", 2), ("a", 1)]
	Write a function that converts the input to the output
	I ask it in the screening interview and give it 25 minutes
	How would you solve it?
	"""
	from gpflow.conditionals import conditional
	from gpflow.inducing_variables import SeparateIndependentInducingVariables
	from gpflow.kernels import SeparateIndependent

	#note: object 'm' is of type gpflow.models.svgp.SVGP

	ind_conditional = conditional.dispatch(
	object, SeparateIndependentInducingVariables, SeparateIndependent, object)
	gmu, gvar = ind_conditional(
	X,
	def listprimes2(m):
	'''another attempt to list all primes below m'''
	values = range(m+1) #note that in this list the key and the value are the same.
	primes = values[:]
	primes[1] = 0 #1 doesn't count as a prime
	for i in values:
	if primes[i] == 0:
	pass
	else:
	for j in values[i+1:]:
	#include <omp.h>
	#include <stdio.h>
	//clang++ -Xpreprocessor -fopenmp -lomp omptest.cpp -o omptest

	// Prints one greeting line per OpenMP thread, reporting that thread's id and
	// the size of the thread team.
	int main() {
	    // Each thread of the parallel region runs the braced block once.
	    #pragma omp parallel
	    {
	        const int thread_id = omp_get_thread_num();
	        const int team_size = omp_get_num_threads();
	        printf("Hello from thread %d, nthreads %d\n", thread_id, team_size);
	    }
	    return 0;
	}
	#Fuzzy version of Jaccard and Rand Indices
	#based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures"

	#Simulation setup: a random soft clustering X and two reference clusterings
	#built alongside it (hard assignment Y and maximally uncertain Z), all NxL.
	L<-4 #number of clusters
	N<-50 #number of objects
	#NxL soft random clustering; each row is a Dirichlet draw with concentration .01
	X<-gtools::rdirichlet(N,rep(.01,L))
	y<-apply(X,1,which.max) #hard cluster labels: most probable cluster per object
	table(y) #cluster sizes (clusters that received no objects are not listed)
	#NxL one-hot indicator matrix of y. Pinning the factor levels to 1..L keeps an
	#all-zero column for any cluster that received no objects, so Y always has the
	#same shape as X and Z.
	Y<-model.matrix(~factor(y,levels=seq_len(L))-1)
	Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering