Stanislaw Jastrzebski kudkudak

## gist:155eedffc6f850bc85d4
import sys

sys.path.append("/lhome/home/jastrzebski/mol2vec/mol2vec")
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
from sklearn.cross_validation import StratifiedKFold
from experiments.utils import wac_score
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from training_data.datasets import *

## QGB_test.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                kudkudak
                / QGB_test.ipynb
            
            
              Created
              January 16, 2016 11:28
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## clustering_1.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                kudkudak
                / clustering_1.md
            
            
              Last active
              December 9, 2015 17:17
            
              
                clustering_1
              
          
    Klastry kluczy MACCS wygenerowane przez sklastrowanie wektorów, które wygenerowałem dla każdego bitu z MACCS.
Każdy klaster jest opisany SMARTS-em związku (czasami jest to "?"; może to być artefakt RDKit, którego używałem), który jest najbliżej środka klastra. Pod obrazkiem zamieściłem listy wszystkich kluczy w danym klastrze.
Każdy SMARTS (klucz MACCS) reprezentuję ok. 5 strukturami, które do niego pasują. Czyli na każdy klaster przypada ok. 5*ilość_kluczy_w_klastrze obrazków.
Zamieszczam też dwa klastrowania. Czy są jakieś różnice pomiędzy nimi?
Na końcu zamieściłem wyniki klasyfikacji 5ht6.

  
## filter_two_dataset.py

# coding: utf-8

# In[1]:

get_ipython().magic(u'load_ext autoreload')
get_ipython().magic(u'autoreload 2')
import sys
sys.path.append("/afs/inf.ed.ac.uk/user/v/v1sjastr/scratch/v1sjastr/go-deep/")
import numpy as np

## alp_psuedo.py
# Ensure somehow we have some samples known

def fit(X, y):
  """Pseudo-code sketch of a query loop (only this fragment is visible).

  Each iteration obtains `indices` from `self._query_labels(X, y)` and
  passes them to `oracle(indices, y)`; the loop exits on the first falsy
  result returned by `oracle`.

  NOTE(review): `self` is not a parameter and `oracle` is not defined in
  this fragment -- presumably this is meant to live inside a class; confirm.
  """
  while True:
    # presumably the indices whose labels should be requested next -- confirm
    indices = self._query_labels(X, y)

    if not oracle(indices, y):
      # No budget
      break

## wac.py
def wac_score(y_true, y_pred):
    """
    Parameters
    ----------
    y_true: numpy.ndarray
        True labels
    y_pred: numpy.ndarray
        Estimated labels
    Returns
    -------

## run_nn_example.py
import sys
sys.path.append("/home/ubuntu/hackathon_bes")
from utils import *
from run_nn import *
# `get_data` and `fit_predict` are not defined in this script; they are
# expected to be provided by the star imports (utils / run_nn).
data = get_data("christine")

# The first two positional arguments are the "X_train" and "Y_train"
# entries; the "X_test" and "X_valid" entries are passed together, in that
# order, as `test_datasets`. The remaining keywords are fixed settings.
fit_predict(
    data["X_train"],
    data["Y_train"],
    test_datasets=[data["X_test"], data["X_valid"]],
    learning_rate=0.01,
    L1_reg=0.00,
    L2_reg=0.000001,
    n_epochs=200,
    batch_size=100
)

## gist:fb70db9d86883764d95e
def pick_h(protein, fingerprint, loader, model=TWELM, full_param_grid = {'C': list(np.logspace(0, 5, 6)),
                               'h': [100, 200, 500, 1000]}):
    """Load the data for a single (protein, fingerprint) pair.

    NOTE(review): only the beginning of this function is visible here;
    `model` and `full_param_grid` are not used in the visible part.

    NOTE(review): the default for `full_param_grid` is a dict built once at
    definition time and shared by every call that omits the argument; any
    in-place modification would leak between calls. Consider a `None`
    default if the rest of the body mutates it.
    """
    # Single-element list holding the [protein, fingerprint] pair.
    comps = [[protein, fingerprint]]

    # Rebind to a deep copy -- presumably so the caller's loader object is
    # not affected by what happens to it afterwards.
    loader = copy.deepcopy(loader)

    # No preprocessing functions are passed on.
    preprocess_fncs = []

    # NOTE(review): subscripting `.values()` works only where it returns a
    # list (Python 2 dict); on Python 3 a dict view is not subscriptable.
    # Assumes get_data returns a dict -- confirm.
    data = get_data(comps, loader, preprocess_fncs).values()[0][0][0]


## gist:7fe5986216bee619a35b
def hit_and_run(X, Y, N=100, T=10, sub_sample_size=100, eps=0.5):
    """
    NOTE(review): only the beginning of this function is visible here; the
    code that uses N, T, sub_sample_size and eps is not shown.

    @param X indexable by `Y.known_ids`
    @param Y object exposing a `known_ids` attribute and indexable by it
    @param N number of hypothesis points wanted
    @param T number of mixing iterations
    @param sub_sample_size how many hypothesis samples to take on the ray.
    @param eps noise level. Note that it should be pretty big
    """

    # Restrict both inputs to the entries selected by Y.known_ids. X is
    # indexed first, while Y still refers to the original object; the second
    # line rebinds Y.
    X = X[Y.known_ids]
    Y = Y[Y.known_ids]

## gist:d7e8a019433101e087b8
[('actives/beta2_actives_cluster1_KlekFP.csv', 1), ('actives/beta2_actives_cluster2_KlekFP.csv', 2), ('actives/beta2_actives_cluster3_KlekFP.csv', 3), ('actives/beta2_actives_cluster4_KlekFP.csv', 4), ('actives/beta2_actives_cluster5_KlekFP.csv', 5), ('actives/beta2_actives_cluster6_KlekFP.csv', 6)] inactives/beta2_inactives_KlekFP.csv
Sizes:  [408, 548, 548, 22, 2, 114]
Clusters  2  and  1
Shared examples # :  474
Indices of examples:
( 0 0 )
( 1 1 )
( 2 2 )
( 3 3 )
( 144 4 )
	import sys

	sys.path.append("/lhome/home/jastrzebski/mol2vec/mol2vec")
	import logging
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
	from sklearn.cross_validation import StratifiedKFold
	from experiments.utils import wac_score
	from sklearn.svm import SVC
	from sklearn.grid_search import GridSearchCV
	from training_data.datasets import *

	# coding: utf-8

	# In[1]:

	get_ipython().magic(u'load_ext autoreload')
	get_ipython().magic(u'autoreload 2')
	import sys
	sys.path.append("/afs/inf.ed.ac.uk/user/v/v1sjastr/scratch/v1sjastr/go-deep/")
	import numpy as np
	# Ensure somehow we have some samples known

	def fit(X, y):
	while True:
	indices = self._query_labels(X, y)

	if not oracle(indices, y):
	# No budget
	break
	def wac_score(y_true, y_pred):
	"""
	Parameters
	----------
	y_true: numpy.ndarray
	True labels
	y_pred: numpy.ndarray
	Estimated labels
	Returns
	-------
	import sys
	sys.path.append("/home/ubuntu/hackathon_bes")
	from utils import *
	from run_nn import *
	data = get_data("christine")
	fit_predict(data["X_train"], data["Y_train"], test_datasets = [data["X_test"], data["X_valid"]],
	learning_rate=0.01, L1_reg=0.00, L2_reg=0.000001, n_epochs=200, batch_size = 100)
	def pick_h(protein, fingerprint, loader, model=TWELM, full_param_grid = {'C': list(np.logspace(0, 5, 6)),
	'h': [100, 200, 500, 1000]}):
	comps = [[protein, fingerprint]]

	loader = copy.deepcopy(loader)

	preprocess_fncs = []

	data = get_data(comps, loader, preprocess_fncs).values()[0][0][0]
	def hit_and_run(X, Y, N=100, T=10, sub_sample_size=100, eps=0.5):
	"""
	@param N hypothesis of points wanted
	@param T number of mixing iterations
	@param sub_sample_size how many samples of hypothesis take on the ray.
	@param eps noise level. Note that it should be pretty big
	"""

	X = X[Y.known_ids]
	Y = Y[Y.known_ids]
	[('actives/beta2_actives_cluster1_KlekFP.csv', 1), ('actives/beta2_actives_cluster2_KlekFP.csv', 2), ('actives/beta2_actives_cluster3_KlekFP.csv', 3), ('actives/beta2_actives_cluster4_KlekFP.csv', 4), ('actives/beta2_actives_cluster5_KlekFP.csv', 5), ('actives/beta2_actives_cluster6_KlekFP.csv', 6)] inactives/beta2_inactives_KlekFP.csv
	Sizes: [408, 548, 548, 22, 2, 114]
	Clusters 2 and 1
	Shared examples # : 474
	Indices of examples:
	( 0 0 )
	( 1 1 )
	( 2 2 )
	( 3 3 )
	( 144 4 )