Sergey Kolchenko azkalot1

  • Cellarity
  • Chicago
@azkalot1
azkalot1 / load_depend.py
Created February 12, 2019 22:28
load_depend.py
import csv
import gzip
import os
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap
from sklearn.cluster import Birch, AffinityPropagation, DBSCAN, MeanShift, SpectralClustering, AgglomerativeClustering, estimate_bandwidth
from sklearn.neighbors import kneighbors_graph
#reading 10X data as stated at support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices
matrix_dir = "filtered_feature_bc_matrix"
mat = scipy.io.mmread(os.path.join(matrix_dir, "matrix.mtx"))
# matrix.mtx is stored as features (rows) x barcodes (columns); transpose so rows are cells and columns are genes
mat = np.array(mat.todense()).T
features_path = os.path.join(matrix_dir, "features.tsv")
annotation = pd.read_csv(features_path,sep='\t',header=None)
annotation.columns = ['feature_ids','gene_names','feature_types']
barcodes_path = os.path.join(matrix_dir, "barcodes.tsv")
barcodes = [line.strip() for line in open(barcodes_path, 'r')]
print('Matrix dimensionality {}'.format(mat.shape))
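# Optional sanity check (an addition, not part of the original gist): with cells as rows and
# genes as columns, the matrix should line up with the barcodes and the feature table;
# wrapping the counts in a labeled DataFrame (the name `counts_df` is ours) makes spot checks easier.
assert mat.shape == (len(barcodes), annotation.shape[0])
counts_df = pd.DataFrame(mat, index=barcodes, columns=annotation['gene_names'].values)
print(counts_df.iloc[:5, :5])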
f, ax = plt.subplots(1, 2, figsize=(15, 5))
per_cell_sum = mat.sum(axis=1)
ax[0].hist(np.log10(per_cell_sum + 1));
ax[0].set_title('Distribution of #UMIs per cell\n min {}, max {}, mean {} +- {}'.format(
    min(per_cell_sum), max(per_cell_sum), np.mean(per_cell_sum), np.std(per_cell_sum)));
per_gene_sum = mat.sum(axis=0)
ax[1].hist(np.log10(per_gene_sum + 1));
ax[1].set_title('Distribution of #UMIs per gene\n min {}, max {}, mean {} +- {}'.format(
    min(per_gene_sum), max(per_gene_sum), np.mean(per_gene_sum), np.std(per_gene_sum)));
low_expr_thr = 100
high_expr_thr = 100000
mat = mat[:,(per_gene_sum>=low_expr_thr) & (per_gene_sum<=high_expr_thr)] #just remove extreme outliers
mean_exp = mat.mean(axis=0)
std_exp = mat.std(axis=0)
CV = std_exp/mean_exp  # coefficient of variation per gene
plt.hist(CV);
plt.title('Distribution of CV, mean {} sd {}'.format(np.mean(CV), np.std(CV)));
mat = mat[:,CV>=10]  # keep only highly variable genes (CV >= 10)
f, ax = plt.subplots(1, 2, figsize=(15, 5))
per_cell_sum = mat.sum(axis=1)
ax[0].hist(np.log10(per_cell_sum + 1));
ax[0].set_title('Distribution of #UMIs per cell\n min {}, max {}, mean {} +- {}'.format(
    min(per_cell_sum), max(per_cell_sum), np.mean(per_cell_sum), np.std(per_cell_sum)));
per_gene_sum = mat.sum(axis=0)
ax[1].hist(np.log10(per_gene_sum + 1));
ax[1].set_title('Distribution of #UMIs per gene\n min {}, max {}, mean {} +- {}'.format(
    min(per_gene_sum), max(per_gene_sum), np.mean(per_gene_sum), np.std(per_gene_sum)));
cells_expression = mat.sum(axis=1)
mat = mat[cells_expression>=100,:]  # drop cells with fewer than 100 UMIs after gene filtering
mat = np.log(mat+1)  # log-transform the counts
pca = PCA(n_components=100)
pca.fit(mat)
mat_reduce = pca.transform(mat)
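# Quick check (an addition, not in the gist) of how much variance 100 principal components capture;
# explained_variance_ratio_ is a standard attribute of a fitted scikit-learn PCA.
plt.figure(figsize=(7, 4))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');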
embedding = umap.UMAP(n_neighbors=5,
                      min_dist=0.5,
                      metric='euclidean').fit_transform(mat_reduce)
plt.figure(figsize=(15,15))
plt.scatter(embedding[:,0],embedding[:,1],s=0.2);
plt.title('Naive clustering');
#basically like at https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html, but applied to our real data
#prepare parameters
params = {'quantile': .3,
          'eps': .3,
          'damping': .9,
          'preference': -200,
          'n_neighbors': 10,
          'n_clusters': 5}
bandwidth = estimate_bandwidth(embedding, quantile=params['quantile'])
connectivity = kneighbors_graph(
    embedding, n_neighbors=params['n_neighbors'], include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)  # symmetrize the connectivity graph
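# Sketch (an addition, not in the original gist) of actually fitting the imported clustering
# algorithms on the UMAP embedding, following the scikit-learn cluster-comparison example
# referenced above; parameter values reuse `params`, everything else is an assumption.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ward = AgglomerativeClustering(n_clusters=params['n_clusters'],
                               linkage='ward', connectivity=connectivity)
spectral = SpectralClustering(n_clusters=params['n_clusters'],
                              eigen_solver='arpack', affinity='nearest_neighbors')
dbscan = DBSCAN(eps=params['eps'])
affinity_propagation = AffinityPropagation(damping=params['damping'],
                                           preference=params['preference'])
birch = Birch(n_clusters=params['n_clusters'])
clustering_algorithms = (('MeanShift', ms), ('SpectralClustering', spectral),
                         ('Ward', ward), ('DBSCAN', dbscan),
                         ('AffinityPropagation', affinity_propagation), ('Birch', birch))
f, ax = plt.subplots(2, 3, figsize=(18, 10))
for i, (name, algorithm) in enumerate(clustering_algorithms):
    labels = algorithm.fit_predict(embedding)
    ax[i // 3, i % 3].scatter(embedding[:, 0], embedding[:, 1], c=labels, s=0.2, cmap='tab20')
    ax[i // 3, i % 3].set_title(name)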
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
from torch.optim import Optimizer
from torch.utils import data
class DataGenerator(data.Dataset):
    # the class body is truncated in the gist preview; below is a minimal sketch of the
    # required Dataset interface (the field name `samples` is an assumption, not original code)
    def __init__(self, samples):
        self.samples = samples
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, index):
        return self.samples[index]
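# Usage sketch (an assumption, not from the gist): wrap the Dataset in a DataLoader for batching.
dataset = DataGenerator(list(range(10)))
loader = data.DataLoader(dataset, batch_size=4, shuffle=True)
for batch in loader:
    print(batch)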