Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/7425a73e808dc90449ed to your computer and use it in GitHub Desktop.
Save afrendeiro/7425a73e808dc90449ed to your computer and use it in GitHub Desktop.
Playing with CellProfiler
#!/usr/env/python
import os
import h5py
import pandas as pd
projectDir = '/home/afr/workspace/cellprofiler/'
# open hdf5
hdf5 = h5py.File(projectDir + '1315001__2014-01-25T18_26_59-Measurement1.h5', 'r')
# get experiment name
experiment = str(hdf5['/Measurements'].items()[0][0])
objects = ['FilteredNuclei', 'Cells', 'Cytoplasm', 'Mitochondria']
measurementsToSkip = [
"ObjectNumber",
"Neighbors_FirstClosestObjectNumber_Adjacent",
"Neighbors_SecondClosestObjectNumber_Adjacent"
]
for obj in objects:
# initialize dataframe
df = pd.DataFrame()
measurements = hdf5['/Measurements/' + experiment + '/' + obj].items()
# prepare output directory
outputDir = projectDir + experiment + '-measurements/'
if not os.path.exists(outputDir):
os.makedirs(outputDir)
# iterate through measurements and append new data to dataframe as columns
for meas in measurements:
meas = meas[0]
if meas not in measurementsToSkip:
data = hdf5['/Measurements/' + experiment + '/' + obj][meas]['data'].value
tmpDF = pd.DataFrame({meas : data})
df[meas] = hdf5['/Measurements/' + experiment + '/' + obj][meas]['data'].value
# output as csv
df.to_csv(outputDir + obj + '.csv')
# Get relashionships
obj = 'Relationship'
relationships = hdf5['/Measurements/' + experiment + '/' + obj]
df = pd.DataFrame()
df['Cells-Cells-Neighbours_ObjectNumber_First'] = relationships['12']['Neighbors']['Cells']['Cells']['ObjectNumber_First'].value
df['Cells-Cells-Neighbours_ObjectNumber_Second'] = relationships['12']['Neighbors']['Cells']['Cells']['ObjectNumber_Second'].value
df.to_csv(outputDir + 'Cells-Cells-Neighbours' + '.csv')
df = pd.DataFrame()
df['Cells-Mitochondria-Parent_ObjectNumber_First'] = relationships['13']['Parent']['Cells']['Mitochondria']['ObjectNumber_First'].value
df['Cells-Mitochondria-Parent_ObjectNumber_Second'] = relationships['13']['Parent']['Cells']['Mitochondria']['ObjectNumber_Second'].value
df.to_csv(outputDir + 'Cells-Mitochondria-Parent' + '.csv')
df = pd.DataFrame()
df['Nuclei-Cells-Parent_ObjectNumber_First'] = relationships['7']['Parent']['Nuclei']['Cells']['ObjectNumber_First'].value
df['Nuclei-Cells-Parent_ObjectNumber_Second'] = relationships['7']['Parent']['Nuclei']['Cells']['ObjectNumber_Second'].value
df.to_csv(outputDir + 'Nuclei-Cells-Parent' + '.csv')
df = pd.DataFrame()
df['Cells-Cytoplasm-Parent_ObjectNumber_First'] = relationships['8']['Parent']['Cells']['Cytoplasm']['ObjectNumber_First'].value
df['Cells-Cytoplasm-Parent_ObjectNumber_Second'] = relationships['8']['Parent']['Cells']['Cytoplasm']['ObjectNumber_Second'].value
df.to_csv(outputDir + 'Cells-Cytoplasm-Parent' + '.csv')
#!/usr/env/python
import os
import h5py
import pandas as pd
import string
import math
import scipy.stats as stats
import pylab
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
import mlpy
projectDir = '/home/afr/workspace/cellprofiler/2014-10-21-16-10-40-measurements/'
# change font of plots to look like LATEX
#matplotlib.rc('font', family='serif')
#matplotlib.rc('text', usetex='false')
#matplotlib.rcParams.update({'font.size': 14})
def imageNumber2wellNumber(image):
"""
Calculates the index of the well from which an image belongs.
"""
return int(np.around((int(image) + 1)/4.)) if int(image) != 1 else 1
def alphanumeric2numeric(well):
"""
Converts a well from alphanumeric (e.g. 'B06') to numeric (e. g. 30).
"""
return ((string.ascii_uppercase.index(well[0])) * 24) + int(well[1:])
def wellNumber2Position(well):
"""
Calculates well position in plate (e. g. x, y = (2, 6) from its numeric index.
"""
row = 1 + math.trunc((well - 1) / 24)
col = ((well - 1) % 24) + 1
return (row, col)
def normalizeWells(wells, measurement):
"""
Requires dataframe column with Measurements, indexed with well number.
Returns same dataframe normalized (subtraction) by row and column.
"""
normalized = []
for well in range(1, 385):
# get median of all wells in row
row = wells.ix[well]['rowIndex']
col = wells.ix[well]['colIndex']
rowMeasurement = list(wells[(wells['rowIndex'] == row) & (wells['drugType'] != 'PosCon') & (wells['drugType'] != np.NaN)][measurement])
colMeasurement = list(wells[(wells['colIndex'] == row) & (wells['drugType'] != 'PosCon') & (wells['drugType'] != np.NaN)][measurement])
# normalize by row and col
#normalized.append(wells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement))
# normalize by row and col and globally
#normalized.append((wells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement)) / np.median(wells[measurement]))
# normalize by row and col and globally and logaritmize
#normalized.append(log2((wells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement)) / np.median(wells[measurement])))
# normalize by row and col and globally and logaritmize
normalized.append(log2(wells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement) + 1))
wells[measurement + 'Normalized'] = normalized
return wells
def normalizeCells(cells, measurement):
"""
Requires dataframe column with Measurements, indexed with well number.
Returns same dataframe normalized (subtraction) by row and column.
"""
normalized = []
for cell in range(len(cells)):
well = cells['wellNumber'][cell]
row, col = wellNumber2Position(well)
# get median of all cells in row
rowMeasurement = list(cells[(cells['rowIndex'] == row) & (cells['drugType'] != 'PosCon') & (cells['drugType'] != np.NaN)][measurement])
colMeasurement = list(cells[(cells['colIndex'] == col) & (cells['drugType'] != 'PosCon') & (cells['drugType'] != np.NaN)][measurement])
# normalize by row and col
#normalized.append(cells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement))
# normalize by row and col and globally
#normalized.append((cells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement)) / np.median(cells[measurement]))
# normalize by row and col and globally and logaritmize
#normalized.append(log2((cells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement)) / np.median(cells[measurement])))
# normalize by row and col and globally and logaritmize
normalized.append(cells.ix[well][measurement] - np.median(rowMeasurement + colMeasurement))
cells[measurement + 'Normalized'] = normalized
return cells
def zFactor(meanPos, meanNeg, stdPos, stdNeg):
"""
Calculates the Z-factor of measure for a plate based on controls (pos and neg).
"""
return 1 - (3 * (stdPos + stdNeg) / abs(meanPos - meanNeg))
### read in csvs
nuclei = pd.io.parsers.read_csv(projectDir + 'FilteredNuclei.csv')
cells = pd.io.parsers.read_csv(projectDir + 'Cells.csv')
Cytoplasm = pd.io.parsers.read_csv(projectDir + 'Cytoplasm.csv')
mitochondria = pd.io.parsers.read_csv(projectDir + 'Mitochondria.csv')
### read in plate metadata
metadata = pd.io.parsers.read_csv(projectDir + '20140206_transferlist_001_064_annotated_PLATE_1315001.csv')
# remember which wells are used
usedWells = []
for well in metadata['DestWell']:
usedWells.append(alphanumeric2numeric(well))
metadata.index = usedWells
# fill in missing rows with NaNs
for row in range(1, 385):
if row not in metadata.index:
metadata.ix[row] = np.NaN
metadata.sort()
# convert well number to numeric
rowIndex = []
colIndex = []
for well in range(1, 385):
x, y = wellNumber2Position(well)
rowIndex.append(x)
colIndex.append(y)
metadata['rowIndex'] = rowIndex
metadata['colIndex'] = colIndex
### Annotate cells with well number, coordinates, drug type
wellN = []; drugType = []; rowIndex = []; colIndex = []
for i in range(len(cells['ImageNumber'])):
well = imageNumber2wellNumber(cells.ix[i]['ImageNumber'])
row, col = wellNumber2Position(well)
wellN.append(well)
rowIndex.append(row)
colIndex.append(col)
drugType.append(cellMeans.ix[well]['drugType'])
cells['wellNumber'] = wellN; cells['drugType'] = drugType; cells['rowIndex'] = rowIndex; cells['colIndex'] = colIndex
##################### Analyze with averaged measurements per well
measurements = ['cellNumber', 'cellAreaMean', 'cellAreaMeanStd', 'cellPerimeterMean', 'cellPerimeterStd', 'cellFormFactorMean', 'cellFormFactorStd', 'cellEccentricityMean', 'cellEccentricityStd']
### get drug type metadata
cellMeans = metadata[['rowIndex', 'colIndex','Type_A']]
cellMeans = cellMeans.rename(columns = {'Type_A' : 'drugType'})
### Get average and std for each measurement
for well in range(1, 385):
cellsWell = cells[cells['wellNumber'] == well]
if measurement == 'cellNumber':
cellMeans[measurement] = len(cellsWell)
else:
cellMeans[measurement + 'Mean'] = np.mean(cellsWell)
cellMeans[measurement + 'Std'] = np.std(cellsWell)
### plot measurements
measurements = ['cellNumber', 'cellAreaMean', 'cellAreaMeanStd', 'cellPerimeterMean', 'cellPerimeterStd', 'cellFormFactorMean', 'cellFormFactorStd', 'cellEccentricityMean', 'cellEccentricityStd']
for measurement in measurements:
# normalize measurements
cellMeans = normalizeWells(cellMeans, measurement)
# plot plate heatmap
# create np array
for row in range(1, 17):
if row == 1:
measurementArray = np.array([cellMeans[cellMeans['rowIndex'] == row][measurement]])
measurementArrayNormalized = np.array([cellMeans[cellMeans['rowIndex'] == row][measurement + 'Normalized']])
else:
measurementArray = np.append(measurementArray, [cellMeans[cellMeans['rowIndex'] == row][measurement]], axis = 0)
measurementArrayNormalized = np.append(measurementArrayNormalized, [cellMeans[cellMeans['rowIndex'] == row][measurement + 'Normalized']], axis = 0)
### calculate z-factor
meanPos = np.mean(cellMeans[cellMeans['drugType'] == 'PosCon'][measurement])
meanNeg = np.mean(cellMeans[cellMeans['drugType'] == 'DMSO'][measurement])
stdPos = np.std(cellMeans[cellMeans['drugType'] == 'PosCon'][measurement])
stdNeg = np.std(cellMeans[cellMeans['drugType'] == 'DMSO'][measurement])
z = zFactor(meanPos, meanNeg, stdPos, stdNeg)
### Plot
# unnormalized
plt.subplot(2, 1, 1)
plt.title(measurement + ' per well')
plt.xlabel('Z-factor: %s' % z)
plt.pcolor(measurementArray, vmin=cellMeans[measurement].min(), vmax=cellMeans[measurement].max())
plt.colorbar()
plt.xticks(np.arange(0, 24) + 0.5, range(1, 25))
plt.yticks(np.arange(0, 16) + 0.5, range(1, 17))
plt.xlim(0, 24); plt.ylim(0, 16)
# normalized
meanPos = np.mean(cellMeans[cellMeans['drugType'] == 'PosCon'][measurement + 'Normalized'])
meanNeg = np.mean(cellMeans[cellMeans['drugType'] == 'DMSO'][measurement + 'Normalized'])
stdPos = np.std(cellMeans[cellMeans['drugType'] == 'PosCon'][measurement + 'Normalized'])
stdNeg = np.std(cellMeans[cellMeans['drugType'] == 'DMSO'][measurement + 'Normalized'])
z = zFactor(meanPos, meanNeg, stdPos, stdNeg)
plt.subplot(2, 1, 2)
plt.title(measurement + ' per well')
plt.xlabel('Z-factor: %s' % z)
plt.pcolor(measurementArrayNormalized, vmin=cellMeans[measurement + 'Normalized'].min(), vmax=cellMeans[measurement + 'Normalized'].max())
plt.colorbar()
plt.xticks(np.arange(0, 24) + 0.5, range(1, 25))
plt.yticks(np.arange(0, 16) + 0.5, range(1, 17))
plt.xlim(0, 24); plt.ylim(0, 16)
plt.subplots_adjust(hspace=.5)
plt.savefig(projectDir + "plots/" + measurement + "Plate.png", dpi = 600, bbox_inches = 'tight')
plt.close()
#### plot cell number per well
plt.subplot(2, 1, 1)
plt.scatter(
y = cellMeans[cellMeans['drugType'] == 'Cpd'][measurement],
x = cellMeans[cellMeans['drugType'] == 'Cpd'].index,
c = 'red',
label = 'compounds'
)
plt.scatter(
y = cellMeans[cellMeans['drugType'] == 'DMSO'][measurement],
x = cellMeans[cellMeans['drugType'] == 'DMSO'].index,
c = 'blue',
label = 'DMSO'
)
plt.scatter(
y = cellMeans[cellMeans['drugType'] == 'PosCon'][measurement],
x = cellMeans[cellMeans['drugType'] == 'PosCon'].index,
c = 'green',
label = 'positive'
)
plt.title(measurement + ' per well'); plt.ylabel(measurement); plt.xlabel('Well n.')
plt.axis([cellMeans.index[0], cellMeans.index[-1], cellMeans[measurement].min(), cellMeans[measurement].max()])
plt.subplot(2, 1, 2)
plt.scatter(
y = cellMeans[cellMeans['drugType'] == 'Cpd'][measurement + 'Normalized'],
x = cellMeans[cellMeans['drugType'] == 'Cpd'].index,
c = 'red',
label = 'compounds'
)
plt.scatter(
y = cellMeans[cellMeans['drugType'] == 'DMSO'][measurement + 'Normalized'],
x = cellMeans[cellMeans['drugType'] == 'DMSO'].index,
c = 'blue',
label = 'DMSO'
)
plt.scatter(
y = cellMeans[cellMeans['drugType'] == 'PosCon'][measurement + 'Normalized'],
x = cellMeans[cellMeans['drugType'] == 'PosCon'].index,
c = 'green',
label = 'positive'
)
plt.title('Normalized ' + measurement + ' per well'); plt.ylabel('Normalized ' + measurement); plt.xlabel('Well n.')
plt.axis([cellMeans.index[0], cellMeans.index[-1], cellMeans[measurement + 'Normalized'].min(), cellMeans[measurement + 'Normalized'].max()])
plt.subplots_adjust(hspace=.5)
plt.legend(bbox_to_anchor=(1.05, -0.05), loc='best', borderaxespad=0.)
plt.savefig(projectDir + "plots/" + measurement + "_perWell.png", dpi = 600, bbox_inches='tight')
plt.close()
# plot cell Area vs Perimeter per well
#plt.scatter(
# x = cellMeans[cellMeans['drugType'] == 'Cpd']['cellAreaMean'],
# y = cellMeans[cellMeans['drugType'] == 'Cpd']['cellPerimeterMean'],
# c = 'red',
# label = 'compounds'
#)
#plt.scatter(
# x = cellMeans[cellMeans['drugType'] == 'DMSO']['cellAreaMean'],
# y = cellMeans[cellMeans['drugType'] == 'DMSO']['cellPerimeterMean'],
# c = 'blue',
# label = 'DMSO'
#)
#plt.scatter(
# x = cellMeans[cellMeans['drugType'] == 'PosCon']['cellAreaMean'],
# y = cellMeans[cellMeans['drugType'] == 'PosCon']['cellPerimeterMean'],
# c = 'green',
# label = 'positive'
#)
#plt.legend(); plt.title('Cell area vs Perimeter'); plt.xlabel('Cell area'); plt.ylabel('Cell perimeter')
#plt.xlim(0, 10000); plt.ylim(100,450)
#plt.savefig(projectDir + "/plots/cellAreaPerimeter.png", dpi = 600)
######################### SINGLE-CELL STUFF
### Normalize
cells = normalizeCells(cells, 'AreaShape_Area')
### Plot scatter for all cells in all wells
# colors according to drug type
colors = []
for i in cellMeans.drugType:
if i == 'Cpd':
colors.append('r')
elif i == 'DMSO':
colors.append('b')
elif i == 'PosCon':
colors.append('g')
else:
colors.append('r')
# plot density
plt.subplot(2, 1, 1)
for well in range(1, 385):
# get cells from well
cellsWell = cells[cells['wellNumber'] == well]
density = gaussian_kde(cellsWell[measurement])
xs = np.linspace(np.min(cellsWell[measurement]), np.max(cellsWell[measurement]), len(cellsWell[measurement]))
#density.covariance_factor = lambda : .25
#density._compute_covariance()
plt.plot(xs, density(xs), c = colors[well - 1], label = cellMeans.ix[well]['drugType'])
plt.subplot(2, 1, 2)
for well in range(1, 385):
# get cells from well
cellsWell = cells[cells['wellNumber'] == well]
density = gaussian_kde(cellsWell[measurement + 'Normalized'])
xs = np.linspace(np.min(cellsWell[measurement + 'Normalized']), np.max(cellsWell[measurement + 'Normalized']), len(cellsWell[measurement + 'Normalized']))
#density.covariance_factor = lambda : .25
#density._compute_covariance()
plt.plot(xs, density(xs), c = colors[well - 1], label = cellMeans.ix[well]['drugType'])
plt.title(measurement)
plt.xlabel(measurement)
plt.ylabel('Density')
plt.legend()
plt.savefig(projectDir + "/plots/AreaShape_Area_density.png", dpi = 600)
plt.close()
##### Classify nuclei using a SVM
# annotate nuclei with well n. and type of drug
wellN = []
drugType = []
for i in range(len(nuclei['ImageNumber'])):
well = imageNumber2wellNumber(nuclei.ix[i]['ImageNumber'])
drug = cellMeans.ix[well]['drugType']
wellN.append(well)
drugType.append(drug)
nuclei['wellNumber'] = wellN
nuclei['drugType'] = drugType
# take only nucleous shape measurements
nucleiMeasurements = ['AreaShape_Area', 'AreaShape_Center_X', 'AreaShape_Center_Y', 'AreaShape_Compactness', 'AreaShape_Eccentricity', 'AreaShape_EulerNumber', 'AreaShape_Extent', 'AreaShape_FormFactor', 'AreaShape_MajorAxisLength', 'AreaShape_MaxFeretDiameter', 'AreaShape_MaximumRadius', 'AreaShape_MeanRadius', 'AreaShape_MedianRadius', 'AreaShape_MinFeretDiameter', 'AreaShape_MinorAxisLength', 'AreaShape_Orientation', 'AreaShape_Perimeter', 'AreaShape_Solidity', 'AreaShape_Zernike_0_0', 'AreaShape_Zernike_1_1', 'AreaShape_Zernike_2_0', 'AreaShape_Zernike_2_2', 'AreaShape_Zernike_3_1', 'AreaShape_Zernike_3_3', 'AreaShape_Zernike_4_0', 'AreaShape_Zernike_4_2', 'AreaShape_Zernike_4_4', 'AreaShape_Zernike_5_1', 'AreaShape_Zernike_5_3', 'AreaShape_Zernike_5_5', 'AreaShape_Zernike_6_0', 'AreaShape_Zernike_6_2', 'AreaShape_Zernike_6_4', 'AreaShape_Zernike_6_6', 'AreaShape_Zernike_7_1', 'AreaShape_Zernike_7_3', 'AreaShape_Zernike_7_5', 'AreaShape_Zernike_7_7', 'AreaShape_Zernike_8_0', 'AreaShape_Zernike_8_2', 'AreaShape_Zernike_8_4', 'AreaShape_Zernike_8_6', 'AreaShape_Zernike_8_8', 'AreaShape_Zernike_9_1', 'AreaShape_Zernike_9_3', 'AreaShape_Zernike_9_5', 'AreaShape_Zernike_9_7', 'AreaShape_Zernike_9_9']
y = nuclei['drugType']
x = nuclei[nucleiMeasurements]
# reduce dimentionality with a PCA
pca = mlpy.PCA() # new PCA instance
pca.learn(x) # learn from data
z = pca.transform(x, k=2) # transform into 2D space
# plot pca
colors = []
for i in y:
if i == 'Cpd':
colors.append('r')
if i == 'DMSO':
colors.append('b')
if i == 'PosCon':
colors.append('g')
plt.scatter(x = z[:, 0], y = z[:, 1], marker = '.', c = colors, alpha = 1)
plt.title("PCA on 48 nucleous features")
plt.xlabel("First component")
plt.ylabel("Second component")
plt.savefig(projectDir + "/plots/nucleousPCA.png", dpi = 600)
plt.close()
# new linear SVM instance
linearSVM = mlpy.LibSvm(kernel_type = 'linear')
linearSVM.learn(z, y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment