Skip to content

Instantly share code, notes, and snippets.

@thouis
Created June 7, 2011 21:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thouis/1013254 to your computer and use it in GitHub Desktop.
Save thouis/1013254 to your computer and use it in GitHub Desktop.
start rewrite of measurements.py
"""Measurements.py - storage for image and object measurements
CellProfiler is distributed under the GNU General Public License.
See the accompanying file LICENSE for details.
Copyright (c) 2003-2009 Massachusetts Institute of Technology
Copyright (c) 2009-2011 Broad Institute
All rights reserved.
Please see the AUTHORS file for credits.
Website: http://www.cellprofiler.org
"""
__version__ = "$Revision: 11194 $"
import numpy as np
import re
from scipy.io.matlab import loadmat
import sqlite3
AGG_MEAN = "Mean"
AGG_STD_DEV = "StDev"
AGG_MEDIAN = "Median"
AGG_NAMES = [AGG_MEAN, AGG_MEDIAN, AGG_STD_DEV]
"""The per-image measurement category"""
IMAGE = "Image"
"""The per-experiment measurement category"""
EXPERIMENT = "Experiment"
"""The neighbor association measurement category"""
NEIGHBORS = "Neighbors"
"""The per-object "category" (if anyone needs the word, "Object")"""
OBJECT = "Object"
COLTYPE_INTEGER = "integer"
COLTYPE_FLOAT = "float"
'''16bit Binary Large Object. This object can fit 64K of raw data.
Currently used for storing image thumbnails as 200 x 200px (max) 8-bit pngs.
Should NOT be used for storing larger than 256 x 256px 8-bit pngs.'''
COLTYPE_BLOB = "blob"
'''24bit Binary Large Object. This object can fit 16M of raw data.
Not currently used'''
COLTYPE_MEDIUMBLOB = "mediumblob"
'''32bit Binary Large Object. This object can fit 4GB of raw data.
Not currently used'''
COLTYPE_LONGBLOB = "longblob"
'''SQL format for a varchar column
To get a varchar column of width X: COLTYPE_VARCHAR_FORMAT % X
'''
COLTYPE_VARCHAR_FORMAT = "varchar(%d)"
COLTYPE_VARCHAR = "varchar"
'''# of characters reserved for path name in the database'''
PATH_NAME_LENGTH = 256
'''# of characters reserved for file name in the database'''
FILE_NAME_LENGTH = 128
COLTYPE_VARCHAR_FILE_NAME = COLTYPE_VARCHAR_FORMAT % FILE_NAME_LENGTH
COLTYPE_VARCHAR_PATH_NAME = COLTYPE_VARCHAR_FORMAT % PATH_NAME_LENGTH
'''Column attribute: only available after post_group is run (True / False)'''
MCA_AVAILABLE_POST_GROUP = "AvailablePostGroup"
'''The name of the metadata category'''
C_METADATA = "Metadata"
'''The name of the site metadata feature'''
FTR_SITE = "Site"
'''The name of the well metadata feature'''
FTR_WELL = "Well"
'''The name of the row metadata feature'''
FTR_ROW = "Row"
'''The name of the column metadata feature'''
FTR_COLUMN = "Column"
'''The name of the plate metadata feature'''
FTR_PLATE = "Plate"
M_SITE, M_WELL, M_ROW, M_COLUMN, M_PLATE = \
['_'.join((C_METADATA, x))
for x in (FTR_SITE, FTR_WELL, FTR_ROW, FTR_COLUMN, FTR_PLATE)]
GROUP_NUMBER = "Group_Number"
GROUP_INDEX = "Group_Index"
def get_length_from_varchar(x):
'''Retrieve the length of a varchar column from its coltype def'''
m = re.match(r'^varchar\(([0-9]+)\)$', x)
if m is None:
return None
return int(m.groups()[0])
class Measurements(object):
"""Represents measurements made on images and objects
"""
def __init__(self,
can_overwrite=False,
image_set_start=None):
"""Create a new measurements collection
can_overwrite - if True, overwriting measurements during operation
is allowed. We turn this on for debugging.
image_set_start - the index of the first image set in the image set list
or None to start at the beginning
"""
self.__db_connection = None
self.__cursor = None
self.__image_set_number = (1 if image_set_start is None
else image_set_start + 1)
self.__image_set_start = image_set_start
self.__can_overwrite = can_overwrite
self.__is_first_image = True
self.__image_set_index = 0
self.__initialized_explicitly = False
self.__relationships = {}
def db_filename(self):
return ':memory:'
def initialize(self, measurement_columns):
'''Initialize the measurements dictionary with a list of columns
This explicitly initializes the measurements with a list of
columns as would be returned by get_measurement_columns()
measurement_columns - list of 3-tuples: object name, feature, type
'''
self.__dictionary = {}
self.__object_names = {}
# Create a new sqlite DB
self.__db_connection = sqlite3.connect(self.db_filename())
self.__cursor = self.__dbconnection.cursor()
for object_name, feature, coltype in measurement_columns:
self.__add_table(object_name)
self.__add_column(object_name, feature, coltype)
self.__initialized_explicitly = True
def next_image_set(self, explicit_image_set_number=None, erase=False):
if explicit_image_set_number is None:
self.__image_set_number += 1
else:
self.__image_set_number = explicit_image_set_number
self.__image_set_index += 1
self.__is_first_image = False
if erase:
for object_name in self.object_names():
if object_name in (EXPERIMENT):
continue
self.__cursor.execute('DELETE FROM "%s" WHERE ImageNumber=%d' % (self.__image_set_index))
self.__db_connection.commit()
@property
def image_set_count(self):
'''The number of complete image sets measured'''
# XXX - question for Lee: should this return the minimum number
# of non-null values across columns in the the Image table?
try:
self.__cursor.execute('SELECT COUNT(*) FROM %s'%(IMAGE))
except sqlite3.OperationalError, e:
if not 'no such table' in e.message:
raise
return 0
return self.__cursor.fetchall()[0]
def get_is_first_image(self):
'''True if this is the first image in the set'''
return self.__is_first_image
def set_is_first_image(self, value):
if not value:
raise ValueError("Can only reset to be first image")
self.__is_first_image = value
self.__image_set_index = 0
self.__image_set_number = self.image_set_start_number
is_first_image = property(get_is_first_image, set_is_first_image)
@property
def image_set_start_number(self):
'''The first image set (one-based) processed by the pipeline'''
if self.__image_set_start is None:
return 1
return self.__image_set_start + 1
@property
def has_image_set_start(self):
'''True if the image set has an explicit start'''
return not self.__image_set_start is None
def get_image_set_number(self):
"""The image set number ties a bunch of measurements to a particular image set
"""
return self.__image_set_number
def set_image_set_number(self, number):
self.__image_set_index = number -1
self.__image_set_number = number
image_set_number = property(get_image_set_number, set_image_set_number)
@property
def image_set_index(self):
'''Return the index into the measurements for the current measurement'''
return self.__image_set_index
def get_image_number_from_index(self, index):
'''Return the image number, given an index into the measurements
index - the index of a measurement, such as that returned by
get_all_measurements
returns the image number for the indexed image set, as it would be
stored in the database.
'''
return index + self.image_set_start_number
def get_index_from_image_number(self, image_number):
'''Return the index into the measurements for a given image number
image_number - the image set's image number as stored in the database
or CSV file
return the index of that image set's measurements, suitable for
finding values from get_all_measurements
'''
return image_number - self.image_set_start_number
def load(self, measurements_file_name):
'''Load measurements from a matlab file'''
handles = loadmat(measurements_file_name, struct_as_record=True)
self.create_from_handles(handles)
def create_from_handles(self, handles):
'''Load measurements from a handles structure'''
m = handles["handles"][0,0]["Measurements"][0,0]
for object_name in m.dtype.fields.keys():
omeas = m[object_name][0,0]
for feature_name in omeas.dtype.fields.keys():
if object_name == IMAGE:
values = [x.flatten()[0] for x in omeas[feature_name][0]]
elif object_name == EXPERIMENT:
value = omeas[feature_name][0,0].flatten()[0]
self.add_experiment_measurement(feature_name, value)
continue
else:
values = [x.flatten()
for x in omeas[feature_name][0].tolist()]
self.add_all_measurements(object_name,
feature_name,
values)
#
# Set the image set index to the end
#
self.__image_set_index = self.image_set_count - 1
def add_image_measurement(self, feature_name, data):
"""Add a measurement to the "Image" category
"""
self.add_measurement(IMAGE, feature_name, data)
def add_experiment_measurement(self, feature_name, data):
"""Add an experiment measurement to the measurement
Experiment measurements have one value per experiment
"""
if not self.__dictionary.has_key(EXPERIMENT):
self.__dictionary[EXPERIMENT] = {}
self.__dictionary[EXPERIMENT][feature_name] = data
def get_group_number(self):
'''The number of the group currently being processed'''
return self.get_current_image_measurement(GROUP_NUMBER)
def set_group_number(self, group_number):
self.add_image_measurement(GROUP_NUMBER, group_number)
group_number = property(get_group_number, set_group_number)
def get_group_index(self):
'''The within-group index of the current image set'''
return self.get_current_image_measurement(GROUP_INDEX)
def set_group_index(self, group_index):
self.add_image_measurement(GROUP_INDEX, group_index)
group_index = property(get_group_index, set_group_index)
def add_relate_measurement(
self, module_number,
relationship,
object_name1, object_name2,
group_indexes1, object_numbers1,
group_indexes2, object_numbers2):
'''Add object relationships to the measurements
module_number - the module that generated the relationship
relationship - the relationship of the two objects, for instance,
"Parent" means object # 1 is the parent of object # 2
group_number - the group number of the group currently being processed
object_name1, object_name2 - the name of the segmentation for the first and second objects
group_indexes1, group_indexes2 - for each object, the group index of
that object's image set
object_numbers1, object_numbers2 - for each object, the object number
in the object's object set
This method lets the caller store any sort of arbitrary relationship
between objects as long as they are in the same group. To record
all neighbors within a particular segmentation, call with the same
object name for object_name1 and object_name2 and the same group
index - that of the current image. Relating would have different object
names and TrackObjects would have different group indices.
'''
key = (module_number, relationship, self.group_number,
object_name1, object_name2)
if not self.__relationships.has_key(key):
v = self.__relationships[key] = []
else:
v = self.__relationships[key]
value = np.zeros(len(group_indexes1),
[("group_index1", int, 1),
("object_number1", int, 1),
("group_index2", int, 1),
("object_number2", int , 1)])
value = value.view(np.recarray)
value.group_index1 = group_indexes1
value.group_index2 = group_indexes2
value.object_number1 = object_numbers1
value.object_number2 = object_numbers2
v.append(value)
def get_relationship_groups(self):
'''Return the keys of each of the relationship groupings.
The value returned is a list composed of objects with the following
attributes:
module_number - the module number of the module used to generate the relationship
object_name1 - the object name of the first object in the relationship
object_name2 - the object name of the second object in the relationship
group_number - the group number of the group in which the objects were analyzed
'''
return [RelationshipKey(*key) for key in self.__relationships.keys()]
def get_relationships(self, module_number, relationship, object_name1, object_name2, group_number):
key = (module_number, relationship, group_number, object_name1, object_name2)
if not self.__relationships.has_key(key):
return np.zeros(0, [("group_index1", int, 1),
("object_index1", int, 1),
("group_index2", int, 1),
("object_index2", int , 1)]).view(np.recarray)
return np.hstack(self.__relationships[key]).view(np.recarray)
def add_measurement(self, object_name, feature_name, data, can_overwrite=False):
"""Add a measurement or, for objects, an array of measurements to the set
This is the classic interface - like CPaddmeasurements:
ObjectName - either the name of the labeled objects or "Image"
FeatureName - the feature name, encoded with underbars for category/measurement/image/scale
Data - the data item to be stored
"""
can_overwrite = can_overwrite or self.__can_overwrite
if self.is_first_image and not self.__initialized_explicitly:
if not self.__dictionary.has_key(object_name):
self.__dictionary[object_name] = {}
object_dict = self.__dictionary[object_name]
if not object_dict.has_key(feature_name):
object_dict[feature_name] = [data]
elif can_overwrite:
object_dict[feature_name] = [data]
elif (object_name == IMAGE and
feature_name.startswith(C_METADATA)):
if object_dict[feature_name][0] != data:
raise ValueError(
"Metadata feature %s has conficting values: %s and %s" %
(feature_name, object_dict[feature_name][0], data))
else:
assert False,"Adding a feature for a second time: %s.%s"%(object_name,feature_name)
else:
if can_overwrite:
if not self.__dictionary.has_key(object_name):
self.__dictionary[object_name] = {}
object_dict = self.__dictionary[object_name]
if not object_dict.has_key(feature_name):
object_dict[feature_name] = [ None] * (self.image_set_index+1)
assert self.__dictionary.has_key(object_name),\
("Object %s requested for the first time on pass # %d" %
(object_name,self.image_set_index))
assert self.__dictionary[object_name].has_key(feature_name),\
("Feature %s.%s added for the first time on pass # %d" %
(object_name,feature_name,self.image_set_index))
if (object_name == IMAGE and feature_name.startswith(C_METADATA)
and self.has_current_measurements(object_name, feature_name)):
assert self.get_current_image_measurement(feature_name) == data
else:
assert (can_overwrite or not
self.has_current_measurements(object_name, feature_name)),\
("Feature %s.%s has already been set for this image cycle" %
(object_name,feature_name))
#
# These are for convenience - wrap measurement in an numpy array to make it a cell
#
if isinstance(data,unicode):
data = str(data)
if isinstance(data,str):
a = np.ndarray((1,1),dtype='S%d'%(max(len(data),1)))
a[0,0]=data
self.__dictionary[object_name][feature_name][self.image_set_index] = data
def get_object_names(self):
"""The list of object names (including Image) that have measurements
"""
return self.__dictionary.keys()
object_names = property(get_object_names)
def get_feature_names(self,object_name):
"""The list of feature names (measurements) for an object
"""
if self.__dictionary.has_key(object_name):
return self.__dictionary[object_name].keys()
return []
def has_feature(self, object_name, feature_name):
"""Return true if a particular object has a particular feature"""
if not self.__dictionary.has_key(object_name):
return False
return self.__dictionary[object_name].has_key(feature_name)
def get_current_image_measurement(self, feature_name):
'''Return the value for the named image measurement
feature_name - the name of the measurement feature to be returned
'''
return self.get_current_measurement(IMAGE, feature_name)
def get_current_measurement(self,object_name,feature_name):
"""Return the value for the named measurement for the current image set
object_name - the name of the objects being measured or "Image"
feature_name - the name of the measurement feature to be returned
"""
return self.get_all_measurements(object_name,feature_name)[self.image_set_index]
def get_measurement(self,object_name,feature_name,image_set_index):
"""Return the value for the named measurement and indicated image set"""
return self.get_all_measurements(object_name,feature_name)[image_set_index]
def has_current_measurements(self,object_name,feature_name):
"""Return true if the value for the named measurement for the current image set has been set
object_name - the name of the objects being measured or "Image"
feature_name - the name of the measurement feature to be returned
"""
if not self.__dictionary.has_key(object_name):
return False
if not self.__dictionary[object_name].has_key(feature_name):
return False
return self.__dictionary[object_name][feature_name][self.image_set_index] != None
def get_all_measurements(self,object_name,feature_name):
assert self.__dictionary.has_key(object_name),"No measurements for %s"%(object_name)
assert self.__dictionary[object_name].has_key(feature_name),"No measurements for %s.%s"%(object_name,feature_name)
return self.__dictionary[object_name][feature_name]
def add_all_measurements(self, object_name, feature_name, values):
'''Add a list of measurements for all image sets
object_name - name of object or Images
feature_name - feature to add
values - list of either values or arrays of values
'''
if not self.__dictionary.has_key(object_name):
self.__dictionary[object_name] = {}
self.__dictionary[object_name][feature_name] = values
def get_experiment_measurement(self, feature_name):
"""Retrieve an experiment-wide measurement
"""
return self.get_all_measurements(EXPERIMENT, feature_name)
def apply_metadata(self, pattern, image_set_index=None):
"""Apply metadata from the current measurements to a pattern
pattern - a regexp-like pattern that specifies how to insert
metadata into a string. Each token has the form:
"\(?<METADATA_TAG>\)" (matlab-style) or
"\g<METADATA_TAG>" (Python-style)
image_name - name of image associated with the metadata (or None
if metadata is not associated with an image)
image_set_index - # of image set to use to retrieve data.
None for current.
returns a string with the metadata tags replaced by the metadata
"""
if image_set_index == None:
image_set_index = self.image_set_index
result_pieces = []
double_backquote = "\\\\"
single_backquote = "\\"
for piece in pattern.split(double_backquote):
# Replace tags in piece
result = ''
while(True):
# Replace one tag
m = re.search('\\(\\?[<](.+?)[>]\\)', piece)
if not m:
m = re.search('\\\\g[<](.+?)[>]', piece)
if not m:
result += piece
break
result += piece[:m.start()]
measurement = 'Metadata_'+m.groups()[0]
result += str(self.get_measurement("Image", measurement,
image_set_index))
piece = piece[m.end():]
result_pieces.append(result)
return single_backquote.join(result_pieces)
def group_by_metadata(self, tags):
"""Return groupings of image sets with matching metadata tags
tags - a sequence of tags to match.
Returns a sequence of MetadataGroup objects. Each one represents
a set of values for the metadata tags along with the indexes of
the image sets that match the values
"""
if len(tags) == 0:
# if there are no tags, all image sets match each other
return [MetadataGroup({}, range(self.image_set_index+1))]
#
# The flat_dictionary has a row of tag values as a key
#
flat_dictionary = {}
values = [self.get_all_measurements('Image', "Metadata_%s"%tag)
for tag in tags]
for index in range(self.image_set_index+1):
row = tuple([value[index] for value in values])
if flat_dictionary.has_key(row):
flat_dictionary[row].append(index)
else:
flat_dictionary[row] = [index]
result = []
for row in flat_dictionary.keys():
tag_dictionary = {}
tag_dictionary.update(zip(tags,row))
result.append(MetadataGroup(tag_dictionary, flat_dictionary[row]))
return result
def agg_ignore_object(self,object_name):
"""Ignore objects (other than 'Image') if this returns true"""
if object_name in (EXPERIMENT,NEIGHBORS):
return True
def agg_ignore_feature(self, object_name, feature_name):
"""Return true if we should ignore a feature during aggregation"""
if self.agg_ignore_object(object_name):
return True
if self.has_feature(object_name, "SubObjectFlag"):
return True
return agg_ignore_feature(feature_name)
def compute_aggregate_measurements(self, image_set_number,
aggs=AGG_NAMES):
"""Compute aggregate measurements for a given image set
returns a dictionary whose key is the aggregate measurement name and
whose value is the aggregate measurement value
"""
d = {}
for object_name in self.get_object_names():
if object_name == 'Image':
continue
for feature in self.get_feature_names(object_name):
if self.agg_ignore_feature(object_name, feature):
continue
feature_name = "%s_%s"%(object_name, feature)
values = self.get_measurement(object_name, feature,
image_set_number)
if values is not None:
values = values[np.isfinite(values)]
#
# Compute the mean and standard deviation
#
if AGG_MEAN in aggs:
mean_feature_name = get_agg_measurement_name(
AGG_MEAN, object_name,feature)
mean = values.mean() if values is not None else np.NaN
d[mean_feature_name] = mean
if AGG_MEDIAN in aggs:
median_feature_name = get_agg_measurement_name(
AGG_MEDIAN, object_name, feature)
median = np.median(values) if values is not None else np.NaN
d[median_feature_name] = median
if AGG_STD_DEV in aggs:
stdev_feature_name = get_agg_measurement_name(
AGG_STD_DEV, object_name, feature)
stdev = values.std() if values is not None else np.NaN
d[stdev_feature_name] = stdev
return d
class MetadataGroup(dict):
"""A set of metadata tag values and the image set indexes that match
The MetadataGroup object represents a group of image sets that
have the same values for a given set of tags. For instance, if an
experiment has metadata tags of "Plate", "Well" and "Site" and
we form a metadata group of "Plate" and "Well", then each metadata
group will have image set indexes of the images taken of a particular
well
"""
def __init__(self, tag_dictionary, indexes):
super(MetadataGroup,self).__init__(tag_dictionary)
self.__indexes = indexes
@property
def indexes(self):
return self.__indexes
def __setitem__(self, tag, value):
raise NotImplementedError("The dictionary is read-only")
def find_metadata_tokens(pattern):
"""Return a list of strings which are the metadata token names in a pattern
pattern - a regexp-like pattern that specifies how to find
metadata in a string. Each token has the form:
"(?<METADATA_TAG>...match-exp...)" (matlab-style) or
"\g<METADATA_TAG>" (Python-style replace)
"(?P<METADATA_TAG>...match-exp..)" (Python-style search)
"""
result = []
while True:
m = re.search('\\(\\?[<](.+?)[>]', pattern)
if not m:
m = re.search('\\\\g[<](.+?)[>]', pattern)
if not m:
m = re.search('\\(\\?P[<](.+?)[>]', pattern)
if not m:
break
result.append(m.groups()[0])
pattern = pattern[m.end():]
return result
def extract_metadata(pattern, text):
"""Return a dictionary of metadata extracted from the text
pattern - a regexp that specifies how to find
metadata in a string. Each token has the form:
"\(?<METADATA_TAG>...match-exp...\)" (matlab-style) or
"\(?P<METADATA_TAG>...match-exp...\)" (Python-style)
text - text to be searched
We do a little fixup in here to change Matlab searches to Python ones
before executing.
"""
# Convert Matlab to Python
orig_pattern = pattern
pattern = re.sub('(\\(\\?)([<].+?[>])','\\1P\\2',pattern)
match = re.search(pattern, text)
if match:
return match.groupdict()
else:
raise ValueError("Metadata extraction failed: regexp '%s' does not match '%s'"%(orig_pattern, text))
def is_well_row_token(x):
'''True if the string represents a well row metadata tag'''
return x.lower() in ("wellrow","well_row", "row")
def is_well_column_token(x):
'''true if the string represents a well column metadata tag'''
return x.lower() in ("wellcol","well_col","wellcolumn","well_column",
"column","col")
def load_measurements(measurements_file_name):
'''Load measurements from a .mat file'''
m = Measurements()
m.load(measurements_file_name)
return m
def get_agg_measurement_name(agg, object_name, feature):
'''Return the name of an aggregate measurement
agg - one of the names in AGG_NAMES, like AGG_MEAN
object_name - the name of the object that we're aggregating
feature - the name of the object's measurement
'''
return "%s_%s_%s" % (agg, object_name, feature)
def agg_ignore_feature(feature_name):
'''Return True if the feature is one to be ignored when aggregating'''
if feature_name.startswith('Description_'):
return True
if feature_name.startswith('ModuleError_'):
return True
if feature_name.startswith('TimeElapsed_'):
return True
if feature_name == "Number_Object_Number":
return True
return False
class RelationshipKey:
def __init__(self, module_number, relationship, group_number,
object_name1, object_name2):
self.module_number = module_number
self.relationship = relationship
self.object_name1 = object_name1
self.object_name2 = object_name2
self.group_number = group_number
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment