bradenmacdonald/compare.py Secret

## compare.py

#!/usr/bin/env python
# coding: utf8
"""
Quantified comparisons between two or more data cubes, using dendrograms.
Copyright 2012 Braden MacDonald.
Re-use for any purpose but leave my name in this source file. No warranty.
"""
from __future__ import division
import numpy as np
import sys
import os
import math
import subprocess
from astrodendro import Dendrogram
from astrodendro.components import Branch, Leaf
from astrocube import DataCube
from scipy.stats.stats import nanmean

############# Set up parameters ############

sys.setrecursionlimit(20000)

min_flux = 0.5
min_npix = 40
min_delta = 0.20
STAT_RESOLUTION = 0.1 # Flux increment per step out of total flux range of 0 to 10
NORM_FMAX = 10.0 # Normalize maximum flux to this value

def get_cube(name, add_noise=0):
    " Load and normalize a data cube. Use add_noise=0.6 to make it noisy."
    cube_path = name
    # Normalize all cubes t have peak flux value of 10
    use_noise = False
    data = DataCube(cube_path, calc_noise_dev=False).data
    data = data * (NORM_FMAX/np.nanmax(data)) # Normalize
    if add_noise > 0:
        data = data + np.random.normal(scale=add_noise, size=data.shape)
        data = data * (NORM_FMAX/np.nanmax(data)) # Re-normalize, since noise will have changed the max
    return data

def get_dendrogram(cube_name, add_noise=0):
    cube_data = get_cube(cube_name, add_noise)
    d = Dendrogram(cube_data, minimum_flux=min_flux, minimum_npix=min_npix, minimum_delta=min_delta, verbose=True)
    return d

def get_stats(d):
    " Generate various statistics of a dendrogram "
    root = d.main_branch
    if type(root) is not Branch:
        return None # That dendrogram is not complex enough; all useful structure has been filtered out.

    def nans(length, dtype=float):
        "Create an array of nans with length length"
        a = np.empty((length,), dtype)
        a.fill(np.nan)
        return a
    res = NORM_FMAX//STAT_RESOLUTION+1 # number of entries we'll need in our statistics arrays
    stats = {'NVF': nans(res), 'EVF': nans(res), 'HVF': nans(res)} # n(items) vs. flux, eccentricity vs. flux, mean Height vs. flux
    all_items = root.descendants
    nitems = len(all_items)
    max_eccentricity = max([item.eccentricity for item in all_items])

    f = STAT_RESOLUTION # Start just above zero
    idx = 0 # Integer flux coordinate, which we will use to index the stats arrays, since 'f' is a float and thus cannot be used as an index
    while f <= NORM_FMAX:
        matching_items = [item for item in all_items if item.fmax > f and (item.parent is None or item.parent.merge_level < f)]
        if matching_items:
            # Num items vs. flux:
            stats['NVF'][idx] = len(matching_items) # note: this needs to be normalized!
            # Eccentricity vs. flux:
            eccentricities = np.array([item.eccentricity for item in matching_items], dtype=np.int)
            stats['EVF'][idx] = (np.mean(eccentricities) / max_eccentricity)
            # mean Height vs. flux:
            heights = np.array([ item.height for item in matching_items])
            stats['HVF'][idx] = np.mean(heights)/NORM_FMAX
        f += STAT_RESOLUTION
        idx += 1
    # Normalize NVF:
    stats['NVF'] = stats['NVF'] / np.nanmax(stats['NVF'])
    return stats # stats is a dictionary, where each value is a list of numbers between 0.0 and 1.0, indexed by flux


if __name__ == '__main__':
    if len(sys.argv) < 3:
        exit("Usage: base_fits_file fits_file2 [fits_file3...]")
    cube_names = sys.argv[2:]
    base_cube = sys.argv[1]

    print("Generating dendrogram for {}:".format(base_cube))
    d1 = get_dendrogram(base_cube)
    stats1 = get_stats(d1)
    if stats1 is None:
        sys.exit("That dendrogram is not complex enough; all useful structure has been filtered out.")

    summaries = [] # list of (cube name str, summary str, total difference value)

    for cube_name in cube_names:
        print("Creating dendrogram for {}".format(cube_name))
        d2 = get_dendrogram(cube_name)
        stats2 = get_stats(d2)
        if stats2 is None:
            print("That dendrogram is not complex enough; all useful structure has been filtered out.")
            continue

        # Print some comparisons
        def diff(list1, list2):
            " Returns the mean difference between two statistics, given two lists, one for each cube "
            tuples = zip(list1, list2) # Each tuple in the result is e.g. (e1, e2), i.e. contains the mean eccentricities from each dendrogram at a given flux level
            diffs = [abs(e2-e1) for e1,e2 in tuples if (not np.isnan(e1) and not np.isnan(e2))]
            return np.mean(diffs)*100

        nvf_diff = diff(stats1['NVF'], stats2['NVF'])
        evf_diff = diff(stats1['EVF'], stats2['EVF'])
        hvf_diff = diff(stats1['HVF'], stats2['HVF'])

        total = nvf_diff+evf_diff+hvf_diff

        summary = "NVF {: >5.2f}  EVF {: >5.2f}  HVF {: >5.2f}  Total {: >4.1f}".format(nvf_diff, evf_diff, hvf_diff, total)
        summaries.append( (cube_name, summary, total) )
        print("Comparing {} with {}:".format(base_cube, cube_name))
        print(summary)

    print(" ")
    print("Summary: Comparing with {}".format(base_cube))
    summaries.sort(key=lambda t: t[2])
    for (cube_name, summary, total) in summaries:
        print("{: >16}: {}".format(cube_name[:16],summary))

	#!/usr/bin/env python
	# coding: utf8
	"""
	Quantified comparisons between two or more data cubes, using dendrograms.
	Copyright 2012 Braden MacDonald.
	Re-use for any purpose but leave my name in this source file. No warranty.
	"""
	from __future__ import division
	import numpy as np
	import sys
	import os
	import math
	import subprocess
	from astrodendro import Dendrogram
	from astrodendro.components import Branch, Leaf
	from astrocube import DataCube
	from scipy.stats.stats import nanmean

	############# Set up parameters ############

	sys.setrecursionlimit(20000)

	min_flux = 0.5
	min_npix = 40
	min_delta = 0.20
	STAT_RESOLUTION = 0.1 # Flux increment per step out of total flux range of 0 to 10
	NORM_FMAX = 10.0 # Normalize maximum flux to this value

	def get_cube(name, add_noise=0):
	" Load and normalize a data cube. Use add_noise=0.6 to make it noisy."
	cube_path = name
	# Normalize all cubes t have peak flux value of 10
	use_noise = False
	data = DataCube(cube_path, calc_noise_dev=False).data
	data = data * (NORM_FMAX/np.nanmax(data)) # Normalize
	if add_noise > 0:
	data = data + np.random.normal(scale=add_noise, size=data.shape)
	data = data * (NORM_FMAX/np.nanmax(data)) # Re-normalize, since noise will have changed the max
	return data

	def get_dendrogram(cube_name, add_noise=0):
	cube_data = get_cube(cube_name, add_noise)
	d = Dendrogram(cube_data, minimum_flux=min_flux, minimum_npix=min_npix, minimum_delta=min_delta, verbose=True)
	return d

	def get_stats(d):
	" Generate various statistics of a dendrogram "
	root = d.main_branch
	if type(root) is not Branch:
	return None # That dendrogram is not complex enough; all useful structure has been filtered out.

	def nans(length, dtype=float):
	"Create an array of nans with length length"
	a = np.empty((length,), dtype)
	a.fill(np.nan)
	return a
	res = NORM_FMAX//STAT_RESOLUTION+1 # number of entries we'll need in our statistics arrays
	stats = {'NVF': nans(res), 'EVF': nans(res), 'HVF': nans(res)} # n(items) vs. flux, eccentricity vs. flux, mean Height vs. flux
	all_items = root.descendants
	nitems = len(all_items)
	max_eccentricity = max([item.eccentricity for item in all_items])

	f = STAT_RESOLUTION # Start just above zero
	idx = 0 # Integer flux coordinate, which we will use to index the stats arrays, since 'f' is a float and thus cannot be used as an index
	while f <= NORM_FMAX:
	matching_items = [item for item in all_items if item.fmax > f and (item.parent is None or item.parent.merge_level < f)]
	if matching_items:
	# Num items vs. flux:
	stats['NVF'][idx] = len(matching_items) # note: this needs to be normalized!
	# Eccentricity vs. flux:
	eccentricities = np.array([item.eccentricity for item in matching_items], dtype=np.int)
	stats['EVF'][idx] = (np.mean(eccentricities) / max_eccentricity)
	# mean Height vs. flux:
	heights = np.array([ item.height for item in matching_items])
	stats['HVF'][idx] = np.mean(heights)/NORM_FMAX
	f += STAT_RESOLUTION
	idx += 1
	# Normalize NVF:
	stats['NVF'] = stats['NVF'] / np.nanmax(stats['NVF'])
	return stats # stats is a dictionary, where each value is a list of numbers between 0.0 and 1.0, indexed by flux


	if __name__ == '__main__':
	if len(sys.argv) < 3:
	exit("Usage: base_fits_file fits_file2 [fits_file3...]")
	cube_names = sys.argv[2:]
	base_cube = sys.argv[1]

	print("Generating dendrogram for {}:".format(base_cube))
	d1 = get_dendrogram(base_cube)
	stats1 = get_stats(d1)
	if stats1 is None:
	sys.exit("That dendrogram is not complex enough; all useful structure has been filtered out.")

	summaries = [] # list of (cube name str, summary str, total difference value)

	for cube_name in cube_names:
	print("Creating dendrogram for {}".format(cube_name))
	d2 = get_dendrogram(cube_name)
	stats2 = get_stats(d2)
	if stats2 is None:
	print("That dendrogram is not complex enough; all useful structure has been filtered out.")
	continue

	# Print some comparisons
	def diff(list1, list2):
	" Returns the mean difference between two statistics, given two lists, one for each cube "
	tuples = zip(list1, list2) # Each tuple in the result is e.g. (e1, e2), i.e. contains the mean eccentricities from each dendrogram at a given flux level
	diffs = [abs(e2-e1) for e1,e2 in tuples if (not np.isnan(e1) and not np.isnan(e2))]
	return np.mean(diffs)*100

	nvf_diff = diff(stats1['NVF'], stats2['NVF'])
	evf_diff = diff(stats1['EVF'], stats2['EVF'])
	hvf_diff = diff(stats1['HVF'], stats2['HVF'])

	total = nvf_diff+evf_diff+hvf_diff

	summary = "NVF {: >5.2f} EVF {: >5.2f} HVF {: >5.2f} Total {: >4.1f}".format(nvf_diff, evf_diff, hvf_diff, total)
	summaries.append( (cube_name, summary, total) )
	print("Comparing {} with {}:".format(base_cube, cube_name))
	print(summary)

	print(" ")
	print("Summary: Comparing with {}".format(base_cube))
	summaries.sort(key=lambda t: t[2])
	for (cube_name, summary, total) in summaries:
	print("{: >16}: {}".format(cube_name[:16],summary))