# drammock/assess-dim-reduction.R

## assess-dim-reduction.R
#!/usr/bin/env Rscript
library(dplyr, warn.conflicts=FALSE)

# load phoible data (only needed for the feature column names);
# the .RData file presumably defines the `phoible` object used below
data_dir <- "~/Documents/academics/research/phoible/dev/data"
load(file.path(data_dir, "phoible.RData"))

# feature column names: every column from `tone` through `click`
feature_columns <- colnames(select(phoible, tone:click))

# one CSV of solutions per inventory lives in the results folder
results_dir <- file.path("results", "dimredux-solutions")
filenames <- list.files(results_dir, pattern="csv$", full.names=TRUE)

# housekeeping: global log of inventory IDs whose file held an error marker
errors <- c()

# Extract the integer inventory ID from a solution file path.
#
# The ID is whatever precedes the first "." in the file's base name, e.g.
# "results/dimredux-solutions/42.csv" gives 42L.
#
# filename: character vector of file paths (one or many).
# Returns an integer vector with one ID per element of `filename`; a prefix
# that is not a whole number comes back as NA (as.integer() warns).
id_from_filename <- function(filename) {
    # drop everything from the first "." onward; sub() is vectorised, so a
    # whole vector of paths yields a whole vector of IDs
    id_text <- sub("[.].*$", "", basename(filename))
    as.integer(id_text)
}


# Log an inventory whose solutions file contains an error marker.
#
# inventory_id: ID to append to the global `errors` vector on failure.
# solutions: table read from a solutions file; only its first cell is
#     inspected.
# Returns TRUE (after appending `inventory_id` to the global `errors`) when
# the first cell starts with "ERROR", otherwise FALSE.
record_errors <- function(inventory_id, solutions) {
    # coerce first: startsWith() rejects factors, numbers and logical NA
    first_cell <- as.character(solutions[1, 1])
    # isTRUE() turns a missing or empty first cell into FALSE instead of
    # letting `if` fail on NA
    is_error <- isTRUE(startsWith(first_cell, "ERROR"))
    if (is_error) {
        # deliberate global side effect: the error log lives at script level
        errors <<- c(errors, inventory_id)
    }
    is_error
}


# Load one solutions file and spread it into boolean feature columns.
#
# filename: path to a headerless CSV whose base name starts with the
#     inventory ID (see id_from_filename()); each row is treated as one
#     solution listing feature names.
# Returns NULL when the first cell starts with "ERROR" (the ID is logged via
# record_errors()); otherwise a data frame with one row per CSV row, one
# logical column per entry of the global `feature_columns`, and an
# `InventoryID` column.
load_solutions <- function(filename) {
    inventory_id <- id_from_filename(filename)
    # keep cells as plain strings (the default only since R 4.0) so the
    # "ERROR" prefix test also works on older R versions
    solutions <- read.csv(filename, header=FALSE, stringsAsFactors=FALSE)
    # record_errors() already reports whether this file is an error marker;
    # reuse its answer instead of repeating the prefix test
    is_error <- record_errors(inventory_id, solutions)
    if (is_error) return(NULL)
    # spread the features into boolean columns: apply() over rows yields a
    # features-by-solutions logical matrix, which is then transposed so that
    # each row is a solution
    apply(as.matrix(solutions), 1,
          function(feats) feature_columns %in% feats) %>%
        as.data.frame(row.names=feature_columns) %>%
        t() %>%
        as.data.frame() ->
        solutions_df
    # record the inventory ID in a column
    solutions_df$InventoryID <- inventory_id
    solutions_df
}


# read every solutions file (error files contribute NULL), stack the
# per-file tables into one, and group the rows by inventory
all_solutions <- group_by(
    do.call(rbind, lapply(filenames, load_solutions)),
    InventoryID
)

# make sure we didn't lose any: every file must have produced either a
# set of solutions or a recorded error
n_valid_solutions <- n_distinct(all_solutions$InventoryID)
# are_equal() on its own only returns TRUE/FALSE; wrapping it in
# assert_that() makes a mismatch actually halt the script
assertthat::assert_that(
    assertthat::are_equal(length(filenames), n_valid_solutions + length(errors))
)

# count 'em up: number of solution rows in each inventory group,
# largest count first
n_solutions_per_inventory <- tally(all_solutions, name="n_solutions", sort=TRUE)

# which features occur in the most solutions?
# (drop the grouping and the ID column, sum each boolean feature column over
# all rows, then order the named totals from most to least frequent)
feature_occurrence_in_solutions_across_all_inventories <- sort(
    unlist(summarise_all(select(ungroup(all_solutions), -InventoryID), sum)),
    decreasing=TRUE
)

# which features occur in the most solutions (per inventory)?
# (all_solutions is still grouped by InventoryID, so the sums are per group)
feature_occurrence_in_solutions_by_inventory <- summarise_all(all_solutions, sum)
	#!/usr/bin/env Rscript
	library(dplyr, warn.conflicts=FALSE)

	# load phoible data (only needed for the feature column names);
	# the .RData file presumably defines the `phoible` object used below
	data_dir <- "~/Documents/academics/research/phoible/dev/data"
	load(file.path(data_dir, "phoible.RData"))

	# feature column names: every column from `tone` through `click`
	feature_columns <- colnames(select(phoible, tone:click))

	# one CSV of solutions per inventory lives in the results folder
	results_dir <- file.path("results", "dimredux-solutions")
	filenames <- list.files(results_dir, pattern="csv$", full.names=TRUE)

	# housekeeping: global log of inventory IDs whose file held an error marker
	errors <- c()

	# Extract the integer inventory ID from a solution file path.
	#
	# The ID is whatever precedes the first "." in the file's base name, e.g.
	# "results/dimredux-solutions/42.csv" gives 42L.
	#
	# filename: character vector of file paths (one or many).
	# Returns an integer vector with one ID per element of `filename`; a prefix
	# that is not a whole number comes back as NA (as.integer() warns).
	id_from_filename <- function(filename) {
	    # drop everything from the first "." onward; sub() is vectorised, so a
	    # whole vector of paths yields a whole vector of IDs
	    id_text <- sub("[.].*$", "", basename(filename))
	    as.integer(id_text)
	}


	# Log an inventory whose solutions file contains an error marker.
	#
	# inventory_id: ID to append to the global `errors` vector on failure.
	# solutions: table read from a solutions file; only its first cell is
	#     inspected.
	# Returns TRUE (after appending `inventory_id` to the global `errors`) when
	# the first cell starts with "ERROR", otherwise FALSE.
	record_errors <- function(inventory_id, solutions) {
	    # coerce first: startsWith() rejects factors, numbers and logical NA
	    first_cell <- as.character(solutions[1, 1])
	    # isTRUE() turns a missing or empty first cell into FALSE instead of
	    # letting `if` fail on NA
	    is_error <- isTRUE(startsWith(first_cell, "ERROR"))
	    if (is_error) {
	        # deliberate global side effect: the error log lives at script level
	        errors <<- c(errors, inventory_id)
	    }
	    is_error
	}


	# Load one solutions file and spread it into boolean feature columns.
	#
	# filename: path to a headerless CSV whose base name starts with the
	#     inventory ID (see id_from_filename()); each row is treated as one
	#     solution listing feature names.
	# Returns NULL when the first cell starts with "ERROR" (the ID is logged via
	# record_errors()); otherwise a data frame with one row per CSV row, one
	# logical column per entry of the global `feature_columns`, and an
	# `InventoryID` column.
	load_solutions <- function(filename) {
	    inventory_id <- id_from_filename(filename)
	    # keep cells as plain strings (the default only since R 4.0) so the
	    # "ERROR" prefix test also works on older R versions
	    solutions <- read.csv(filename, header=FALSE, stringsAsFactors=FALSE)
	    # record_errors() already reports whether this file is an error marker;
	    # reuse its answer instead of repeating the prefix test
	    is_error <- record_errors(inventory_id, solutions)
	    if (is_error) return(NULL)
	    # spread the features into boolean columns: apply() over rows yields a
	    # features-by-solutions logical matrix, which is then transposed so that
	    # each row is a solution
	    apply(as.matrix(solutions), 1,
	          function(feats) feature_columns %in% feats) %>%
	        as.data.frame(row.names=feature_columns) %>%
	        t() %>%
	        as.data.frame() ->
	        solutions_df
	    # record the inventory ID in a column
	    solutions_df$InventoryID <- inventory_id
	    solutions_df
	}


	# read every solutions file (error files contribute NULL), stack the
	# per-file tables into one, and group the rows by inventory
	all_solutions <- group_by(
	    do.call(rbind, lapply(filenames, load_solutions)),
	    InventoryID
	)

	# make sure we didn't lose any: every file must have produced either a
	# set of solutions or a recorded error
	n_valid_solutions <- n_distinct(all_solutions$InventoryID)
	# are_equal() on its own only returns TRUE/FALSE; wrapping it in
	# assert_that() makes a mismatch actually halt the script
	assertthat::assert_that(
	    assertthat::are_equal(length(filenames), n_valid_solutions + length(errors))
	)

	# count 'em up: number of solution rows in each inventory group,
	# largest count first
	n_solutions_per_inventory <- tally(all_solutions, name="n_solutions", sort=TRUE)

	# which features occur in the most solutions?
	# (drop the grouping and the ID column, sum each boolean feature column over
	# all rows, then order the named totals from most to least frequent)
	feature_occurrence_in_solutions_across_all_inventories <- sort(
	    unlist(summarise_all(select(ungroup(all_solutions), -InventoryID), sum)),
	    decreasing=TRUE
	)

	# which features occur in the most solutions (per inventory)?
	# (all_solutions is still grouped by InventoryID, so the sums are per group)
	feature_occurrence_in_solutions_by_inventory <- summarise_all(all_solutions, sum)