Last active
October 6, 2020 02:28
-
-
Save mr-eyes/34c2eaea7afa531e5951711064489215 to your computer and use it in GitHub Desktop.
Merge and process tables (special)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 | 2 | 2 | 1 | 3 | |
---|---|---|---|---|---|
1 | 2 | 3 | 4 | 5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
exp1 | exp2 | exp3 | exp4 | exp5 | |
---|---|---|---|---|---|
1 | 2 | 3 | 4 | 5 | |
4 | 3 | 2 | 1 | 1 | |
5 | 2 | 2 | 5 | 2 | |
10 | 0 | 8 | 20 | 6 | |
8 | 9 | 4.5 | 90 | 5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
from contextlib import ExitStack
from statistics import mean, median
USAGE = "run: python transform.py <data TSV> <clusters TSV> <func:max,min,mean,median>"

# Every accepted <func> command-line name and the aggregation it selects.
# "avg" is kept as an alias of "mean".
AGGREGATORS = {
    "max": max,
    "min": min,
    "avg": mean,
    "mean": mean,
    "median": median,
}


def _read_clusters(path):
    """Parse the clusters TSV into one column grouping per run.

    Each non-blank line describes one run: the i-th tab-separated field is
    the cluster label of data column i. Blank lines are skipped so that a
    trailing newline does not produce an extra, meaningless run.

    Returns:
        A list with one entry per run. Each entry is a list of column-index
        lists (0-based), one per cluster, ordered by the first appearance of
        the cluster label on the line.
    """
    runs = []
    with open(path) as reader:
        for line in reader:
            stripped = line.strip()
            if not stripped:
                continue
            columns_by_label = {}
            for column, label in enumerate(stripped.split('\t')):
                columns_by_label.setdefault(label, []).append(column)
            # Only the grouping and its order matter downstream; the labels
            # themselves are never written out.
            runs.append(list(columns_by_label.values()))
    return runs


def _aggregate_row(values, column_groups, func):
    """Collapse one data row: apply ``func`` to the values of each group."""
    return [func([values[column] for column in columns])
            for columns in column_groups]


def main(argv=None):
    """Collapse the columns of a data TSV cluster-by-cluster, once per run.

    For every run (line) in the clusters file, writes ``run_<i>_<func>.tsv``
    into the current directory (``i`` starts at 1). Each output row holds one
    aggregated value per cluster of the corresponding data row.

    Args:
        argv: Argument vector laid out like ``sys.argv``:
            ``[prog, data_tsv, clusters_tsv, func_name]``. Defaults to
            ``sys.argv``.

    Raises:
        SystemExit: If fewer than three arguments are given or ``func_name``
            is not one of the keys of ``AGGREGATORS``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit(USAGE)

    data_file, clusters_file, func_name = argv[1:4]

    # Reject unknown names up front instead of failing with a TypeError on
    # the first data row after the output files were already created.
    func = AGGREGATORS.get(func_name)
    if func is None:
        sys.exit(f"unknown func '{func_name}'\n{USAGE}")

    runs = _read_clusters(clusters_file)

    # ExitStack closes the reader and every writer even if a row fails to
    # parse part-way through the data file.
    with ExitStack() as stack:
        reader = stack.enter_context(open(data_file))
        writers = [
            stack.enter_context(open(f"run_{run_id}_{func_name}.tsv", 'w'))
            for run_id in range(1, len(runs) + 1)
        ]

        next(reader, None)  # Skip header (delete if no header)
        for line in reader:
            stripped = line.strip()
            if not stripped:
                continue
            values = [float(cell) for cell in stripped.split('\t')]
            for writer, column_groups in zip(writers, runs):
                collapsed = _aggregate_row(values, column_groups, func)
                writer.write('\t'.join(map(str, collapsed)) + '\n')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment