Skip to content

Instantly share code, notes, and snippets.

@mr-eyes
Last active October 6, 2020 02:28
Show Gist options
  • Save mr-eyes/34c2eaea7afa531e5951711064489215 to your computer and use it in GitHub Desktop.
Save mr-eyes/34c2eaea7afa531e5951711064489215 to your computer and use it in GitHub Desktop.
Merge and process tables (special)
1 2 2 1 3
1 2 3 4 5
exp1 exp2 exp3 exp4 exp5
1 2 3 4 5
4 3 2 1 1
5 2 2 5 2
10 0 8 20 6
8 9 4.5 90 5
import sys
import os
from statistics import mean, median
# Usage text printed when too few command-line arguments are given.
_USAGE = "run: python transform.py <data TSV> <clusters TSV> <func:max,min,mean,median>"

# Aggregation names accepted on the command line ("avg" is an alias of "mean").
_AGGREGATORS = {
    "max": max,
    "min": min,
    "avg": mean,
    "mean": mean,
    "median": median,
}


def _read_clusterings(clusters_file):
    """Parse the clusters TSV into one ``{cluster ID: [column numbers]}`` dict per run.

    Every non-blank line describes one run: its N-th tab-separated field is
    the cluster ID assigned to data column N (1-based).  Cluster IDs are kept
    as strings, and each dict preserves the order in which the IDs first
    appear on the line, which is also the column order of the output.

    Blank lines are skipped so that a trailing empty line does not create a
    bogus extra run; runs are therefore numbered by non-blank line.
    """
    clusterings = []
    with open(clusters_file) as reader:
        for raw_line in reader:
            stripped = raw_line.strip()
            if not stripped:
                continue
            columns_by_cluster = {}
            for col_id, cluster_id in enumerate(stripped.split('\t'), start=1):
                columns_by_cluster.setdefault(cluster_id, []).append(col_id)
            clusterings.append(columns_by_cluster)
    return clusterings


def _collapse_row(values, columns_by_cluster, func):
    """Return one aggregated value per cluster for a single data row.

    ``values`` is the row as a list of floats; column numbers stored in
    ``columns_by_cluster`` are 1-based, hence the ``- 1`` when indexing.
    """
    return [
        func([values[col_id - 1] for col_id in col_ids])
        for col_ids in columns_by_cluster.values()
    ]


def main(argv=None):
    """Collapse the columns of a data TSV per cluster, once for every run.

    ``argv`` defaults to ``sys.argv`` and must hold, after the program name:
    the data TSV path, the clusters TSV path and the aggregation name
    (``max``, ``min``, ``mean``/``avg`` or ``median``).

    For run number ``i`` (the i-th non-blank line of the clusters file) a
    file ``run_<i>_<func>.tsv`` is written to the current directory with one
    line per data row and one tab-separated aggregated value per cluster.

    Exits via ``sys.exit`` with a message when arguments are missing or the
    aggregation name is not recognised.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit(_USAGE)

    data_file, clusters_file, func_name = argv[1:4]

    # Reject unknown names before any output file is created; previously this
    # surfaced later as "'NoneType' object is not callable".
    func = _AGGREGATORS.get(func_name)
    if func is None:
        sys.exit(
            f"unknown function '{func_name}'; "
            "choose one of: max, min, mean, avg, median"
        )

    clusterings = _read_clusterings(clusters_file)

    writers = []
    try:
        # One output file per run, in the same order as `clusterings`.
        for run_id in range(1, len(clusterings) + 1):
            writers.append(open(f"run_{run_id}_{func_name}.tsv", 'w'))

        with open(data_file) as reader:
            # Skip header (delete if no header); tolerate an empty file.
            next(reader, None)
            for raw_line in reader:
                stripped = raw_line.strip()
                if not stripped:
                    continue  # blank lines carry no data
                values = [float(field) for field in stripped.split('\t')]
                for writer, columns_by_cluster in zip(writers, clusterings):
                    collapsed = _collapse_row(values, columns_by_cluster, func)
                    writer.write('\t'.join(map(str, collapsed)) + '\n')
    finally:
        # Close every output file even when a row fails to parse.
        for writer in writers:
            writer.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment