-
-
Save tsvibt/53b1497dcd320e3a8403329d5c763ac9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ''' | |
| how to use: | |
| download the spreadsheet data by clicking File > Download > Web Page (.html) | |
| then move the file to the present directory and unzip with the 'unzip' command | |
| then run exec(open('code_gist.py').read()) in a prompt | |
| ''' | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import time | |
| import pyperclip | |
| import colorcet as cc | |
| import math | |
| from pathlib import Path | |
| import shutil | |
| from bs4 import BeautifulSoup | |
# Build one long palette by sampling evenly spaced positions from three
# matplotlib colormaps and stacking the samples row-wise.
colors1 = plt.cm.tab20(np.linspace(0, 1, 20))
colors2 = plt.cm.Set3(np.linspace(0, 1, 12))
colors3 = plt.cm.Pastel1(np.linspace(0, 1, 8))
# colors3 appears at both the start and the end of the palette.
all_colors = np.concatenate([colors3, colors1, colors2, colors3], axis=0)

# Global figure defaults: a tall page that leaves room for a long legend
# underneath the axes.
plt.rcParams.update({
    "figure.figsize": [13.3, 18],
    "figure.autolayout": True,
    "font.size": 11,
})
def clean_file_name(name):
    """Return *name* with every space character replaced by an underscore."""
    return '_'.join(name.split(' '))
def relative_absolute_path(relative_path):
    """Resolve *relative_path* against the current directory and return it as a string."""
    resolved = Path(relative_path).resolve()
    return str(resolved)
def _cumulative_snapshots(data):
    """Return one ``(year, {name: datum})`` snapshot per year spanned by *data*.

    Every snapshot carries forward the entries of all earlier years; a newer
    datum for a name replaces that name's older datum. Years with no rows
    still get a snapshot (identical to the previous year's).
    """
    data_by_year = {}
    for datum in data:
        # Later rows for the same (year, name) overwrite earlier ones.
        data_by_year.setdefault(datum['year'], {})[datum['name']] = datum

    first_year = min(data_by_year)
    last_year = max(data_by_year)
    snapshots = []
    latest_by_name = {}
    for year in range(first_year, last_year + 1):
        # ``|`` builds a new dict, so earlier snapshots are never mutated.
        latest_by_name = latest_by_name | data_by_year.get(year, {})
        snapshots.append((year, latest_by_name))
    return snapshots


def sorted_data_bar_graph(data, graph_title, file_name, logscale=False):
    """Draw a stacked, cumulative bar chart of genome counts per year and save it.

    Parameters:
        data: non-empty sequence of dicts, each with the keys ``'name'``,
            ``'year'`` (int) and ``'# genomes'`` (number). Rows may be in any
            order; names are stacked and coloured in order of first appearance.
        graph_title: title placed above the axes.
        file_name: base name for the saved images (spaces become underscores).
        logscale: when true, each bar's total height is ``log10(total) + .01``
            and the segments inside the bar are scaled proportionally.

    Side effects:
        Prints one line per plotted year, writes a timestamped PNG and a
        ``*_most_recent.png`` under ``./images`` (created if missing), and
        copies the absolute path of the latter to the clipboard.

    Raises:
        ValueError: if *data* is empty.
    """
    if not data:
        raise ValueError('sorted_data_bar_graph needs at least one data row')

    # dict.fromkeys de-duplicates while preserving first-appearance order.
    ordered_names = list(dict.fromkeys(datum['name'] for datum in data))
    name_rank = {name: rank for rank, name in enumerate(ordered_names)}

    def name_color(name):
        # Wrap around so more names than palette rows cannot raise IndexError.
        return all_colors[name_rank[name] % len(all_colors)]

    snapshots = _cumulative_snapshots(data)

    fig, ax = plt.subplots()
    for year, snapshot in snapshots:
        x = year - 2000
        name_data = sorted(snapshot.items(), key=lambda item: name_rank[item[0]])
        total = sum(datum['# genomes'] for _, datum in name_data)
        if total <= 0:
            # Nothing to draw; also avoids log10(0) and a division by zero.
            continue
        # The +.01 keeps a total of 1 (log10 == 0) visible as a thin sliver.
        totalbar = np.log10(total) + .01 if logscale else total
        print(x, total, totalbar, )
        scaling_factor = totalbar / total
        bottom = 0
        for name, datum in name_data:
            scaled_count = datum['# genomes'] * scaling_factor
            ax.bar(x, scaled_count, bottom=bottom, color=name_color(name), edgecolor='black')
            bottom += scaled_count

    # Collect the (year, count) history of every name in a single pass.
    history_by_name = {name: [] for name in ordered_names}
    for datum in data:
        history_by_name[datum['name']].append((datum['year'], datum['# genomes']))

    def name_displaytext(name):
        # Counts are shown as integers, except the special value 0.5.
        return ', '.join(
            f'{count if math.isclose(count, .5) else int(count):,} ({year})'
            for year, count in history_by_name[name]
        ) + ': ' + name

    ax.legend(
        [plt.Rectangle((0, 0), 1, 1, color=name_color(name)) for name in ordered_names],
        [name_displaytext(name) for name in ordered_names],
        loc='upper left', bbox_to_anchor=(0, -0.10), ncol=1,
    )

    # At least 00..24 as before, extended when the data runs past 2024.
    tick_count = max(25, snapshots[-1][0] - 2000 + 1)
    ax.set_xticks(range(tick_count))
    ax.set_xticklabels([str(x).zfill(2) for x in range(tick_count)])
    if logscale:
        # A formatter function labels whatever ticks the locator finally
        # picks. The exponent is braced so multi-character exponents render
        # correctly in mathtext, and ':g' keeps non-integer ticks honest.
        ax.yaxis.set_major_formatter(lambda value, _pos: f"$10^{{{value:g}}}$")
    ax.set_xlabel("year (2000s)")
    ax.set_ylabel("genomes")
    ax.set_title(graph_title)
    plt.text(0.95, 0.035, 'Berkeley Genomics Project, 2025', fontsize=9.5, ha='right', va='bottom', transform=plt.gcf().transFigure)
    plt.subplots_adjust(bottom=0.7)

    file_format = 'png'
    # savefig does not create missing directories.
    Path('./images').mkdir(parents=True, exist_ok=True)
    save_filename = relative_absolute_path(f'./images/{clean_file_name(file_name)}{round(time.time())}.{file_format}')
    plt.savefig(save_filename)
    save_filename = relative_absolute_path(f'./images/{clean_file_name(file_name)}_most_recent.{file_format}')
    plt.savefig(save_filename)
    pyperclip.copy(save_filename)
def make_checkpoint(file_name, file_format='png'):
    """Copy the ``*_most_recent`` image for *file_name* to a ``*_checkpoint`` file.

    Both files live under ``./images``; *file_name* has its spaces replaced
    with underscores before the paths are built.
    """
    stem = clean_file_name(file_name)
    source = relative_absolute_path(f'./images/{stem}_most_recent.{file_format}')
    destination = relative_absolute_path(f'./images/{stem}_checkpoint.{file_format}')
    shutil.copy(source, destination)
    # NOTE(review): this puts an empty string on the clipboard; the original
    # content of the copied text may have been lost -- confirm the intent.
    pyperclip.copy('')
def parse_html_table(file_path):
    """Return the text of every ``<td>`` cell in *file_path*, grouped by ``<tr>`` row.

    The file is read as UTF-8 and parsed with BeautifulSoup's 'html.parser'.
    Cell text is whitespace-stripped; rows that contain no ``<td>`` cells are
    left out of the result.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    texts_per_row = (
        [cell.get_text(strip=True) for cell in row.find_all('td')]
        for row in soup.find_all('tr')
    )
    return [texts for texts in texts_per_row if texts]
file_path = 'Sheet1.html'
all_rows = parse_html_table(file_path)

# Row index 3 holds the column headers; every later row is data.
# Expected headers:
#['confidence', 'name', 'year', '# genomes', 'type', 'parameter', '$ cost', 'funding', 'evidence', 'notes', '', '', '', '']
header_row = all_rows[3]
# Map each data row to {header: cell}, ignoring columns with a blank header.
content_rows = [
    {key: cells[i] for i, key in enumerate(header_row) if key != ''}
    for cells in all_rows[4:]
]
# Drop placeholder rows and rows without a name.
content_rows = [
    row for row in content_rows
    if row['name'] != 'NO DATA' and row['name'] != ''
]
def row_yearInt(row):
    """Return ``row['year']`` as an int, or 0 when it is blank.

    Surrounding spaces and question marks are stripped before conversion.
    """
    cleaned = row['year'].strip(' ?')
    return int(cleaned) if cleaned else 0
# Replace each row's textual year with its integer form (0 when blank).
content_rows = [{**row, 'year': row_yearInt(row)} for row in content_rows]

# Keep rows whose confidence has no '?' and whose year is known, then order
# them by year.
confident_rows = [
    row for row in content_rows
    if not ('?' in row['confidence'] or row['year'] == 0)
]
confident_rows.sort(key=lambda row: row['year'])
def num_genomes_float(genomes):
    """Convert a genome-count string such as '1,200', '35k' or '1.5M?' to a float.

    Surrounding spaces and question marks are stripped, thousands separators
    are removed, and a trailing 'k' or 'm' (any case) multiplies the number
    by one thousand or one million respectively.
    """
    text = genomes.strip(' ?').replace(',', '').lower()
    for suffix, multiplier in (('k', 1000), ('m', 1000000)):
        if text.endswith(suffix):
            return float(text[:-1]) * multiplier
    return float(text)
# Turn each textual genome count into a float.
confident_rows = [
    {**row, '# genomes': num_genomes_float(row['# genomes'])}
    for row in confident_rows
]

# Split the rows by their 'type' column (case-insensitive).
narrow_snp_rows = list(filter(lambda row: row['type'].lower() == 'snp', confident_rows))
narrow_wgs_rows = list(filter(lambda row: row['type'].lower() == 'wgs', confident_rows))

WHGS = 'whole human genomes sequenced'
#sorted_data_bar_graph(narrow_wgs_rows, f'{WHGS} (cumulative, convenience sample)', WHGS)
#make_checkpoint(WHGS)
sorted_data_bar_graph(
    narrow_wgs_rows,
    f'{WHGS} (cumulative, convenience sample)\nscaled to log_10 of total',
    WHGS + ' logscale',
    logscale=True,
)
#make_checkpoint(WHGS + ' logscale')
#sorted_data_bar_graph(narrow_snp_rows, 'SNP', 'SNP')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment