Skip to content

Instantly share code, notes, and snippets.

@tsvibt
Created January 19, 2025 07:19
Show Gist options
  • Select an option

  • Save tsvibt/53b1497dcd320e3a8403329d5c763ac9 to your computer and use it in GitHub Desktop.

Select an option

Save tsvibt/53b1497dcd320e3a8403329d5c763ac9 to your computer and use it in GitHub Desktop.
'''
How to use:
download the spreadsheet data by clicking File > Download > Web Page (.html),
then move the file to the present directory and unzip it with the 'unzip' command,
then run exec(open('code_gist.py').read()) in a prompt.
'''
import matplotlib.pyplot as plt
import numpy as np
import time
import pyperclip
import colorcet as cc
import math
from pathlib import Path
import shutil
from bs4 import BeautifulSoup
# Build a color palette with one RGBA row per dataset name.
# Pastel1 (8) + tab20 (20) + Set3 (12) + Pastel1 again (8) = 48 colors total;
# NOTE(review): Pastel1 appears twice, so colors repeat past ~40 names — presumably acceptable here.
colors1 = plt.cm.tab20(np.linspace(0, 1, 20))
colors2 = plt.cm.Set3(np.linspace(0, 1, 12))
colors3 = plt.cm.Pastel1(np.linspace(0, 1, 8))
all_colors = np.vstack((colors3, colors1, colors2, colors3))
# Global figure defaults: tall portrait figure, auto layout, slightly small font.
plt.rcParams["figure.figsize"] = [13.3, 18]
plt.rcParams["figure.autolayout"] = True
plt.rcParams.update({'font.size': 11})
def clean_file_name(name):
    """Return *name* with every space turned into an underscore (filename-safe-ish)."""
    return '_'.join(name.split(' '))
def relative_absolute_path(relative_path):
    """Resolve *relative_path* against the current working directory and
    return the absolute path as a plain string."""
    resolved = Path(relative_path).resolve()
    return str(resolved)
def sorted_data_bar_graph(data, graph_title, file_name, logscale=False):
    """Draw a cumulative stacked bar chart of genome counts per year.

    The original indentation of this function was destroyed by the HTML
    extraction; structure reconstructed here from the code's syntax.

    Args:
        data: list of row dicts (sorted by 'year') with at least the keys
            'name', 'year' (int) and '# genomes' (float).
        graph_title: title shown above the axes.
        file_name: base name (spaces allowed) for the saved image files.
        logscale: when True, each year's stacked bar is rescaled so its total
            height is log10(total) instead of the raw total.

    Side effects: shows nothing, but writes two PNGs under ./images/ (one
    timestamped, one '*_most_recent') and copies the latter's path to the
    clipboard. Prints one debug line per year.
    """
    # Names in first-appearance order: fixes both stacking order and color.
    ordered_names = []
    for d in data:
        if d['name'] not in ordered_names:
            ordered_names.append(d['name'])
    start_year = data[0]['year']
    end_year = data[-1]['year']
    year_range = list(range(start_year, end_year + 1))
    data_by_year = {year: [] for year in year_range}
    for datum in data:
        data_by_year[datum['year']].append(datum)
    # Carry each name's most recent datum forward year by year, so every bar
    # shows the cumulative latest-known count for every dataset.
    full_data = []
    full_data.append((start_year - 1, {}))
    for year in year_range:
        full_data.append((year,
            full_data[-1][1] | {datum['name']: datum for datum in data_by_year[year]}
        ))
    full_data = full_data[1:]
    colors = all_colors[:len(ordered_names)]
    def name_index(name): return ordered_names.index(name)
    def name_color(name): return colors[name_index(name)]
    fig, ax = plt.subplots()
    for year, full_ds in full_data:
        x = year - 2000
        bottom = 0
        name_data = sorted(full_ds.items(), key=lambda item: name_index(item[0]))
        total = sum(datum['# genomes'] for _, datum in name_data)
        # NOTE(review): total == 0 would divide by zero below; assumes every
        # year from the first datum onward has a positive total — confirm.
        totalbar = total if not logscale else np.log10(total) + .01
        print(x, total, totalbar, )
        scaling_factor = totalbar / total
        for name, datum in name_data:
            scaled_count = datum['# genomes'] * scaling_factor
            ax.bar(x, scaled_count, bottom=bottom, color=name_color(name), edgecolor='black')
            bottom += scaled_count
    def name_displaytext(name):
        # Legend label: "count (year), ...: name"; counts near 0.5 stay float.
        year_counts = []
        for d in data:
            if d['name'] == name:
                year_counts.append((d['year'], d['# genomes']))
        return ', '.join(f'{count if math.isclose(count, .5) else int(count):,} ({year})' for year, count in year_counts) + ': ' + name
    ax.legend([plt.Rectangle((0, 0), 1, 1, color=name_color(name)) for name in ordered_names],
              [name_displaytext(name) for name in ordered_names], loc='upper left', bbox_to_anchor=(0, -0.10), ncol=1)
    ax.set_xticks(range(25))
    ax.set_xticklabels([str(x).zfill(2) for x in range(25)])
    if logscale:
        ax.set_yticklabels([f"$10^{int(ytick)}$" for ytick in ax.get_yticks()])
    ax.set_xlabel("year (2000s)")
    ax.set_ylabel("genomes")
    ax.set_title(graph_title)
    plt.text(0.95, 0.035, 'Berkeley Genomics Project, 2025', fontsize=9.5, ha='right', va='bottom', transform=plt.gcf().transFigure)
    plt.subplots_adjust(bottom=0.7)
    file_format = 'png'
    save_filename = relative_absolute_path(f'./images/{clean_file_name(file_name)}{round(time.time())}.{file_format}')
    plt.savefig(save_filename)
    save_filename = relative_absolute_path(f'./images/{clean_file_name(file_name)}_most_recent.{file_format}')
    plt.savefig(save_filename)
    pyperclip.copy(save_filename)
def make_checkpoint(file_name, file_format='png'):
    """Snapshot the '*_most_recent' image as a '*_checkpoint' copy.

    Indentation reconstructed (lost in HTML extraction). Copies
    ./images/<name>_most_recent.<fmt> to ./images/<name>_checkpoint.<fmt>
    and puts a markdown image link to the checkpoint on the clipboard.
    """
    real_name = clean_file_name(file_name)
    recent_path = relative_absolute_path(f'./images/{real_name}_most_recent.{file_format}')
    checkpoint_path = relative_absolute_path(f'./images/{real_name}_checkpoint.{file_format}')
    shutil.copy(recent_path, checkpoint_path)
    pyperclip.copy(f'![]({checkpoint_path})')
def parse_html_table(file_path):
    """Extract all table rows from an HTML file.

    Indentation reconstructed (lost in HTML extraction).

    Args:
        file_path: path to a UTF-8 HTML file (the exported spreadsheet page).

    Returns:
        A list of rows, each a list of the stripped text of its <td> cells;
        rows with no <td> cells (e.g. header-only <th> rows) are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    rows = soup.find_all('tr')
    table_data = []
    for row in rows:
        cells = row.find_all('td')
        cell_texts = [cell.get_text(strip=True) for cell in cells]
        if cell_texts:  # drop rows that contributed no <td> content
            table_data.append(cell_texts)
    return table_data
# Load the exported spreadsheet and turn rows into dicts keyed by the header
# row (all_rows[3]); data rows start at index 4. Empty header cells are dropped.
file_path = 'Sheet1.html'
all_rows = parse_html_table(file_path)
#['confidence', 'name', 'year', '# genomes', 'type', 'parameter', '$ cost', 'funding', 'evidence', 'notes', '', '', '', '']
content_rows = [ {key:content_row[i] for i,key in enumerate(all_rows[3]) if key != ''}
for content_row in all_rows[4:]
]
# Discard placeholder/blank rows.
content_rows = [row for row in content_rows if row['name'] not in ['NO DATA', '']]
def row_yearInt(row):
    """Return the row's 'year' cell as an int, or 0 when blank/unknown.

    Indentation reconstructed (lost in HTML extraction). Trailing spaces and
    '?' uncertainty markers are stripped before parsing.
    """
    if (year := row['year'].strip(' ?')) == '':
        return 0
    else:
        return int(year)
# Normalize 'year' to int (0 = unknown) on every row.
content_rows = [row | {'year':row_yearInt(row)} for row in content_rows]
# Keep only rows with a known year and no '?' in the confidence cell, sorted by year.
confident_rows = sorted([x for x in content_rows if '?' not in x['confidence'] and x['year'] != 0],
key=lambda row: row['year'])
def num_genomes_float(genomes):
    """Parse a '# genomes' spreadsheet cell into a float.

    Indentation reconstructed (lost in HTML extraction). Handles thousands
    separators and case-insensitive 'k'/'m' suffixes, and strips spaces and
    '?' uncertainty markers: '1,234' -> 1234.0, '50k' -> 50000.0,
    '2.5M' -> 2500000.0, '7 ?' -> 7.0.
    """
    value = genomes.strip(' ?').replace(',', '').lower()
    if value.endswith('k'):
        return float(value[:-1]) * 1000
    elif value.endswith('m'):
        return float(value[:-1]) * 1000000
    else:
        return float(value)
# Normalize '# genomes' to float on the confident rows.
confident_rows = [row | {'# genomes':num_genomes_float(row['# genomes'])} for row in confident_rows]
# Split by assay type: SNP arrays vs whole-genome sequencing.
narrow_snp_rows = [x for x in confident_rows if x['type'].lower() == 'snp' ]
narrow_wgs_rows = [x for x in confident_rows if x['type'].lower() == 'wgs' ]
WHGS = 'whole human genomes sequenced'
# Only the log-scale WGS chart is rendered; the other variants are kept
# commented out for interactive use via exec() at a prompt.
#sorted_data_bar_graph(narrow_wgs_rows, f'{WHGS} (cumulative, convenience sample)', WHGS)
#make_checkpoint(WHGS)
sorted_data_bar_graph(narrow_wgs_rows, f'{WHGS} (cumulative, convenience sample)\nscaled to log_10 of total', WHGS + ' logscale', logscale=True)
#make_checkpoint(WHGS + ' logscale')
#sorted_data_bar_graph(narrow_snp_rows, 'SNP', 'SNP')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment