Skip to content

Instantly share code, notes, and snippets.

@david-a-parry
Last active August 14, 2021 04:33
Show Gist options
  • Save david-a-parry/a05863750bdc8b3934020b069db66359 to your computer and use it in GitHub Desktop.
Save david-a-parry/a05863750bdc8b3934020b069db66359 to your computer and use it in GitHub Desktop.
Example to convert excel corrupted human gene names back from dates to gene symbols
#!/usr/bin/env python3
import os
import warnings
import re
import pandas as pd
date_re = re.compile(r'\d{4}-\d{2}-\d{2}')
mnth2str = {3: 'MARCH', 9: 'SEPT', 10: 'OCT', 12: 'DEC'}
def date2gene(x):
try:
mnth = mnth2str[pd.to_datetime(x).month]
day = pd.to_datetime(x).day
return "{}{}".format(mnth, day)
except KeyError:
warnings.warn("Could not convert date: {}".format(x))
return x
def convert_column(df, column):
'''
Convert dates in column to gene names.
Args:
df (pandas.DataFrame):
dataframe to convert
column (str):
name of column with gene names to convert
'''
df[column] = df[column].apply(lambda x: date2gene(x)
if date_re.match(str(x)) else x)
xlsx = '/path/to/file.xlsx' # edit as appropriate
sheet_name = 'my_gene_list' # edit as appropriate
column = 'gene_sybols' # edit as appropriate
df = pd.read_excel(xlsx, sheet_name=sheet_name)
convert_column(df, column)
f, ext = os.path.splitext(xlsx)
out = f + '.date2gene.xlsx'
df.to_excel(out, sheet_name=sheet_name, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment