Last active
August 14, 2021 04:33
-
-
Save david-a-parry/a05863750bdc8b3934020b069db66359 to your computer and use it in GitHub Desktop.
Example to convert excel corrupted human gene names back from dates to gene symbols
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import warnings | |
import re | |
import pandas as pd | |
date_re = re.compile(r'\d{4}-\d{2}-\d{2}') | |
mnth2str = {3: 'MARCH', 9: 'SEPT', 10: 'OCT', 12: 'DEC'} | |
def date2gene(x): | |
try: | |
mnth = mnth2str[pd.to_datetime(x).month] | |
day = pd.to_datetime(x).day | |
return "{}{}".format(mnth, day) | |
except KeyError: | |
warnings.warn("Could not convert date: {}".format(x)) | |
return x | |
def convert_column(df, column): | |
''' | |
Convert dates in column to gene names. | |
Args: | |
df (pandas.DataFrame): | |
dataframe to convert | |
column (str): | |
name of column with gene names to convert | |
''' | |
df[column] = df[column].apply(lambda x: date2gene(x) | |
if date_re.match(str(x)) else x) | |
xlsx = '/path/to/file.xlsx' # edit as appropriate | |
sheet_name = 'my_gene_list' # edit as appropriate | |
column = 'gene_sybols' # edit as appropriate | |
df = pd.read_excel(xlsx, sheet_name=sheet_name) | |
convert_column(df, column) | |
f, ext = os.path.splitext(xlsx) | |
out = f + '.date2gene.xlsx' | |
df.to_excel(out, sheet_name=sheet_name, index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment