Skip to content

Instantly share code, notes, and snippets.

%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
sns.color_palette('Spectral')
import matplotlib.pyplot as plt
import numpy as np
import requests
import pandas as pd
# Load the tab-separated genome export. The parser treats '#' as a comment
# marker, so anything following it on a line is not parsed.
data = pd.read_csv(
    'genome.txt',
    sep='\t',
    comment='#',
    dtype={'rsid':'str', 'chromosome':'object', 'position':'int', 'genotype':'str'},
)
print(data)
rsid chromosome position genotype
0 rs548049170 1 69869 TT
1 rs13328684 1 74792 --
2 rs9283150 1 565508 AA
# Wrap the loaded table in a DataFrame named `df` and do some quick EDA:
# preview the first rows, then report per column whether it contains any
# missing values (True means at least one NaN in that column).
df = pd.DataFrame(data)
df.head()
df.isna().any()
rsid False
chromosome False
# List the distinct values present in the chromosome column.
df['chromosome'].unique()
array(['1', '2', '3', '4', '5', '6', '7', '8',
'9', '10', '11', '12','13', '14', '15',
'16', '17', '18', '19', '20', '21', '22',
'X','MT'], dtype=object)
df['chromosome'] = df['chromosome'].apply(lambda x:
# Summarise the rows whose chromosome equals the integer 1.
# NOTE(review): this comparison only matches if the chromosome column holds
# integers by this point; the conversion statement just above is truncated in
# this copy of the notebook - confirm it maps every label to an int.
df[df.chromosome == 1].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49514 entries, 0 to 49513
Data columns (total 4 columns):
rsid 49514 non-null object
chromosome 49514 non-null int64
position 49514 non-null int64
genotype 49514 non-null object
# Load the SNP annotation table and normalise its identifier columns.
snp_df = pd.read_csv('result.csv')
snp_df.head()

# The unnamed first column carries labels of the form <id>(<allele>;<allele>),
# which is the shape the patterns below expect. Collapse each label to its
# two allele letters to get the genotype. The vectorised .str accessor is used
# instead of a per-row re.sub lambda: it applies the same regex substitution,
# needs no `re` import, and passes missing labels through as NaN instead of
# raising.
snp_df['genotype'] = snp_df['Unnamed: 0'].str.replace(
    r'.*([AGCT]);([AGCT])\)', r'\1\2', regex=True)

# With `genotype` appended the frame has five columns; name all of them.
new_cols = ['rsid', 'magnitude', 'repute',
            'summary', 'genotype']
snp_df.columns = new_cols

# Lower-case the label, then strip the trailing "(x;y)" so only the bare
# identifier remains. Labels that do not match the pattern are left unchanged.
snp_df['rsid'] = (
    snp_df['rsid']
    .str.lower()
    .str.replace(r'([a-z]{1,}[\d]+)\([agct];[agct]\)', r'\1', regex=True)
)
# Gather the rows that are missing a repute or a summary (rows missing both
# appear once thanks to drop_duplicates) and show them before filling the gaps.
null_repute = snp_df[snp_df['repute'].isnull()]
null_summaries = snp_df[snp_df['summary'].isnull()]
null_repute_and_summaries = pd.concat([null_repute,null_summaries]).drop_duplicates().reset_index(drop=True)
display(null_repute_and_summaries)

# Fill the gaps by assigning the result back to the column. Calling
# fillna(inplace=True) on snp_df['col'] mutates a temporary selection: it
# triggers a chained-assignment warning in recent pandas and leaves snp_df
# untouched when Copy-on-Write is enabled.
snp_df['repute'] = snp_df['repute'].fillna(value='Neutral')
snp_df['summary'] = snp_df['summary'].fillna(value='None')
# No no NaNette
# Re-check every column for missing values; each entry is True only if that
# column still contains at least one NaN.
snp_df.isna().any()
rsid False
magnitude False