Ray L. Johns lorarjohns

## import.py
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
sns.color_palette('Spectral')
import matplotlib.pyplot as plt


import numpy as np
import requests
import pandas as pd

## output.py
# Parse the tab-separated genome export, ignoring '#' comment text.
# 'chromosome' is read as object rather than int because its values
# include non-numeric labels such as 'X' and 'MT'.
data = pd.read_csv(
    'genome.txt',
    sep='\t',
    dtype={
        'rsid': 'str',
        'chromosome': 'object',
        'position': 'int',
        'genotype': 'str',
    },
    comment='#',
)

# Show the parsed table
print(data)


rsid chromosome  position genotype
0       rs548049170          1     69869       TT
1        rs13328684          1     74792       --
2         rs9283150          1    565508       AA

## EDA.py
# Wrap the parsed table in a DataFrame for exploratory data analysis (EDA)
df = pd.DataFrame(data=data)

# Preview the first five rows
df.head(n=5)

# Report, per column, whether any value is missing
df.isnull().any()


rsid         False
chromosome    False

## DNA_dict.py
# List the distinct chromosome labels present in the data
df.chromosome.unique()


array(['1', '2', '3', '4', '5', '6', '7', '8',
      '9', '10', '11', '12','13', '14', '15',
      '16', '17', '18', '19', '20', '21', '22',
      'X','MT'], dtype=object)


df['chromosome'] = df['chromosome'].apply(lambda x:

## DNA_count.py
# Summarise only the rows whose chromosome value equals 1
df.loc[df['chromosome'] == 1].info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 49514 entries, 0 to 49513
Data columns (total 4 columns):
 rsid         49514 non-null object
chromosome    49514 non-null int64
position      49514 non-null int64
genotype      49514 non-null object

## snp_df.py
# Load result.csv into snp_df and preview its first five rows
snp_df = pd.read_csv(filepath_or_buffer='result.csv')
snp_df.head(n=5)

## regex1.py
# Derive a two-letter genotype (e.g. 'AG') from labels that end in a
# '(A;G)'-style suffix; labels that do not match the pattern are kept as-is.
# The vectorised .str accessor replaces the per-row lambda, so the snippet
# no longer relies on a separate `re` import, and missing labels pass
# through as NaN instead of raising.
snp_df['genotype'] = snp_df['Unnamed: 0'].str.replace(
    r'.*([AGCT]);([AGCT])\)', r'\1\2', regex=True)

## clean.py
# Replacement column labels for snp_df, applied by position
new_cols = [
    'rsid',
    'magnitude',
    'repute',
    'summary',
    'genotype',
]
snp_df.columns = new_cols

## regex2.py
# Normalise the SNP labels to bare lowercase ids: lowercase first, then
# drop the '(x;y)' genotype suffix that follows the id
# (e.g. 'rs123(a;g)' -> 'rs123').
# The vectorised .str accessor replaces the per-row lambdas, so the snippet
# no longer relies on a separate `re` import, and missing values stay NaN
# instead of raising.
snp_df['rsid'] = snp_df['rsid'].str.lower()
snp_df['rsid'] = snp_df['rsid'].str.replace(
    r'([a-z]{1,}[\d]+)\([agct];[agct]\)', r'\1', regex=True)

## NaN.py
# Gather every row that lacks a repute or a summary so the gaps can be
# inspected before they are filled.
null_repute = snp_df[snp_df['repute'].isnull()]
null_summaries = snp_df[snp_df['summary'].isnull()]
null_repute_and_summaries = (
    pd.concat([null_repute, null_summaries])
    .drop_duplicates()
    .reset_index(drop=True)
)
display(null_repute_and_summaries)

# Assign the filled columns back rather than calling fillna(inplace=True)
# on a column selection: that form is chained assignment, which pandas
# deprecates and which leaves snp_df unchanged under copy-on-write.
snp_df['repute'] = snp_df['repute'].fillna(value='Neutral')
snp_df['summary'] = snp_df['summary'].fillna(value='None')

# Report, per column, whether any missing values remain
snp_df.isna().any()
rsid         False
magnitude    False
	%matplotlib inline
	import seaborn as sns
	sns.set_style('darkgrid')
	sns.color_palette('Spectral')
	import matplotlib.pyplot as plt


	import numpy as np
	import requests
	import pandas as pd
	data = pd.read_csv('genome.txt', sep='\t', dtype={'rsid':'str', 'chromosome':'object', 'position':'int', 'genotype':'str'}, comment='#')


	print(data)


	rsid chromosome position genotype
	0 rs548049170 1 69869 TT
	1 rs13328684 1 74792 --
	2 rs9283150 1 565508 AA
	# Read the data into a pandas DataFrame and do some EDA
	df = pd.DataFrame(data)

	df.head()

	df.isna().any()


	rsid False
	chromosome False
	df['chromosome'].unique()


	array(['1', '2', '3', '4', '5', '6', '7', '8',
	'9', '10', '11', '12','13', '14', '15',
	'16', '17', '18', '19', '20', '21', '22',
	'X','MT'], dtype=object)


	df['chromosome'] = df['chromosome'].apply(lambda x:
	df[df.chromosome == 1].info()


	<class 'pandas.core.frame.DataFrame'>
	Int64Index: 49514 entries, 0 to 49513
	Data columns (total 4 columns):
	rsid 49514 non-null object
	chromosome 49514 non-null int64
	position 49514 non-null int64
	genotype 49514 non-null object
	# Derive a two-letter genotype (e.g. 'AG') from labels that end in a
	# '(A;G)'-style suffix; labels that do not match the pattern are kept as-is.
	# The vectorised .str accessor replaces the per-row lambda, so the snippet
	# no longer relies on a separate `re` import, and missing labels pass
	# through as NaN instead of raising.
	snp_df['genotype'] = snp_df['Unnamed: 0'].str.replace(
		r'.*([AGCT]);([AGCT])\)', r'\1\2', regex=True)
	new_cols = ['rsid', 'magnitude', 'repute',
	'summary', 'genotype']
	snp_df.columns = new_cols
	# Normalise the SNP labels to bare lowercase ids: lowercase first, then
	# drop the '(x;y)' genotype suffix that follows the id
	# (e.g. 'rs123(a;g)' -> 'rs123').
	# The vectorised .str accessor replaces the per-row lambdas, so the snippet
	# no longer relies on a separate `re` import, and missing values stay NaN
	# instead of raising.
	snp_df['rsid'] = snp_df['rsid'].str.lower()
	snp_df['rsid'] = snp_df['rsid'].str.replace(
		r'([a-z]{1,}[\d]+)\([agct];[agct]\)', r'\1', regex=True)
	# Gather every row that lacks a repute or a summary so the gaps can be
	# inspected before they are filled.
	null_repute = snp_df[snp_df['repute'].isnull()]
	null_summaries = snp_df[snp_df['summary'].isnull()]
	null_repute_and_summaries = pd.concat([null_repute, null_summaries]).drop_duplicates().reset_index(drop=True)
	display(null_repute_and_summaries)
	# Assign the filled columns back rather than calling fillna(inplace=True)
	# on a column selection: that form is chained assignment, which pandas
	# deprecates and which leaves snp_df unchanged under copy-on-write.
	snp_df['repute'] = snp_df['repute'].fillna(value='Neutral')
	snp_df['summary'] = snp_df['summary'].fillna(value='None')
	# Report, per column, whether any missing values remain
	snp_df.isna().any()
	rsid False
	magnitude False