Ray L. Johns lorarjohns

## to_csv.py
# Save abstracts_df to disk as a CSV file.
# NOTE: when a path is supplied, DataFrame.to_csv writes the file and
# returns None, so export_csv does not hold the CSV text.
export_csv = abstracts_df.to_csv(
    path_or_buf=r'/Users/lorajohns/Documents/Python/DNA/DNA_articles.csv'
)

## DNA_scraper.py
# Get the base URL from SNPedia; an rsid is appended to it to form each page URL
base_url = 'https://www.snpedia.com/index.php/'
# Create URLs for each gene that I want to study: one per entry in the
# 'rsid' column of bad_genes (plain string concatenation, so each rsid
# must already be a str)
gene_urls = [base_url + rsid for rsid in bad_genes['rsid']]
# Initialize Selenium by starting a Chrome WebDriver session
# NOTE(review): no browser.quit() is visible in this snippet — confirm the
# session is closed once scraping is finished
browser = webdriver.Chrome()
import time
# Write a function to visit the SNPedia URLs, click through to PubMed,
# and retrieve the info on the articles for each gene
def scrape_abstracts(urls):

## dfs.py
# Carve the merged table into subsets, each selected with a boolean row mask
good_genes = new_df.loc[new_df['repute'].eq('Good')]
bad_genes = new_df.loc[new_df['repute'].eq('Bad')]
# 4 is the threshold for "worth your time" given by SNPedia; only rows with
# a magnitude strictly above it are kept
interesting_genes = new_df.loc[new_df['magnitude'].gt(4)]

## merge.py
# Inner join of snp_df with df: a row survives only when the same
# (rsid, genotype) pair occurs in both frames. Any other column name the two
# frames share is disambiguated with the _SNPedia / _myDNA suffixes.
new_df = snp_df.merge(
    df,
    on=['rsid', 'genotype'],
    how='inner',
    suffixes=('_SNPedia', '_myDNA'),
)

## NaN.py
# Collect the rows where 'repute' or 'summary' is missing and show them
# together for inspection before filling the gaps.
null_repute = snp_df[snp_df['repute'].isnull()]
null_summaries = snp_df[snp_df['summary'].isnull()]
# A row missing both fields appears in both selections; drop_duplicates
# collapses identical rows so each is displayed once.
null_repute_and_summaries = (
    pd.concat([null_repute, null_summaries])
    .drop_duplicates()
    .reset_index(drop=True)
)
display(null_repute_and_summaries)
# Fill the missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on snp_df['col'] is a chained in-place
# operation on an intermediate Series: pandas 2.x warns about it and, with
# Copy-on-Write enabled, it leaves snp_df unchanged.
snp_df['repute'] = snp_df['repute'].fillna(value='Neutral')
snp_df['summary'] = snp_df['summary'].fillna(value='None')
# No no NaNette: confirm that no column still contains missing values
snp_df.isna().any()
rsid         False
magnitude    False

## regex2.py
# Normalise every rsid in a single pass: lower-case it, then strip a
# "(x;y)" genotype marker that directly follows the id,
# e.g. 'Rs1234(A;G)' -> 'rs1234'. Values without such a marker are only
# lower-cased.
snp_df['rsid'] = snp_df['rsid'].map(
    lambda raw: re.sub(
        r'([a-z]{1,}[\d]+)\([agct];[agct]\)',
        r'\1',
        raw.lower(),
    )
)

## clean.py
# Replace all column labels at once. The names are applied positionally, so
# the list must have exactly one entry per existing column of snp_df.
new_cols = [
    'rsid',
    'magnitude',
    'repute',
    'summary',
    'genotype',
]
snp_df.columns = new_cols

## regex1.py
# Derive a two-letter genotype from the 'Unnamed: 0' label: everything up to
# and including "X;Y)" is replaced by "XY", e.g. 'Rs1234(A;G)' -> 'AG'.
# Labels that do not contain such a pair are copied through unchanged.
snp_df['genotype'] = snp_df['Unnamed: 0'].map(
    lambda label: re.sub(r'.*([AGCT]);([AGCT])\)', r'\1\2', label)
)

## snp_df.py
# Load result.csv (resolved against the current working directory) into
# snp_df, then preview its first rows.
snp_df = pd.read_csv(filepath_or_buffer='result.csv')
snp_df.head()

## DNA_count.py
# Summarise (row count, dtypes, non-null counts) the rows of df whose
# chromosome value equals 1
df.loc[df['chromosome'].eq(1)].info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 49514 entries, 0 to 49513
Data columns (total 4 columns):
 rsid         49514 non-null object
chromosome    49514 non-null int64
position      49514 non-null int64
genotype      49514 non-null object
	# DataFrame to CSV
	export_csv = abstracts_df.to_csv(r'/Users/lorajohns/Documents/Python/DNA/DNA_articles.csv')
	# Get the base URL from SNPedia
	base_url = 'https://www.snpedia.com/index.php/'
	# Create URLs for each gene that I want to study
	gene_urls = [base_url + rsid for rsid in bad_genes['rsid']]
	# Initialize Selenium
	browser = webdriver.Chrome()
	import time
	# Write a function to visit the SNPedia URLs, click through to PubMed,
	# and retrieve the info on the articles for each gene
	def scrape_abstracts(urls):
	# Create a DataFrame for some subsets of genes
	good_genes = new_df[new_df.repute == 'Good']
	bad_genes = new_df[new_df.repute == 'Bad']
	interesting_genes = new_df[new_df.magnitude > 4] # 4 is the threshold for "worth your time" given by SNPedia
	null_repute = snp_df[snp_df['repute'].isnull()]
	null_summaries = snp_df[snp_df['summary'].isnull()]
	null_repute_and_summaries = pd.concat([null_repute,null_summaries]).drop_duplicates().reset_index(drop=True)
	display(null_repute_and_summaries)
	snp_df['repute'].fillna(value='Neutral', inplace=True)
	snp_df['summary'].fillna(value='None', inplace=True)
	# No no NaNette
	snp_df.isna().any()
	rsid False
	magnitude False
	snp_df['rsid'] = snp_df['rsid'].map(lambda x : x.lower())
	snp_df['rsid'] = snp_df['rsid'].map(lambda x :
	re.sub(r'([a-z]{1,}[\d]+)\([agct];[agct]\)',
	r'\1', x))
	new_cols = ['rsid', 'magnitude', 'repute',
	'summary', 'genotype']
	snp_df.columns = new_cols
	snp_df['genotype'] = snp_df['Unnamed: 0'].apply(lambda x:
	re.sub(r'.*([AGCT]);([AGCT])\)', r'\1\2', x))
	df[df.chromosome == 1].info()


	<class 'pandas.core.frame.DataFrame'>
	Int64Index: 49514 entries, 0 to 49513
	Data columns (total 4 columns):
	rsid 49514 non-null object
	chromosome 49514 non-null int64
	position 49514 non-null int64
	genotype 49514 non-null object