Skip to content

Instantly share code, notes, and snippets.

# DataFrame to CSV
# Write abstracts_df to a CSV file at a hard-coded absolute path.
# NOTE(review): per the pandas docs, to_csv returns None when a path is
# given, so export_csv presumably ends up as None -- confirm nothing reads it.
export_csv = abstracts_df.to_csv(r'/Users/lorajohns/Documents/Python/DNA/DNA_articles.csv')
# Get the base URL from SNPedia
base_url = 'https://www.snpedia.com/index.php/'
# Create URLs for each gene that I want to study:
# one URL per value in bad_genes['rsid'], formed by appending it to base_url.
gene_urls = [base_url + rsid for rsid in bad_genes['rsid']]
# Initialize Selenium (starts a Chrome WebDriver session).
# NOTE(review): no import of `webdriver` is visible in this excerpt --
# presumably it comes from selenium elsewhere in the notebook; confirm.
browser = webdriver.Chrome()
import time
# Write a function to visit the SNPedia URLs, click through to PubMed,
# and retrieve the info on the articles for each gene
def scrape_abstracts(urls):
# Build subsets of new_df keyed on its 'repute' and 'magnitude' columns.
good_genes = new_df.loc[new_df['repute'].eq('Good')]
bad_genes = new_df.loc[new_df['repute'].eq('Bad')]
# 4 is the threshold for "worth your time" given by SNPedia
interesting_genes = new_df.loc[new_df['magnitude'].gt(4)]
# Inner-join snp_df with df on the (rsid, genotype) pair, keeping only rows
# present in both frames. Any other column name shared by the two frames is
# disambiguated with the _SNPedia / _myDNA suffixes.
new_df = snp_df.merge(
    right=df,
    on=['rsid', 'genotype'],
    how='inner',
    suffixes=('_SNPedia', '_myDNA'),
)
# Gather every snp_df row that is missing a repute or a summary so the gaps
# can be inspected before they are filled.
null_repute = snp_df.loc[snp_df['repute'].isna()]
null_summaries = snp_df.loc[snp_df['summary'].isna()]
# Stack both selections, drop rows that appear in both, and renumber the index.
null_repute_and_summaries = (
    pd.concat([null_repute, null_summaries])
    .drop_duplicates()
    .reset_index(drop=True)
)
display(null_repute_and_summaries)
# Replace missing values with explicit placeholders. Note that 'None' here is
# the literal string, not Python's None.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and does not modify snp_df once
# Copy-on-Write is the default.
snp_df['repute'] = snp_df['repute'].fillna(value='Neutral')
snp_df['summary'] = snp_df['summary'].fillna(value='None')
# No no NaNette: report, per column, whether any NaN values remain.
snp_df.isna().any()
rsid False
magnitude False
# Normalize the rsid column in two passes.
# 1) Lower-case every value (the pattern below only matches lower-case text).
snp_df['rsid'] = snp_df['rsid'].map(lambda x : x.lower())
# 2) Strip a "(x;y)" genotype suffix: the regex matches lower-case letters
#    followed by digits and then "(b;b)" with b in a/g/c/t, and keeps only
#    the letters+digits part (group 1). Values without such a suffix are
#    returned unchanged by re.sub.
snp_df['rsid'] = snp_df['rsid'].map(lambda x :
re.sub(r'([a-z]{1,}[\d]+)\([agct];[agct]\)',
r'\1', x))
# Replace all column labels of snp_df in one step. Assigning to .columns is
# positional, so this assumes snp_df has exactly these five columns in this
# order -- TODO confirm against the frame at this point in the notebook.
new_cols = ['rsid', 'magnitude', 'repute',
'summary', 'genotype']
snp_df.columns = new_cols
# Derive a 'genotype' column from the raw 'Unnamed: 0' values. The regex
# consumes everything up to and including the last "X;Y)" (X, Y in A/G/C/T)
# and replaces it with the two captured bases, e.g. presumably
# "Rs123(A;G)" -> "AG". Values that do not match are left unchanged by re.sub.
snp_df['genotype'] = snp_df['Unnamed: 0'].apply(lambda x:
re.sub(r'.*([AGCT]);([AGCT])\)', r'\1\2', x))
# Load result.csv (path relative to the current working directory) into
# snp_df. NOTE(review): presumably this is the previously scraped SNPedia
# data -- confirm the file's origin.
snp_df = pd.read_csv('result.csv')
# Preview the first rows of the loaded frame.
snp_df.head()
# Print a summary (index, columns, non-null counts, dtypes) of the rows of df
# whose chromosome value equals 1; the captured output follows below.
df[df.chromosome == 1].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49514 entries, 0 to 49513
Data columns (total 4 columns):
rsid 49514 non-null object
chromosome 49514 non-null int64
position 49514 non-null int64
genotype 49514 non-null object