Skip to content

Instantly share code, notes, and snippets.

@Bamimore-Tomi
Created November 10, 2020 05:49
Show Gist options
  • Save Bamimore-Tomi/258bef6bd8a1b70337c31f2c2e53356d to your computer and use it in GitHub Desktop.
Save Bamimore-Tomi/258bef6bd8a1b70337c31f2c2e53356d to your computer and use it in GitHub Desktop.
Build a scraper to collect information on over 2,000 diseases
from bs4 import BeautifulSoup as BS
import requests
import pickle
import string
# Scrape the Mayo Clinic A-Z disease index: for every letter, collect each
# ailment's name, then follow its detail page and gather the text under the
# "Overview", "Symptoms" and "Causes" headings. Results are pickled to disk.
alphabets = string.ascii_uppercase
data = []
total_scrapped = 0  # ailments successfully scraped
total_skipped = 0   # letters whose index page could not be parsed
base_url = 'https://www.mayoclinic.org'

for letter in alphabets:
    url = 'https://www.mayoclinic.org/diseases-conditions/index?letter=' + letter
    page = requests.get(url)
    soup = BS(page.content, "html.parser")
    try:
        ailment_soup = soup.find('div', {'id': 'index'}).find_all('li')
    except AttributeError:
        # find() returned None: the index container is missing for this letter.
        print("COULD NOT GET")
        total_skipped += 1
        continue
    for item in ailment_soup:
        # Some entries wrap the name in a span; fall back to the bare <li> text.
        span = item.find('span', {'class': 'inline-link'})
        temp = span.get_text() if span is not None else item.get_text()
        ailment = temp.replace('(See:', '')
        ailment_details_link = item.find('a').get_attribute_list('href')[0]

        page_2 = requests.get(base_url + ailment_details_link)
        soup_2 = BS(page_2.content, "html.parser")
        caption = soup_2.find('p', {'class': 'caption'})
        if caption is not None:
            caption = caption.get_text()

        overview_temp = []
        symptoms_temp = []
        causes_temp = []
        # NOTE(review): the original inner loop was truncated in the paste
        # (it referenced an undefined `curr_tag`). Reconstructed here as:
        # for each <h2> of interest, walk following siblings until the next
        # <h2>, accumulating their text into the matching bucket — confirm
        # against the live page structure.
        for h2 in soup_2.find_all('h2'):
            heading = h2.get_text().strip().lower()
            if heading not in ('overview', 'symptoms', 'causes'):
                continue
            bucket = {'overview': overview_temp,
                      'symptoms': symptoms_temp,
                      'causes': causes_temp}[heading]
            curr_tag = h2.find_next_sibling()
            while curr_tag is not None and curr_tag.name != 'h2':
                bucket.append(curr_tag.get_text())
                curr_tag = curr_tag.find_next_sibling()

        data.append({ailment.strip().replace('\n', ''): {
            "overview": "".join(overview_temp),
            "symptoms": ",".join(symptoms_temp),
            "causes": ",".join(causes_temp)}})
        total_scrapped += 1

# Persist the scraped dataset. Original had two bugs here: the file handle
# was misspelled (`wirte_data`) and `pickle` was capitalized.
with open('dataset2', 'wb') as write_data:
    pickle.dump(data, write_data)

print("Total amount of ailments: ", total_scrapped)
print("Total amount of ailments skipped: ", total_skipped)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment