Script to scrape disease data from the NHS conditions index.
from bs4 import BeautifulSoup
import requests
import pickle

url = 'https://www.nhs.uk/conditions'
base_url = 'https://www.nhs.uk'

# Fetch the A-Z index of conditions.
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')

data = []
lis = soup.find_all('li', {'class': 'nhsuk-list-panel__item'})

for li in lis:
    # Each list item links to a condition page.
    temp_lis = li.findChild('a')
    link_lis = temp_lis.get_attribute_list('href')[0]
    text_lis = temp_lis.get_text()
    sub_url = base_url + link_lis

    # Fetch the condition page and collect the text of each section.
    sub_req = requests.get(sub_url)
    sub_soup = BeautifulSoup(sub_req.content, 'html.parser')
    sections = sub_soup.find_all('section')

    section_arr_data = []
    for section in sections:
        section_data = section.get_text()
        try:
            # Use the section's anchor id as a label when one is present.
            section_id = section.find('a').get_attribute_list('id')[0]
            section_arr_data.append({'id': section_id, 'data': section_data})
        except AttributeError:
            # Sections without an anchor are filed under 'misc'.
            section_arr_data.append({'id': 'misc', 'data': section_data})

    data.append((text_lis, sub_url, section_arr_data))

# Persist the scraped data as a pickle file.
with open('nhsdataset', 'wb') as write_data:
    pickle.dump(data, write_data)
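
To inspect the scraped dataset later, a minimal loading sketch (assuming the 'nhsdataset' pickle file produced by the script above):

# Example: load the pickled dataset and print a quick summary.
import pickle

with open('nhsdataset', 'rb') as f:
    data = pickle.load(f)

# Each entry is a (condition name, page URL, list of {'id', 'data'} sections) tuple.
for name, page_url, sections in data[:3]:
    print(name, page_url, len(sections))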