# larssono/scrape_diseases.py

## scrape_diseases.py
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPError
import string
import time
import random

# Index page URL template; %s is replaced by an uppercase ASCII letter.
url = 'https://www.mayoclinic.org/diseases-conditions/index?letter=%s'
# Destination file: one scraped link text per line.
output = 'diseases.txt'

# Fetch the index page for every letter A-Z and write the text of the first
# link found in each entry of the page's second <ol> to `output`.
# The `with` block closes the file even if a request or the parsing raises.
with open(output, 'w') as fp:
    for letter in string.ascii_uppercase:
        letter_url = url % letter
        print(letter, letter_url)
        # Random pause of up to one second between letters
        # (presumably to avoid hammering the server).
        time.sleep(random.random())

        # Retry until the page downloads. After an HTTP error, wait a random
        # delay of up to 15 seconds before trying again.
        # NOTE: there is no retry limit, so a permanent HTTP error keeps this
        # loop running forever.
        while True:
            try:
                # The context manager closes the HTTP response once it is read.
                with urlopen(letter_url) as page:
                    html = page.read().decode("utf-8")
                break
            except HTTPError as e:
                dt = random.random() * 15
                print(e, 'sleeping', dt)
                time.sleep(dt)

        soup = BeautifulSoup(html, "html.parser")
        # The second <ol> on the page is used as the list of entries; an
        # IndexError is raised if the page has fewer than two <ol> elements.
        # find_all(True, recursive=False) yields only the direct child *tags*,
        # skipping the whitespace/text nodes between them (text nodes have no
        # tag-style find() and no .text attribute).
        for elem in soup.find_all('ol')[1].find_all(True, recursive=False):
            link = elem.find('a')
            # find() returns None when the entry contains no <a>; skip those.
            if link is not None:
                fp.write('%s\n' % link.text)

# The file created has duplicated rows for alternative names. For example,
# both of these lines appear:
#   "Uterine cancer, also known asEndometrial cancer" and "Endometrial cancer".
# All lines that contain the phrase ", also known as" can be thrown out.
# This is done with: grep -v ", also known as" diseases.txt > diseases_list.txt
# Also need to add healthy_control.
	from bs4 import BeautifulSoup
	from urllib.request import urlopen, HTTPError
	import string
	import time
	import random

	# Index page URL template; %s is replaced by an uppercase ASCII letter.
	url = 'https://www.mayoclinic.org/diseases-conditions/index?letter=%s'
	# Destination file: one scraped link text per line.
	output = 'diseases.txt'

	# Fetch the index page for every letter A-Z and write the text of the first
	# link found in each entry of the page's second <ol> to `output`.
	# The `with` block closes the file even if a request or the parsing raises.
	with open(output, 'w') as fp:
	    for letter in string.ascii_uppercase:
	        letter_url = url % letter
	        print(letter, letter_url)
	        # Random pause of up to one second between letters
	        # (presumably to avoid hammering the server).
	        time.sleep(random.random())

	        # Retry until the page downloads. After an HTTP error, wait a random
	        # delay of up to 15 seconds before trying again.
	        # NOTE: there is no retry limit, so a permanent HTTP error keeps this
	        # loop running forever.
	        while True:
	            try:
	                # The context manager closes the HTTP response once it is read.
	                with urlopen(letter_url) as page:
	                    html = page.read().decode("utf-8")
	                break
	            except HTTPError as e:
	                dt = random.random() * 15
	                print(e, 'sleeping', dt)
	                time.sleep(dt)

	        soup = BeautifulSoup(html, "html.parser")
	        # The second <ol> on the page is used as the list of entries; an
	        # IndexError is raised if the page has fewer than two <ol> elements.
	        # find_all(True, recursive=False) yields only the direct child *tags*,
	        # skipping the whitespace/text nodes between them (text nodes have no
	        # tag-style find() and no .text attribute).
	        for elem in soup.find_all('ol')[1].find_all(True, recursive=False):
	            link = elem.find('a')
	            # find() returns None when the entry contains no <a>; skip those.
	            if link is not None:
	                fp.write('%s\n' % link.text)

	# The file created has duplicated rows for alternative names. For example,
	# both of these lines appear:
	#   "Uterine cancer, also known asEndometrial cancer" and "Endometrial cancer".
	# All lines that contain the phrase ", also known as" can be thrown out.
	# This is done with: grep -v ", also known as" diseases.txt > diseases_list.txt
	# Also need to add healthy_control.