Skip to content

Instantly share code, notes, and snippets.

@nuhil
Created July 15, 2020 02:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nuhil/7668fd6a7755ab14864a5ffb33bf51ca to your computer and use it in GitHub Desktop.
Save nuhil/7668fd6a7755ab14864a5ffb33bf51ca to your computer and use it in GitHub Desktop.
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
# --- Step 1: harvest thread links from the paginated listing ---------------

# Path to the chromedriver binary, looked up in the current working directory.
chrome_driver = os.path.join(os.getcwd(), "chromedriver")

# Thread hrefs collected from every listing page, in crawl order.
urls = []
# Number of listing pages to walk (page numbers 1..pages inclusive).
pages = 100

# One browser is reused for every listing page instead of launching a new
# Chrome per page.
# NOTE(review): newer Selenium releases replace ``executable_path`` with a
# Service object -- confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=chrome_driver)
try:
    for page in range(1, pages + 1):
        driver.get('https://www.medhelp.org/user_posts/list/526144?page=' + str(page))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for div in soup.find_all('div', {'class': 'subject_title'}):
            link = div.find('a')
            # Skip title blocks without a usable anchor rather than crashing
            # on ``None['href']`` / a missing href attribute.
            if link is not None and link.get('href'):
                urls.append(link['href'])
finally:
    # quit() (not close()) ends the WebDriver session, so the browser and
    # the chromedriver process are released even if a page load fails.
    driver.quit()

# Persist the harvested links, one per line.
with open("urls.txt", "w", encoding="utf-8") as f:
    f.writelines("%s\n" % url for url in urls)
# --- Step 2: download each thread and save its post and first reply --------

# Stripped text of each thread's 'subject_msg' block / first 'resp_body'
# block, index-aligned with ``urls``.
posts = []
comments = []

# Output folders; created up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
post_dir = "dataset/disclosure/"
comment_dir = "dataset/non-disclosure/"
os.makedirs(post_dir, exist_ok=True)
os.makedirs(comment_dir, exist_ok=True)

# ``chrome_driver`` and ``urls`` come from the listing crawl earlier in this
# script. One browser is reused for every thread.
# NOTE(review): newer Selenium releases replace ``executable_path`` with a
# Service object -- confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=chrome_driver)
try:
    for i, url in enumerate(urls):
        driver.get('https://www.medhelp.org' + str(url))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Zero-padded running index shared by both output files of a thread.
        stem = str(i).zfill(5)

        # Opening post: the element with id 'subject_msg'; empty string
        # when the page has none.
        post_node = soup.find('div', {'id': 'subject_msg'})
        content = post_node.text.strip() if post_node is not None else ""
        with open(post_dir + stem + "_post.txt", "w", encoding="utf-8") as f:
            f.write(content)
        posts.append(content)

        # First element with class 'resp_body'; empty string when the page
        # has none.
        reply_node = soup.find('div', {'class': 'resp_body'})
        comment = reply_node.text.strip() if reply_node is not None else ""
        with open(comment_dir + stem + "_comment.txt", "w", encoding="utf-8") as f:
            f.write(comment)
        comments.append(comment)
finally:
    # quit() (not close()) ends the WebDriver session, so the browser and
    # the chromedriver process are released even if a page load fails.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment