Created
August 10, 2020 23:31
-
-
Save shinysu/1a5a91dd3bb21910c9a1bb1291815081 to your computer and use it in GitHub Desktop.
Script to extract questions from Quora.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Number of Page Down key presses sent to the page before questions are read.
TOTAL_PAGES = 100
# Location of the chromedriver binary handed to Selenium.
# NOTE(review): machine-specific absolute path — adjust for your environment.
CHROME_DRIVER_PATH = '/Users/shinysuresh/Documents/chromedriver-5'
# Quora search page that gets opened and scrolled.
url = 'https://www.quora.com/search?q=engineering+colleges+hostel+Chennai'
# CSV file that the extracted questions are appended to.
# NOTE(review): machine-specific absolute path — adjust for your environment.
OUTPUT_FILE = '/Users/shinysuresh/Projects/StudentQuestions/quora_questions.csv'
def write_csv_file(rows, output_file=None):
    """Append each item of ``rows`` to a CSV file as a single-cell row.

    Args:
        rows: Iterable of values; every value becomes its own one-column row.
        output_file: Path of the CSV file to append to. When omitted, the
            module-level ``OUTPUT_FILE`` is used.
    """
    path = OUTPUT_FILE if output_file is None else output_file
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows([row] for row in rows)
def scrap_page(url):
    """Scroll a page in Chrome and save the question texts found on it.

    Opens ``url`` in a Chrome session started from ``CHROME_DRIVER_PATH``,
    sends ``TOTAL_PAGES`` Page Down key presses (pausing 0.5 s after each),
    then collects the non-empty text of every
    ``<span class="ui_qtext_rendered_qtext">`` element, prints the resulting
    list and appends it to the CSV file via ``write_csv_file``.

    Args:
        url: Address of the page to open.
    """
    # NOTE(review): ``executable_path`` is kept from the original call; newer
    # Selenium releases expect a Service object instead — confirm against the
    # installed Selenium version.
    browser = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)
    try:
        browser.get(url)
        html = browser.find_element(By.TAG_NAME, 'html')
        for _ in range(TOTAL_PAGES):
            html.send_keys(Keys.PAGE_DOWN)
            # Presumably gives lazily loaded results time to render before
            # the next key press.
            time.sleep(0.5)
        all_ans = browser.find_elements(
            By.XPATH, "//span[@class='ui_qtext_rendered_qtext']"
        )
        # Read ``.text`` once per element: each access is a WebDriver call.
        texts = (span.text for span in all_ans)
        questions = [text for text in texts if text]
    finally:
        # Always shut the browser down so no Chrome/chromedriver process is
        # left behind, even when navigation or scraping raises.
        browser.quit()
    print(questions)
    write_csv_file(questions)
# Script entry: scrape the configured search page as soon as the file runs.
scrap_page(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment