Last active
April 16, 2021 08:05
-
-
Save garrettmojica/e5e4327358de79836d098a2b89103e71 to your computer and use it in GitHub Desktop.
Using Python to scrape beyond Google’s 4 initial “People also ask” Questions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium.common.exceptions import NoSuchElementException | |
from selenium.webdriver.chrome.options import Options | |
from selenium import webdriver | |
from bs4 import BeautifulSoup | |
import xlsxwriter | |
import datetime | |
import time | |
import os | |
def returnChromeDriver(pathToChromeDriver):
    """Create and return a Chrome WebDriver configured for scraping.

    Args:
        pathToChromeDriver: Filesystem path to the chromedriver executable.

    Returns:
        A ``selenium.webdriver.Chrome`` instance. No custom user agent is
        applied; the browser's default is used.
    """
    chrome_options = Options()
    # Content-setting prefs: JavaScript and images are set to 1, stylesheets
    # to 2 (in Chrome's content-settings scheme 1 = allow, 2 = block).
    chrome_options.add_experimental_option(
        "prefs",
        {
            'profile.managed_default_content_settings.javascript': 1,
            'profile.managed_default_content_settings.images': 1,
            'profile.managed_default_content_settings.stylesheet': 2,
        },
    )
    chromedriver = pathToChromeDriver
    os.environ["webdriver.chrome.driver"] = chromedriver
    # `options=` replaces the deprecated `chrome_options=` keyword, which
    # triggers "DeprecationWarning: use options instead of chrome_options".
    driver = webdriver.Chrome(chromedriver, options=chrome_options)
    return driver
def returnSearchUrl(question):
    """Build the Google search URL for a question.

    The question is lower-cased and URL-encoded as a query-string value:
    spaces become ``+`` and every reserved character (``?``, ``'``, ``&``,
    ``#``, ``+``, ``%`` ...) is percent-encoded, so it cannot break or
    truncate the query.

    Args:
        question: The search phrase as plain text.

    Returns:
        The full ``https://www.google.com/search?q=...`` URL as a string.
    """
    # Local import with a fallback keeps the file runnable on Python 2 and 3.
    try:
        from urllib.parse import quote_plus
    except ImportError:  # Python 2
        from urllib import quote_plus

    baseGoogleQuery = "https://www.google.com/search?q="
    return baseGoogleQuery + quote_plus(question.lower())
def clickQuestions(driver, question, totalClicks):
    """Load the search page for a question and expand related questions.

    Each click on a ``div.related-question-pair`` element is followed by a
    one-second pause and a fresh lookup of the question elements, so that
    any elements added to the page after the click are picked up.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        question: The search phrase to look up.
        totalClicks: Maximum number of related questions to click.

    Returns:
        The same ``driver``, left on the (expanded) results page.
    """
    # Local import: the file's top-level imports do not include this name.
    from selenium.common.exceptions import ElementClickInterceptedException

    selector = 'div.related-question-pair'
    driver.get(returnSearchUrl(question))
    time.sleep(1)

    questions = driver.find_elements_by_css_selector(selector)
    if not questions:
        return driver

    # Accessing this property scrolls the element into view as a side effect.
    questions[-1].location_once_scrolled_into_view

    questionIndex = 0
    # Stop early when the page offers fewer questions than totalClicks
    # instead of raising IndexError.
    while questionIndex < totalClicks and questionIndex < len(questions):
        target = questions[questionIndex]
        try:
            target.click()
        except ElementClickInterceptedException:
            # Another element covers the target; click it through JavaScript.
            driver.execute_script("arguments[0].click();", target)
        time.sleep(1)

        # Re-query: the list may have grown after the click.
        questions = driver.find_elements_by_css_selector(selector)
        questionIndex = questionIndex + 1
        if questionIndex < len(questions):
            questions[questionIndex].location_once_scrolled_into_view
    return driver
def extractQuestionData(soup):
    """Extract one record per ``div.related-question-pair`` in the page.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.

    Returns:
        A list of dicts with the keys ``relatedQuestion``, ``titleTag``,
        ``titleTagLength``, ``answer``, ``answerLength`` and ``questionUrl``.
        Pieces that cannot be found are filled with the placeholder strings
        ``"N/A - ERROR?"``, ``"N/A"`` or ``"N/A - ERROR"`` rather than
        raising, so one malformed entry does not abort the whole page.
    """
    questionList = []
    for question in soup.find_all("div", class_="related-question-pair"):
        questionDict = {}
        expander = question.find("g-accordion-expander")

        # Question text: first <div> inside the accordion expander.
        firstDiv = expander.find("div") if expander is not None else None
        if firstDiv is not None:
            questionDict['relatedQuestion'] = firstDiv.text
        else:
            questionDict['relatedQuestion'] = "N/A - ERROR?"

        # Title: text of the first <h3>.
        title = question.find("h3")
        if title is not None:
            questionDict['titleTag'] = title.text
            questionDict['titleTagLength'] = len(questionDict['titleTag'])
        else:
            questionDict['titleTag'] = "N/A - ERROR?"
            questionDict['titleTagLength'] = "N/A - ERROR?"

        # Answer: prefer the role="heading" div, else the third <div> inside
        # the expander.
        heading = question.find("div", {"role": "heading"})
        if heading is not None:
            questionDict['answer'] = heading.text
            questionDict['answerLength'] = len(questionDict['answer'])
        else:
            try:
                questionDict['answer'] = expander.find_all("div")[2].text
                questionDict['answerLength'] = len(questionDict['answer'])
            except (AttributeError, IndexError):
                # No expander at all, or fewer than three nested divs.
                questionDict['answer'] = "N/A"
                questionDict['answerLength'] = "N/A"

        # URL: href of the first <a> inside div.r.
        linkContainer = question.find("div", class_="r")
        link = linkContainer.find("a") if linkContainer is not None else None
        href = link.get('href') if link is not None else None
        if href is not None:
            questionDict['questionUrl'] = href
        else:
            questionDict['questionUrl'] = "N/A - ERROR"

        questionList.append(questionDict)
    return questionList
def writeExcelFile(allExtractedDataList):
    """Write all extracted question data to a timestamped Excel workbook.

    The file is named ``data-<Mon-DD-YYYY-HH-MM-SS>.xlsx`` and holds a single
    "Data" worksheet: a header row followed by one row per related question.

    Args:
        allExtractedDataList: List of dicts, each with ``initialQuestion``
            and ``relatedQuestionData`` (a list of per-question dicts).
    """
    headerLabels = (
        "Initial Question",
        "Related Question",
        "Title Tag",
        "Title Tag Length",
        "Answer",
        "Answer Length",
        "Question URL",
    )
    # Record keys in the order of columns 1..6 (column 0 is the seed question).
    fieldKeys = (
        'relatedQuestion',
        'titleTag',
        'titleTagLength',
        'answer',
        'answerLength',
        'questionUrl',
    )

    timestamp = datetime.datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
    fileName = "data-{}.xlsx".format(timestamp)
    workbook = xlsxwriter.Workbook(fileName, {'strings_to_urls': True})
    sheet = workbook.add_worksheet("Data")

    for column, label in enumerate(headerLabels):
        sheet.write(0, column, label)

    nextRow = 1
    for entry in allExtractedDataList:
        for related in entry['relatedQuestionData']:
            sheet.write(nextRow, 0, entry['initialQuestion'])
            for column, key in enumerate(fieldKeys, 1):
                sheet.write(nextRow, column, related[key])
            nextRow += 1

    workbook.close()
# Script configuration: seed questions, chromedriver location, and how many
# related questions to click per seed question.
questions = ["Who do you think you are?", "Where do you get the nerve?", "What's your sob story?"]
pathToChromeDriver = "/path/to/chromedriver"
totalClicks = 10

driver = returnChromeDriver(pathToChromeDriver)
allExtractedDataList = []
try:
    for question in questions:
        driver = clickQuestions(driver, question, totalClicks)
        if driver.find_elements_by_css_selector('div.related-question-pair'):
            soup = BeautifulSoup(driver.page_source, "lxml")
            extractedData = extractQuestionData(soup)
            currentQuestionDict = {'initialQuestion': question, 'relatedQuestionData': extractedData}
            allExtractedDataList.append(currentQuestionDict)
        else:
            # Function-call form works on both Python 2 and Python 3.
            print("No Questions Found For: " + question)
finally:
    # Always shut the browser down, even if scraping raised.
    driver.quit()
writeExcelFile(allExtractedDataList)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @garrettmojica
I tried this script recently and I'm getting the errors below. I'm not sure what exactly these errors are. Can you please help me with this?
Warning (from warnings module):
File "D:\Python Tasks\PAA Extract.py", line 16
driver = webdriver.Chrome(chromedriver, chrome_options = chrome_options)
DeprecationWarning: use options instead of chrome_options
Traceback (most recent call last):
File "D:\Python Tasks\PAA Extract.py", line 111, in
driver = clickQuestions(driver, question, totalClicks)
File "D:\Python Tasks\PAA Extract.py", line 35, in clickQuestions
questions[questionIndex].click()
File "C:\Users\laptop\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webelement.py", line 80, in click
self._execute(Command.CLICK_ELEMENT)
File "C:\Users\laptop\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webelement.py", line 633, in _execute
return self._parent.execute(command, params)
File "C:\Users\laptop\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "C:\Users\laptop\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted: Element
(Session info: chrome=89.0.4389.82)
Thank you,
Suri