Forked from lobstrio/google_maps_scraping_selenium.py
Created
November 2, 2021 09:40
-
-
Save yazzou/d1b57a739719cc67486c14d77d2356d9 to your computer and use it in GitHub Desktop.
Collect all data from a Search URL on Google Maps 👋
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr
from selenium import webdriver | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.common.by import By | |
from selenium.common.exceptions import NoSuchElementException | |
import time | |
import csv | |
from lxml import html | |
class CrawlerGoogleMapsSelenium():
    """Scrape establishment data from a Google Maps search-results page.

    Drives a Chrome browser via Selenium: loads the search URL, accepts
    the cookie-consent dialog (French locale), collects every detail-page
    link, then visits each one and extracts name, category, review count,
    score, price level, address, phone number and website.
    """

    # Original hard-coded path, kept as the default for backward compatibility.
    DEFAULT_DRIVER_PATH = "/Users/sashabouloudnine/Downloads/chromedriver"

    def __init__(self, driver_path=DEFAULT_DRIVER_PATH):
        """Start a Chrome session.

        driver_path: filesystem path to the chromedriver binary.
        """
        self.driver = webdriver.Chrome(driver_path)

    def _find(self, xpath):
        """Return the first element matching *xpath*.

        Uses find_element(By.XPATH, ...), available on both Selenium 3
        and 4 — the find_element_by_* helpers were removed in Selenium 4.3.
        Raises NoSuchElementException when nothing matches.
        """
        return self.driver.find_element(By.XPATH, xpath)

    def _find_text(self, xpath):
        """Return the text of the first match, or '' when absent."""
        try:
            return self._find(xpath).text
        except NoSuchElementException:
            return ''

    def _find_attr(self, xpath, attr):
        """Return attribute *attr* of the first match, or '' when absent."""
        try:
            return self._find(xpath).get_attribute(attr)
        except NoSuchElementException:
            return ''

    def accept_cookies(self):
        """Click the French-locale cookie-consent button if it is present.

        The original code crashed when the dialog was absent, because
        find_element raises NoSuchElementException rather than returning
        a falsy value; treat "no dialog" as a no-op instead.
        """
        try:
            accept_button = self._find("//span[contains(text(), \"J\'accepte\")]")
        except NoSuchElementException:
            return
        self.driver.execute_script("arguments[0].scrollIntoView();", accept_button)
        time.sleep(2)
        accept_button.click()
        time.sleep(2)

    def iter_etabs(self, starting_url):
        """Yield one result dict per establishment linked from *starting_url*.

        Also prints each row tab-separated as it is scraped.
        """
        assert starting_url
        self.driver.get(starting_url)
        time.sleep(2)
        self.accept_cookies()
        WebDriverWait(self.driver, 20).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//div[contains(@jsaction, 'mouseover:pane')]")))
        time.sleep(2)
        anchors = self.driver.find_elements(
            By.XPATH, "//div[contains(@jsaction, 'mouseover:pane')]/a")
        # Collect all hrefs before navigating: leaving the results page
        # invalidates the WebElement handles.
        urls_list = [anchor.get_attribute('href') for anchor in anchors]
        for url in urls_list:
            assert url
            result_dict = self.get_etab(url)
            print('\t'.join(str(v) for v in result_dict.values()))
            yield result_dict

    def get_etab(self, url):
        """Scrape one establishment detail page and return its fields as a dict.

        Missing fields come back as '' rather than raising.
        """
        assert url
        print(url)
        self.driver.get(url)
        WebDriverWait(self.driver, 20).until(
            EC.visibility_of_element_located((By.XPATH, "//span[@jstcache=127]")))
        time.sleep(5)
        categorie = self._find_text('//button[@jsaction="pane.rating.category"]')
        reviews = self._find_text('//button[@jsaction="pane.rating.moreReviews"]')
        prix = self._find_text('//span[contains(@aria-label, "Prix:")]')
        nom = self._find_attr('//div[@role="main" and @aria-label]', 'aria-label')
        score = self._find_attr('//ol[@class="section-star-array"]', 'aria-label')
        if score:
            # Fix: the original called str.replace without keeping the result,
            # leaving the non-breaking space in place.
            score = score.replace('\xa0', ' ')
        adresse = self._find_attr('//button[@data-item-id="address"]', 'aria-label')
        adresse = adresse.replace('Adresse: ', '')
        telephone = self._find_attr(
            '//button[contains(@aria-label, "Numéro de téléphone:")]', 'aria-label')
        telephone = telephone.replace('Numéro de téléphone: ', '')
        website = self._find_attr(
            '//button[contains(@aria-label, "Site Web:")]', 'aria-label')
        website = website.replace('Site Web: ', '')
        return {
            'nom': nom,
            'categorie': categorie,
            'reviews': reviews,
            'score': score,
            'prix': prix,
            'adresse': adresse,
            'telephone': telephone,
            'website': website,
        }

    def main(self, url, output_path='googlemaps_20210803.csv'):
        """Crawl *url* and write every result to a tab-separated CSV file.

        output_path: destination file (default preserves the original name).
        """
        keys = ['nom', 'categorie', 'reviews', 'score', 'prix',
                'adresse', 'telephone', 'website']
        # newline='' is required by the csv module; utf-8 covers the
        # accented French field values.
        with open(output_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, delimiter='\t', fieldnames=keys)
            writer.writeheader()
            # Stream rows straight from the generator instead of
            # materializing the whole result list first.
            for etab in self.iter_etabs(url):
                writer.writerow(etab)
if __name__ == '__main__':
    # Example: all restaurants around central Marseille.
    SEARCH_URL = 'https://www.google.com/maps/search/restaurant+marseille/@43.2850096,5.3752173,14z'
    crawler = CrawlerGoogleMapsSelenium()
    crawler.main(SEARCH_URL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment