# Source: GitHub Gist cristianmiranda/a169e00d75207d2e7bb507a4b653dc02
# Author: @cristianmiranda (gist last active July 11, 2017 18:39)
import sys
import time
import csv
from sys import stdout
from selenium import webdriver
from selenium.webdriver import FirefoxProfile
# Shell setup notes, kept as an unused string literal (never executed).
# Presumably these PATH entries let Selenium find geckodriver and Firefox
# on the author's macOS machine -- confirm for your own environment.
'''
# SELENIUM - GECKO
export PATH=$PATH:'/Applications/Geckodriver'
export PATH=$PATH:'/Applications/Firefox.app/Contents/MacOS'
'''
# Path of the CSV file that main() writes the scraped rows to.
CSV_FILE = '/Users/cristianmiranda/Desktop/inmobiliarias.csv'
# Listing URL template; main() fills {0} with the loop's page index.
BASE_URL = 'https://inmobiliarias.enbuenosaires.com.ar/bienes-raices/capital-federal.html#summaryResults%3AorderBySelector=id~desc&current_page={0}'
# Shared Selenium WebDriver: assigned in main(), read by the wait helpers.
driver = None
def wait_until_visible_by_xpath(xpath, max_retries=20, poll_interval=0.2):
    """Poll the page until an element matching ``xpath`` can be located.

    Despite the name, only presence is checked: the wait ends as soon as
    ``driver.find_element_by_xpath`` stops raising. Whether the element is
    actually displayed is not inspected.

    Progress is echoed to stdout on a single, carriage-return-rewritten
    line, which is terminated with a newline when the wait ends.

    Args:
        xpath: XPath expression passed to the module-level ``driver``.
        max_retries: Extra lookups allowed after the first one fails
            (the defaults reproduce the original 21 lookups).
        poll_interval: Seconds to sleep after every failed lookup.

    Returns:
        True as soon as a lookup succeeds, False when all lookups failed.
    """
    try:
        for attempt in range(max_retries + 1):
            # Report real elapsed time; the poll counter alone is not seconds.
            elapsed = attempt * poll_interval
            stdout.write("\r- Elapsed: {0:.1f} seconds - {1}".format(elapsed, xpath))
            stdout.flush()
            try:
                driver.find_element_by_xpath(xpath)
            except Exception:
                # Best effort: any lookup error means "not there yet".
                # ``Exception`` (not a bare except) keeps Ctrl-C working.
                time.sleep(poll_interval)
            else:
                return True
        return False
    finally:
        # Always finish the progress line, whichever way the wait ended.
        stdout.write("\r\n")
        stdout.flush()
def wait_until_hidden_by_id(id, max_retries=16, poll_interval=0.5):
    """Poll until the element with the given DOM id is no longer displayed.

    The wait ends when ``is_displayed()`` reports False or when the retry
    budget is exhausted. A pause now follows every unsuccessful poll.
    Previously the pause only happened when the lookup raised, so a loader
    that stayed visible was polled in a tight loop and the wait expired
    almost at once.

    Progress is echoed to stdout on a single, carriage-return-rewritten
    line, which is terminated with a newline when the wait ends.

    Args:
        id: DOM id looked up through the module-level ``driver``.
            (The name shadows the builtin but is kept for compatibility.)
        max_retries: Extra polls allowed after the first one.
        poll_interval: Seconds to sleep between polls.

    Returns:
        True if the element was seen hidden, False if the wait timed out.
    """
    hidden = False
    for attempt in range(max_retries + 1):
        # Report real elapsed time; the poll counter alone is not seconds.
        elapsed = attempt * poll_interval
        stdout.write("\r- Elapsed: {0:.1f} seconds - {1}".format(elapsed, id))
        stdout.flush()
        try:
            hidden = not driver.find_element_by_id(id).is_displayed()
        except Exception:
            # As in the original, a failed lookup is not treated as
            # "hidden"; polling simply continues until the budget runs out.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            hidden = False
        if hidden:
            break
        if attempt < max_retries:
            time.sleep(poll_interval)
    stdout.write("\r\n")
    stdout.flush()
    return hidden
def _read_contact_fields():
    """Return ``[name, tel, email, site]`` read from the current detail page.

    The values are the ``innerText`` of the first four ``<dd>`` siblings of
    the ``<dt>`` containing 'Telefono'; missing entries become 'N/A'.
    """
    dd_xpath = "*//dt[contains(., 'Telefono')]/../dd"
    available = len(driver.find_elements_by_xpath(dd_xpath))
    fields = []
    for position in range(1, 5):
        if available >= position:
            cell = driver.find_element_by_xpath("{0}[{1}]".format(dd_xpath, position))
            fields.append(cell.get_attribute("innerText"))
        else:
            fields.append('N/A')
    return fields


def _scrape_listing_page(writer, csvfile, page):
    """Visit every agency linked from listing page ``page`` and write one CSV row each."""
    link_xpath = "//*[@class='businessheader']/a"
    contact_xpath = "*//a[contains(., 'Ver Datos de Contacto')]"
    page_url = BASE_URL.format(page)

    driver.get(page_url)
    wait_until_hidden_by_id("loading")
    # The original waited twice in a row; a second wait is only useful
    # when the first one timed out.
    if not wait_until_visible_by_xpath(link_xpath):
        wait_until_visible_by_xpath(link_xpath)
    total = len(driver.find_elements_by_xpath(link_xpath))

    for index in xrange(total):
        # The listing is reloaded after each agency, so earlier element
        # handles are stale and the links must be looked up again.
        links = driver.find_elements_by_xpath(link_xpath)
        if index >= len(links):
            # The reloaded listing is shorter than when it was counted.
            break
        links[index].click()
        wait_until_hidden_by_id("loading")

        if wait_until_visible_by_xpath(contact_xpath):
            driver.find_element_by_xpath(contact_xpath).click()
            wait_until_visible_by_xpath("*//dt[contains(., 'Telefono')]")

        # NOTE(review): the source's indentation was lost; this assumes a
        # row (with 'N/A' fields) is written even without a contact link.
        writer.writerow([page] + _read_contact_fields())
        # Flush per row so a crash mid-run keeps what was scraped, as the
        # original per-row reopen of the file did.
        csvfile.flush()

        driver.get(page_url)
        wait_until_hidden_by_id("loading")


def main():
    """Scrape agency contact data from every listing page into ``CSV_FILE``.

    Starts Firefox through Selenium, reads the page count from the
    pagination link, then visits each listing page and each agency on it.
    One ';'-separated row (page, name, tel, email, site) is written per
    agency.

    Changes from the original:
      * The header row uses the same ';' delimiter as the data rows
        (it was written with ',').
      * The CSV file is opened once instead of being reopened in append
        mode for every row.
      * The browser is shut down even when scraping raises.

    This is Python 2 code (``reload(sys)``, ``setdefaultencoding``,
    ``xrange``, binary-mode csv file).
    """
    global driver

    profile = FirefoxProfile()
    profile.set_preference("browser.download.panel.shown", False)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.helperApps.neverAsk.openFile", "text/csv")
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.lastDir", "/Users/cristianmiranda/Downloads/DKT")
    driver = webdriver.Firefox(firefox_profile=profile)

    try:
        # Python 2 only: make utf-8 the implicit codec so non-ASCII text
        # scraped from the page can be written by the csv module.
        reload(sys)
        sys.setdefaultencoding('utf-8')

        # Access main page
        driver.get("https://inmobiliarias.enbuenosaires.com.ar/bienes-raices/capital-federal.html")
        wait_until_visible_by_xpath("*//nav/ul/li[9]/a")
        pages = driver.find_element_by_xpath("*//nav/ul/li[9]/a").get_attribute("text")
        # Parenthesised single argument prints the same on Python 2.
        print("Pages: {0}".format(pages))

        with open(CSV_FILE, 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=';')
            writer.writerow(['page', 'name', 'tel', 'email', 'site'])
            csvfile.flush()

            # Agencies ("inmobiliarias"), one listing page at a time
            for page in xrange(0, int(pages)):
                _scrape_listing_page(writer, csvfile, page)
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# End of gist. Comments on this gist are hosted on GitHub (sign-in required).