Created
June 15, 2019 14:26
-
-
Save thiagomarzagao/fd21b8e2bca553f90485ae515b6edbb2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape wimoveis.com.br real-estate listings: for each listing URL in
# hrefs.csv, save the page HTML and every listing picture under basepath.
basepath = '/Volumes/UNTITLED/wimoveis/anuncios/'

MAX_RETRIES = 30  # bound the retry loops so a dead connection cannot hang forever
RETRY_WAIT = 60   # seconds to wait between attempts


def _get_with_retry(url, **kwargs):
    """GET *url*, retrying on connection errors up to MAX_RETRIES times.

    Returns the requests.Response, or None if every attempt failed.
    Catches only requests' own exceptions (the original bare ``except:``
    also swallowed KeyboardInterrupt, making the script unkillable while
    sleeping).
    """
    for _ in range(MAX_RETRIES):
        try:
            # explicit timeout: without it a stalled socket blocks forever
            return requests.get(url, timeout=60, **kwargs)
        except requests.exceptions.RequestException:
            print('error; waiting')
            time.sleep(RETRY_WAIT)
    return None


hrefs = pd.read_csv('hrefs.csv')  # get URLs
hrefs = set(hrefs['href'])        # remove duplicate URLs

for i, href in enumerate(hrefs):
    # get ID of the listing from the trailing '<digits>.html' in the URL
    id_anuncio = re.findall(r'[0-9]{1,20}\.html', href)[0].replace('.html', '')
    # if listing has been downloaded before, ignore
    path = basepath + id_anuncio + '/'
    if os.path.exists(path):
        continue
    # fetch the listing page (bounded retries instead of a possibly
    # infinite while-True loop)
    url = 'https://www.wimoveis.com.br' + href
    response = _get_with_retry(url)
    # if it worked, move on
    if response is None or response.status_code != 200:
        continue
    print(i, path)
    os.mkdir(path)        # create destination directory
    html = response.text  # get source code
    # save source code to file
    with open(path + 'anuncio_' + str(i) + '.html', mode='w') as f:
        f.write(html)
    # now the time-consuming part: getting the pictures of the listing
    pic_path = path + 'pics/'
    os.mkdir(pic_path)  # create destination directory
    # find URLs of the pictures; name the parser explicitly so bs4 does
    # not warn and the choice does not vary across installations
    soup = BeautifulSoup(html, 'html.parser')
    figures = soup.find_all('figure', class_='slide-content')
    links = [e.find('img')['data-flickity-lazyload'] for e in figures]
    # try downloading each picture
    for n, link in enumerate(links):
        response = _get_with_retry(link, stream=True)
        # if it worked, save picture to file
        if response is not None and response.status_code == 200:
            with open(pic_path + str(n) + '.jpg', mode='wb') as f:
                for chunk in response:
                    f.write(chunk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment