import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
basepath = '/Volumes/UNTITLED/wimoveis/anuncios/'
hrefs = pd.read_csv('hrefs.csv') # get URLs
hrefs = set(hrefs['href']) # remove duplicate URLs
for i, href in enumerate(hrefs):

    # get the ID of the listing
    id_anuncio = re.findall(r'[0-9]{1,20}\.html', href)[0].replace('.html', '')

    # if the listing has been downloaded before, skip it
    path = basepath + id_anuncio + '/'
    if os.path.exists(path):
        continue

    # get the source code of the listing; this
    # doesn't always work on the first try, so
    # wait 60s and try again if necessary;
    # this risks an infinite loop if the connection
    # never recovers, though in practice that didn't
    # happen (a bounded-retry variant is sketched
    # after the script)
    url = 'https://www.wimoveis.com.br' + href
    while True:
        try:
            response = requests.get(url)
            break
        except requests.exceptions.RequestException:
            print('error; waiting')
            time.sleep(60)

    # if it worked, move on
    if response.status_code == 200:
        print(i, path)
        os.mkdir(path)  # create destination directory
        html = response.text  # get source code

        # save the source code to file
        with open(path + 'anuncio_' + str(i) + '.html', mode='w') as f:
            f.write(html)

        # now the time-consuming part: getting
        # the pictures of the listing
        pic_path = path + 'pics/'
        os.mkdir(pic_path)  # create destination directory

        # find the URLs of the pictures
        soup = BeautifulSoup(html, 'html.parser')
        figures = soup.find_all('figure', class_='slide-content')
        links = [e.find('img')['data-flickity-lazyload'] for e in figures]

        # try downloading each picture
        for n, link in enumerate(links):
            while True:
                try:
                    response = requests.get(link, stream=True)
                    break
                except requests.exceptions.RequestException:
                    print('conn error; waiting')
                    time.sleep(60)

            # if it worked, save the picture to file
            if response.status_code == 200:
                with open(pic_path + str(n) + '.jpg', mode='wb') as f:
                    for chunk in response:
                        f.write(chunk)
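
# One possible way to remove the infinite-loop risk noted above is to cap
# the number of retries and back off exponentially between attempts. This
# is only a sketch, not part of the original script; the names
# 'get_with_retries', 'max_tries', and 'backoff' are hypothetical, chosen
# here for illustration.
def get_with_retries(url, max_tries=5, backoff=60, **kwargs):
    for attempt in range(max_tries):
        try:
            return requests.get(url, **kwargs)
        except requests.exceptions.RequestException:
            print('error; waiting (attempt %d of %d)' % (attempt + 1, max_tries))
            time.sleep(backoff * (2 ** attempt))
    raise RuntimeError('giving up on ' + url)

# usage, at the same call sites as above:
# response = get_with_retries(url)
# response = get_with_retries(link, stream=True)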