Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Created June 15, 2019 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thiagomarzagao/fd21b8e2bca553f90485ae515b6edbb2 to your computer and use it in GitHub Desktop.
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Download every listing in hrefs.csv from wimoveis.com.br: the listing's
# HTML goes to <basepath>/<listing id>/anuncio_<i>.html and its pictures to
# <basepath>/<listing id>/pics/<n>.jpg.
#
# NOTE: a listing whose directory already exists is skipped, so a run that
# is interrupted halfway through a listing leaves that listing incomplete;
# delete its directory to have it downloaded again.
basepath = '/Volumes/UNTITLED/wimoveis/anuncios/'

RETRY_WAIT_SECONDS = 60       # pause before retrying a failed request
REQUEST_TIMEOUT_SECONDS = 30  # give up on a stalled request and retry


def _get_with_retry(url, message, **kwargs):
    """Issue a GET request, retrying until the server answers.

    Network failures (connection errors, timeouts, ...) are retried
    indefinitely, waiting RETRY_WAIT_SECONDS between attempts and printing
    `message` each time. A malformed URL can never succeed, so it is
    reported and not retried.

    Args:
        url: address to fetch.
        message: text printed before each wait.
        **kwargs: extra keyword arguments forwarded to requests.get.

    Returns:
        The requests.Response (whatever its status code), or None if the
        URL itself is invalid.
    """
    while True:
        try:
            return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS, **kwargs)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.InvalidSchema,
                requests.exceptions.InvalidURL) as err:
            print('invalid URL; skipping', url, err)
            return None
        except requests.exceptions.RequestException:
            # only request failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of being swallowed
            print(message)
            time.sleep(RETRY_WAIT_SECONDS)


hrefs = pd.read_csv('hrefs.csv')  # get URLs
hrefs = set(hrefs['href'].dropna())  # remove empty and duplicate URLs

for i, href in enumerate(hrefs):

    # get ID of the listing (the digits right before '.html')
    match = re.search(r'([0-9]{1,20})\.html', href)
    if match is None:
        print('no listing ID in href; skipping', href)
        continue
    id_anuncio = match.group(1)

    # if listing has been downloaded before, ignore
    path = basepath + id_anuncio + '/'
    if os.path.exists(path):
        continue

    # get the source code of the listing; doesn't always work on the
    # first try, so wait and try again if necessary
    url = 'https://www.wimoveis.com.br' + href
    response = _get_with_retry(url, 'error; waiting')
    if response is None or response.status_code != 200:
        continue

    print(i, path)
    os.mkdir(path)  # create destination directory
    html = response.text  # get source code

    # save source code to file; explicit encoding so the result does not
    # depend on the platform's default
    with open(path + 'anuncio_' + str(i) + '.html', mode='w',
              encoding='utf-8') as f:
        f.write(html)

    # now the time-consuming part: getting the pictures of the listing
    pic_path = path + 'pics/'
    os.mkdir(pic_path)  # create destination directory

    # find URLs of the pictures; figures without an <img> or without the
    # lazy-load attribute are ignored instead of aborting the whole run
    soup = BeautifulSoup(html, 'html.parser')
    figures = soup.find_all('figure', class_='slide-content')
    links = []
    for figure in figures:
        img = figure.find('img')
        link = img.get('data-flickity-lazyload') if img is not None else None
        if link:
            links.append(link)

    # try downloading each picture
    for n, link in enumerate(links):
        picture = _get_with_retry(link, 'conn error; waiting', stream=True)
        if picture is None:
            continue
        try:
            # if it worked, save picture to file
            if picture.status_code == 200:
                with open(pic_path + str(n) + '.jpg', mode='wb') as f:
                    for chunk in picture.iter_content(chunk_size=8192):
                        f.write(chunk)
        finally:
            # streamed responses hold their connection until closed
            picture.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment