Created
June 15, 2019 14:26
-
-
Save thiagomarzagao/fd21b8e2bca553f90485ae515b6edbb2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape wimoveis.com.br real-estate listings: for each listing URL in
# hrefs.csv, save the page HTML and every listing picture under basepath.
basepath = '/Volumes/UNTITLED/wimoveis/anuncios/'

MAX_RETRIES = 30  # bound the retry loops so a dead connection cannot hang forever
RETRY_WAIT = 60   # seconds to wait between attempts


def _get_with_retry(url, **kwargs):
    """GET *url*, retrying on connection errors up to MAX_RETRIES times.

    Returns the requests.Response, or None if every attempt failed.
    Catches only requests' own exceptions (the original bare ``except:``
    also swallowed KeyboardInterrupt, making the script unkillable while
    sleeping).
    """
    for _ in range(MAX_RETRIES):
        try:
            # explicit timeout: without it a stalled socket blocks forever
            return requests.get(url, timeout=60, **kwargs)
        except requests.exceptions.RequestException:
            print('error; waiting')
            time.sleep(RETRY_WAIT)
    return None


hrefs = pd.read_csv('hrefs.csv')  # get URLs
hrefs = set(hrefs['href'])        # remove duplicate URLs

for i, href in enumerate(hrefs):
    # get ID of the listing from the trailing '<digits>.html' in the URL
    id_anuncio = re.findall(r'[0-9]{1,20}\.html', href)[0].replace('.html', '')
    # if listing has been downloaded before, ignore
    path = basepath + id_anuncio + '/'
    if os.path.exists(path):
        continue
    # fetch the listing page (bounded retries instead of a possibly
    # infinite while-True loop)
    url = 'https://www.wimoveis.com.br' + href
    response = _get_with_retry(url)
    # if it worked, move on
    if response is None or response.status_code != 200:
        continue
    print(i, path)
    os.mkdir(path)        # create destination directory
    html = response.text  # get source code
    # save source code to file
    with open(path + 'anuncio_' + str(i) + '.html', mode='w') as f:
        f.write(html)
    # now the time-consuming part: getting the pictures of the listing
    pic_path = path + 'pics/'
    os.mkdir(pic_path)  # create destination directory
    # find URLs of the pictures; name the parser explicitly so bs4 does
    # not warn and the choice does not vary across installations
    soup = BeautifulSoup(html, 'html.parser')
    figures = soup.find_all('figure', class_='slide-content')
    links = [e.find('img')['data-flickity-lazyload'] for e in figures]
    # try downloading each picture
    for n, link in enumerate(links):
        response = _get_with_retry(link, stream=True)
        # if it worked, save picture to file
        if response is not None and response.status_code == 200:
            with open(pic_path + str(n) + '.jpg', mode='wb') as f:
                for chunk in response:
                    f.write(chunk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment