Simple Python 3.5.2 Marktplaats scraping example with Beautiful Soup 4
import requests
import urllib.parse
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

def crawlmp(base_url, parameters, add_to_url=''):
    """Crawl a Marktplaats search and return a list of ads.

    The parameters dictionary can contain:
        query[str],
        categoryId[int],
        searchOnTitleAndDescription[bool],
        startDateFrom[str]
    """
    default = {
        'query': '',
        'searchOnTitleAndDescription': 'true',
        'startDateFrom': 'always'
    }
    # merge the default args with the provided ones
    z = {**default, **parameters}
    urlargs = urllib.parse.urlencode(z)
    # build the search URL and fetch the first page
    url = "http://www.marktplaats.nl/{}?{}{}".format(base_url, urlargs, add_to_url)
    r = requests.get(url)
    # get the content of the page
    data = r.text
    # parse it into a DOM tree
    soup = BeautifulSoup(data, 'html.parser')
    # list to append results to in the loop below
    price = []
    # find out how many pages this query has
    if soup.find('span', class_="last") is None:
        pages = 1
    else:
        pages = int(soup.find('span', class_="last").text)
    amount_of_pages = range(0, pages)
    for i in amount_of_pages:
        current_page = {'currentPage': str(i + 1)}
        # Marktplaats needs an additional argument for the last page
        if i == amount_of_pages[-1]:
            current_page['lastPage'] = 'true'
        r = requests.get(url + '&' + urllib.parse.urlencode(current_page))
        data = r.text
        soup = BeautifulSoup(data, 'html.parser')
        # progress indicator
        print("req nr:", i)
        for ad in soup.find_all('article', class_='search-result'):
            try:
                price_tag = ad.find('span', class_='price-new').text.replace('.', '')
                # only keep ads with a numeric price (the comma check filters the rest out)
                if price_tag.find(',') != -1:
                    price.append({
                        'price': float(price_tag.split('€').pop().strip().replace(',', '.')),
                        'date': ad.find('div', class_='date').getText().strip(),
                        'seller': ad.find('div', class_='seller-name ellipsis')['title'],
                        'title': ad.find('span', class_='mp-listing-title')['title']
                    })
            except (AttributeError, TypeError, KeyError, ValueError):
                print("Non numeric price")
    return price

# Example: distribution plot of all Apple laptop listings
MB = crawlmp(
    'z/computers-en-software/laptops-apple.html',
    {
        'categoryId': '325',
        'query': '',
        'searchOnTitleAndDescription': 'true',
        'startDateFrom': 'always'
    }
)
# Collect all the prices in a NumPy array
MB_prices = np.array([d['price'] for d in MB])
# Print the average price of the results
print("Average price: {}".format(np.average(MB_prices)))
# Plot the price distribution as a histogram
n, bins, patches = plt.hist(MB_prices, bins=range(0, 3000, 50))
plt.show()
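Because crawlmp returns a list of dicts with 'price', 'date', 'seller' and 'title' keys, the other fields can be inspected directly as well. A minimal sketch, reusing the MB result built above, that prints the five cheapest listings:

# Sketch: print the five cheapest listings from the MB result above
cheapest = sorted(MB, key=lambda ad: ad['price'])[:5]
for ad in cheapest:
    print("€ {price:>7.2f}  {date:<12}  {title}".format(**ad))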
Example of the result for all 13-inch MacBook Pros:
X-axis: price in €
Y-axis: number of ads per bin
Bin size: €50
Data collected on 17-02-17
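The script above shows the histogram without axis labels; a small optional sketch (label text taken from the description above, not part of the original gist) that re-plots it with labels:

# Optional: re-plot the histogram with labelled axes matching the description above
plt.hist(MB_prices, bins=range(0, 3000, 50))
plt.xlabel('Price in €')
plt.ylabel('Number of ads per bin')
plt.show()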