Skip to content

Instantly share code, notes, and snippets.

@linlincheng
Last active April 20, 2020 12:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save linlincheng/dd3a3c470bbf992ffd87d5ecf2b445c3 to your computer and use it in GitHub Desktop.
import urllib2
from bs4 import BeautifulSoup
import requests
import re
import urlparse
from selenium import webdriver
from csv import DictWriter
import json
# Step 1. Generate every first-level results URL on christies.com:
# one per month (1-12) x year (1998-2016) x page (1-3).  Pages that do
# not exist are filtered out later when the fetch fails.
url_list = []
for month in range(1, 13):
    for year in range(1998, 2017):
        for page in range(1, 4):
            url_list.append(
                'http://www.christies.com/results/?month=' + str(month)
                + '&year=' + str(year)
                + '&locations=&scids=&action=paging&initialpageload=false&pg='
                + str(page)
            )
# Sanity check: 12 months * 19 years * 3 pages = 684 URLs.
print(len(url_list))
# Step 2. Fetch each result-listing page, locate the auction list
# ("ul#list-items"), and scrape one metadata dict per auction event.
# NOTE(review): the original paste lost all indentation; the structure
# below is reconstructed from the loop variables (one fetch + one parse
# pass per URL, one dict per <li id="day-..."> event).
tmp_list = []
for url in url_list:
    # Reset per iteration: the original left `events` bound across
    # iterations, so a failed fetch silently reprocessed the previous
    # page's events (or raised NameError if the very first fetch failed).
    events = []
    try:
        page = urllib2.urlopen(url)  # renamed: original shadowed `auction`
        html = page.read()
        soup = BeautifulSoup(html)
        # Narrow to the auction list, then re-parse it so find_all only
        # sees list items inside that <ul>.
        grand_list = soup.find("ul", {"id": "list-items"})
        sub_soup = BeautifulSoup(str(grand_list))
        events = sub_soup.find_all("li", id=re.compile('^day-'))
        print('.')
    except Exception:
        # Narrowed from a bare `except:`; log the failing URL and move on
        # (month/page combinations with no results 404 by design).
        print(url)
    print('-')
    # Inner loop: build a dict for each event on this page.
    final_result = []
    for event in events:
        try:
            auction = {}
            url_find = event.find("a", {"class": "description"}).get('href').encode('ascii', 'ignore').strip()
            auction['month'] = event.find("span", "month").get_text().encode('ascii', 'ignore').strip()
            auction['date'] = event.find("span", "date").get_text().encode('ascii', 'ignore').strip()
            auction['year'] = event.find("span", "year").get_text().encode('ascii', 'ignore').strip()
            auction['ID'] = event.find("a", {"class": "sale-number"}).get_text().encode('ascii', 'ignore').strip()
            auction['location'] = event.find("span", "location").get_text().encode('ascii', 'ignore').strip()
            auction['event_name'] = event.find("a", {"class": "description"}).get_text().encode('ascii', 'ignore').strip()
            auction['total_sales'] = event.find("ul", {"class": "auction-links"}).find('a').get_text().encode('ascii', 'ignore').strip()
            auction['url'] = "http://www.christies.com" + url_find
            final_result.append(auction)
            print(auction['month'])
        except Exception as e:
            # One malformed event (missing field -> AttributeError on
            # .get_text()) should not abort the rest of the page.
            print(e)
    tmp_list += final_result
    # Running total of all events scraped so far.
    print(len(tmp_list))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment