Created
December 24, 2011 21:01
-
-
Save tfausak/1518335 to your computer and use it in GitHub Desktop.
Scrape application information from the Windows Phone Marketplace.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import BeautifulSoup | |
import cookielib | |
import re | |
import urllib2 | |
def parse_rating(rating):
    """Turn a marketplace rating class name into a number.

    The class name spells out the rating as two number words joined by
    ``Pt`` (the decimal point), e.g. ``fourPtFive`` for 4.5. The word
    after ``Pt`` is matched case-insensitively; the word before it must
    be lowercase. Ratings are expected to run from 1 to 5 in half-star
    steps, with 0 meaning unrated.

    Raises ``ValueError`` if the name does not contain exactly one
    ``Pt`` and ``KeyError`` if either word is not zero through five.

    >>> parse_rating('fourPtFive')
    4.5
    >>> [parse_rating(x + 'Pt' + y)
    ...     for x in ['zero', 'one', 'two', 'three', 'four', 'five']
    ...     for y in ['zero', 'five']]
    [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5]

    """
    # Number words the marketplace uses in its class names.
    digits = dict(zero=0, one=1, two=2, three=3, four=4, five=5)

    # The text before 'Pt' is the whole part, the text after is tenths.
    whole_word, tenths_word = rating.split('Pt')
    whole = digits[whole_word]
    tenths = digits[tenths_word.lower()]

    # Dividing by a float keeps the result a float on every Python version.
    return whole + tenths / 10.0
def scrape(url, jar=None):
    """Extract application data from the Windows Phone Marketplace.

    Args:
        url: An app's URL in the marketplace.
        jar: Optional ``cookielib.CookieJar``. Passing the same jar to
            several calls makes scraping multiple apps faster by
            skipping the cookie handshake request on subsequent
            scrapes. The jar is filled in place, so the caller keeps
            access to it through its own reference.

    Returns:
        A dictionary with the keys ``category`` (``name``, ``url``),
        ``description``, ``id``, ``image``, ``name``, ``permissions``,
        ``price`` (float, ``0.0`` for free apps), ``publisher``
        (``name``, ``url``), ``rating`` (float), ``required``,
        ``screenshots``, ``url`` and ``version``.

        Only the dictionary is returned; the cookie jar is not part of
        the return value.

    Raises:
        AttributeError, TypeError or KeyError if the page lacks one of
        the expected elements or attributes, and whatever ``urllib2``
        raises when the page cannot be fetched.

    """
    # Create a cookie jar if necessary.
    if jar is None:
        jar = cookielib.CookieJar()

    # Build an opener using the cookie jar.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))

    # Fill the cookie jar if it's empty. The body of this first response
    # is not needed, so the connection is released right away.
    if not jar:
        opener.open(url).close()

    # Open and parse the marketplace page, closing the response even if
    # parsing fails.
    response = opener.open(url)
    try:
        soup = BeautifulSoup.BeautifulSoup(response)
    finally:
        response.close()

    # Look up elements that feed more than one field only once.
    category = soup.find(id='category')
    publisher_link = soup.find(id='publisher').find('a')
    canonical_url = soup.find(property='og:url')['content']
    rating_tag = soup.find('div', {'class': re.compile('ratingLarge .*')})
    purchase_link = soup.find('a', {'class': 'purchase'})

    # Parse price as a float.
    if purchase_link['id'] == 'free':
        price = 0.00
    else:
        price = _parse_price(purchase_link.string)

    return {
        'category': {
            'name': category.find('strong').string,
            'url': ('http://www.windowsphone.com' +
                    category.find('a')['href']),
        },
        'description': soup.find(id='appDetails').find('pre').string,
        # The identifier is the last 36 characters of the canonical URL
        # (presumably a GUID -- confirm against live pages).
        'id': canonical_url[-36:],
        # Remove resize bit from image query strings.
        'image': soup.find(id='appSummary').find('img')['src'].replace(
            '&resize=true', ''),
        'name': soup.find(id='application').find('h1').string,
        'permissions': [tag.string
                        for tag in soup.find(id='disclosures').findAll('li')],
        'price': price,
        'publisher': {
            'name': publisher_link.string,
            # Remove unnecessary query string from publisher URL.
            'url': re.sub('[?].*$', '', publisher_link['href']),
        },
        # The second class name encodes the rating, e.g. "fourPtFive".
        'rating': parse_rating(rating_tag['class'].split()[1]),
        'required': soup.find(id='required').string,
        'screenshots': [tag['src'].replace('&resize=true', '')
                        for tag in soup.find(id='screenshots').findAll('img')],
        'url': canonical_url,
        'version': soup.find(id='version').find('li').string,
    }


def _parse_price(label):
    """Return the number at the end of a purchase link's text as a float.

    Reading the whole trailing number (rather than a fixed four
    characters) keeps prices of 10.00 and above, and prices without
    cents, intact.
    """
    match = re.search(r'(\d*\.\d+|\d+)\s*$', label)
    if match is None:
        # No trailing number found: fall back to the fixed-width slice
        # used before, so such labels are treated exactly as they were.
        return float(label[-4:])
    return float(match.group(1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment