@tfausak
Created December 24, 2011 21:01
Scrape application information from the Windows Phone Marketplace.
#!/usr/bin/env python
import BeautifulSoup
import cookielib
import re
import urllib2

def parse_rating(rating):
    """Parse a rating's class name and return it as a number.

    The rating's class name is expected to follow the format used
    on the Windows Phone Marketplace. The rating should be between
    1 and 5 with half-star increments. A rating of 0 means unrated.

    >>> [parse_rating(x + 'Pt' + y)
    ...  for x in ['zero', 'one', 'two', 'three', 'four', 'five']
    ...  for y in ['zero', 'five']]
    [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5]
    """
    # Build a lookup table of values.
    values = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5}

    # Split the rating around the decimal point.
    integer, fraction = rating.split('Pt')

    # Convert each part to an integer.
    integer = values[integer]
    fraction = values[fraction.lower()]

    # Combine and return the parts.
    return integer + (fraction / 10.0)

def scrape(url, jar=None):
    """Extract application data from the Windows Phone Marketplace.

    The URL is an app's URL in the marketplace. The jar is an
    optional cookie jar that makes scraping multiple apps faster by
    skipping the required cookie handshake on subsequent scrapes.
    Returns a tuple with the app's details as a dictionary and the
    cookie jar.

    >>> scrape(...)
    ({...}, <cookielib.CookieJar[...]>)
    """
    # Create a cookie jar if necessary.
    if jar is None:
        jar = cookielib.CookieJar()

    # Build an opener using the cookie jar.
    handler = urllib2.HTTPCookieProcessor(jar)
    opener = urllib2.build_opener(handler)

    # Fill the cookie jar if it's empty.
    if not jar:
        opener.open(url)

    # Open and parse the marketplace page.
    response = opener.open(url)
    soup = BeautifulSoup.BeautifulSoup(response)

    # Extract app information.
    info = {
        'category': {
            'name': soup.find(id='category').find('strong').string,
            'url': ('http://www.windowsphone.com' +
                    soup.find(id='category').find('a')['href']),
        },
        'description': soup.find(id='appDetails').find('pre').string,
        'id': soup.find(property='og:url')['content'][-36:],
        'image': soup.find(id='appSummary').find('img')['src'],
        'name': soup.find(id='application').find('h1').string,
        'permissions': [tag.string
                        for tag in soup.find(id='disclosures').findAll('li')],
        'price': soup.find('a', {'class': 'purchase'}),
        'publisher': {
            'name': soup.find(id='publisher').find('a').string,
            'url': soup.find(id='publisher').find('a')['href'],
        },
        'rating': parse_rating(soup.find('div',
            {'class': re.compile('ratingLarge .*')})['class'].split()[1]),
        'required': soup.find(id='required').string,
        'screenshots': [tag['src']
                        for tag in soup.find(id='screenshots').findAll('img')],
        'url': soup.find(property='og:url')['content'],
        'version': soup.find(id='version').find('li').string,
    }

    # Parse the price as a float.
    if info['price']['id'] == 'free':
        info['price'] = 0.00
    else:
        info['price'] = float(info['price'].string[-4:])

    # Remove the unnecessary query string from the publisher URL.
    info['publisher']['url'] = re.sub('[?].*$', '', info['publisher']['url'])

    # Remove the resize flag from image query strings.
    info['image'] = re.sub('&resize=true', '', info['image'])
    for index, screenshot in enumerate(info['screenshots']):
        info['screenshots'][index] = re.sub('&resize=true', '', screenshot)

    # Return the app details along with the cookie jar so it can be reused.
    return info, jar
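
For reference, here is a minimal usage sketch. The app URLs are made-up placeholders (a real marketplace URL ends in the app's 36-character GUID, which is what the [-36:] slice above extracts), and the cookie jar returned by the first call is passed back in so later calls skip the cookie handshake described in the scrape docstring.

if __name__ == '__main__':
    # Placeholder URLs; substitute real Windows Phone Marketplace app URLs.
    urls = [
        'http://www.windowsphone.com/en-US/apps/00000000-0000-0000-0000-000000000000',
        'http://www.windowsphone.com/en-US/apps/11111111-1111-1111-1111-111111111111',
    ]

    jar = None
    for url in urls:
        # Reuse the cookie jar after the first scrape.
        info, jar = scrape(url, jar)
        print '%s %s: $%.2f' % (info['name'], info['version'], info['price'])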