# netguy204/steamgrab.py

## steamgrab.py
#!/usr/bin/env python

import json
import os
import sys
import urllib
import pickle
import re
import csv
from bs4 import BeautifulSoup

def cachename(appid):
    """Return the path of the cache file for ``appid``.

    ``appid`` is converted with ``str()``, so any key with a string form
    works. The file lives directly under the relative ``steamcache``
    directory.
    """
    return os.path.join('steamcache', str(appid))

def read_cache(appid):
    """Return the value cached under ``appid``, or ``None`` if none exists.

    NOTE: the file is unpickled, and unpickling can execute arbitrary code.
    Only use this with a ``steamcache`` directory this script wrote itself.
    """
    name = cachename(appid)
    if not os.path.exists(name):
        return None
    # Pickle data is a byte stream, so the file is opened in binary mode.
    with open(name, 'rb') as f:
        return pickle.load(f)

def write_cache(appid, value):
    """Pickle ``value`` into the cache file for ``appid``.

    The cache directory is created on first use.
    """
    name = cachename(appid)
    cachedir = os.path.dirname(name)
    # A fresh checkout has no cache directory yet; without this the very
    # first write would fail with "No such file or directory".
    if cachedir and not os.path.isdir(cachedir):
        os.makedirs(cachedir)
    with open(name, 'wb') as f:
        pickle.dump(value, f)

def extractor(data):
    """Yield the ``name`` of every entry under ``data['applist']['apps']['app']``."""
    apps = data['applist']['apps']['app']
    for entry in apps:
        yield entry['name']


def printer(gen):
    for name in gen:
        print name.encode('ascii', errors='ignore')

def getdata(appid):
    """Scrape price and platform support from the store page of ``appid``.

    The raw page is taken from the cache when present; otherwise it is
    downloaded and stored in the cache under ``appid``.

    ``appid`` must be numeric because it is formatted into the URL with
    ``%d``.

    Returns a dict with the keys:
      * ``'price'`` -- ``'?'`` when the page has no ``div.price`` element or
        its text is empty; ``0`` when the text starts with ``'Free'``;
        otherwise the text with its first character dropped (presumably a
        currency symbol) parsed as a float, or the raw text if that fails.
      * ``'mac'``, ``'win'``, ``'lin'`` -- booleans, true when the matching
        ``span.platform_img.*`` element is present.
    """
    pagesrc = read_cache(appid)
    if not pagesrc:
        url = 'http://store.steampowered.com/app/%d/' % appid
        f = urllib.urlopen(url)
        try:
            pagesrc = f.read()
        finally:
            # Release the connection even if the read fails.
            f.close()
        write_cache(appid, pagesrc)

    soup = BeautifulSoup(pagesrc)

    price = '?'
    priceel = soup.select('div.price')
    if priceel:
        pricestr = priceel[0].text.strip()
        if pricestr.startswith('Free'):
            # Same convention as getsearchpage: free titles cost 0.
            price = 0
        elif pricestr:
            try:
                price = float(pricestr[1:])
            except ValueError:
                # Not a plain "<symbol><number>" price; keep the raw text
                # rather than aborting.
                price = pricestr

    ismac = bool(soup.select('span.platform_img.mac'))
    iswin = bool(soup.select('span.platform_img.win'))
    islin = bool(soup.select('span.platform_img.linux'))

    return {'price': price, 'mac': ismac, 'win': iswin, 'lin': islin}

# Captures the run of digits that follows "app/" in a link.
idre = re.compile(r'app/([0-9]+)')

# Lazily built appid -> ASCII name table; None until the first lookup.
iddict = None
def id2name(appid):
    """Return the ASCII-encoded name for ``appid``, or ``None`` if unknown.

    The lookup table is loaded from ``steam.json`` in the current directory
    on the first call and reused afterwards. Characters in a name that
    cannot be encoded as ASCII are dropped.
    """
    global iddict
    # Compare against None: an empty table is falsy, and would otherwise
    # make every call re-read steam.json.
    if iddict is None:
        with open('steam.json') as f:
            data = json.load(f)
        # Build the table fully before publishing it, so a malformed entry
        # cannot leave a half-filled table behind.
        iddict = dict(
            (item['appid'], item['name'].encode('ascii', errors='ignore'))
            for item in data['applist']['apps']['app'])
    return iddict.get(appid)


def getsearchpage(page=1):
    """Yield one dict per app row on page ``page`` of the store search.

    The raw result page is taken from the cache (key ``'page<N>'``) when
    present; otherwise it is downloaded and cached.

    Rows whose link has no ``app/<digits>`` part, or that have no usable
    ``h4`` title, are skipped. Each yielded dict has the keys:
      * ``'appid'`` -- int parsed from the row link.
      * ``'name'`` -- title encoded to ASCII, unencodable characters dropped.
      * ``'price'`` -- ``''`` when no price text is available; ``0`` when the
        text starts with ``'Free'``; otherwise the text with its first
        character dropped (presumably a currency symbol) parsed as a float,
        or the raw text if that fails.
      * ``'win'``, ``'lin'``, ``'mac'`` -- booleans, true when the matching
        ``span.platform_img.*`` element is present in the row.
      * ``'metascore'`` -- int, or ``None`` when absent or not numeric.
    """
    cachekey = 'page%d' % page
    pagesrc = read_cache(cachekey)
    if not pagesrc:
        baseurl = 'http://store.steampowered.com/search/results'
        query = urllib.urlencode([('sort_by', 'Name'), ('sort_order', 'ASC'), ('category1', '99'),
                                  ('cc', 'us'), ('v5', '1'), ('page', str(page))])
        url = '%s?%s' % (baseurl, query)
        response = urllib.urlopen(url)
        try:
            pagesrc = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        write_cache(cachekey, pagesrc)

    soup = BeautifulSoup(pagesrc)
    for row in soup.select('a.search_result_row'):
        matches = idre.search(row['href'])
        if not matches:
            # Not a link to an app page.
            continue
        appid = int(matches.group(1))

        # Guard the [0] lookups: one oddly shaped row must not abort the
        # whole scrape with an IndexError.
        name_elms = row.select('h4')
        name = name_elms[0].string if name_elms else None
        if not name:
            continue
        name = name.encode('ascii', errors='ignore')

        price = ''
        price_elms = row.select('div.search_price')
        pricestr = price_elms[0].string if price_elms else None
        if pricestr:
            pricestr = pricestr.strip()
        if pricestr:
            if pricestr.startswith('Free'):
                price = 0
            else:
                try:
                    price = float(pricestr[1:])
                except ValueError:
                    # Not a plain "<symbol><number>" price; keep the text.
                    price = pricestr

        iswin = bool(row.select('span.platform_img.win'))
        islin = bool(row.select('span.platform_img.linux'))
        ismac = bool(row.select('span.platform_img.mac'))

        metascore = None
        metascore_elm = row.select('div.search_metascore')
        if metascore_elm:
            metascore_str = metascore_elm[0].text.strip()
            if metascore_str:
                try:
                    metascore = int(metascore_str)
                except ValueError:
                    # Non-numeric score text is treated as "no score".
                    metascore = None

        yield {'price': price, 'win': iswin, 'lin': islin, 'mac': ismac,
               'metascore': metascore, 'appid': appid, 'name': name}

if __name__ == '__main__':
    fname = None
    if len(sys.argv) >= 2:
        fname = sys.argv[1]

    if not fname:
        #print getdata(int(fname))
        page = 1
        writer = csv.writer(sys.stdout)
        writer.writerow(['Name', 'Mac', 'Win', 'Lin', 'AppId', 'Score', 'Price'])
        timeszero = 0
        while True:
            count = 0
            for item in getsearchpage(page):
                writer.writerow([item['name'], item['mac'], item['win'], item['lin'],
                                 item['appid'], item['metascore'], item['price']])
                count = count + 1

            # sometimes pages have only things we don't care about. if
            # that happens too much then we must have reached the end
            if count == 0:
                timeszero = timeszero + 1
            else:
                timeszero = 0

            if timeszero == 3: break

            page = page + 1

    elif os.path.exists(fname):
        printer(extractor(json.loads(open(fname).read())))
    else:
        print 'dont know what to do'
	#!/usr/bin/env python

	import json
	import os
	import sys
	import urllib
	import pickle
	import re
	import csv
	from bs4 import BeautifulSoup

	def cachename(appid):
		"""Return the path of the cache file for ``appid``.

		``appid`` is converted with ``str()``, so any key with a string form
		works. The file lives directly under the relative ``steamcache``
		directory.
		"""
		return os.path.join('steamcache', str(appid))

	def read_cache(appid):
		"""Return the value cached under ``appid``, or ``None`` if none exists.

		NOTE: the file is unpickled, and unpickling can execute arbitrary
		code. Only use this with a ``steamcache`` directory this script wrote
		itself.
		"""
		name = cachename(appid)
		if not os.path.exists(name):
			return None
		# Pickle data is a byte stream, so the file is opened in binary mode.
		with open(name, 'rb') as f:
			return pickle.load(f)

	def write_cache(appid, value):
		"""Pickle ``value`` into the cache file for ``appid``.

		The cache directory is created on first use.
		"""
		name = cachename(appid)
		cachedir = os.path.dirname(name)
		# A fresh checkout has no cache directory yet; without this the very
		# first write would fail with "No such file or directory".
		if cachedir and not os.path.isdir(cachedir):
			os.makedirs(cachedir)
		with open(name, 'wb') as f:
			pickle.dump(value, f)

	def extractor(data):
		"""Yield the ``name`` of every entry under ``data['applist']['apps']['app']``."""
		for item in data['applist']['apps']['app']:
			yield item['name']


	def printer(gen):
	for name in gen:
	print name.encode('ascii', errors='ignore')

	def getdata(appid):
		"""Scrape price and platform support from the store page of ``appid``.

		The raw page is taken from the cache when present; otherwise it is
		downloaded and stored in the cache under ``appid``.

		``appid`` must be numeric because it is formatted into the URL with
		``%d``.

		Returns a dict with the keys:
		  * ``'price'`` -- ``'?'`` when the page has no ``div.price`` element
		    or its text is empty; ``0`` when the text starts with ``'Free'``;
		    otherwise the text with its first character dropped (presumably a
		    currency symbol) parsed as a float, or the raw text if that fails.
		  * ``'mac'``, ``'win'``, ``'lin'`` -- booleans, true when the matching
		    ``span.platform_img.*`` element is present.
		"""
		pagesrc = read_cache(appid)
		if not pagesrc:
			url = 'http://store.steampowered.com/app/%d/' % appid
			f = urllib.urlopen(url)
			try:
				pagesrc = f.read()
			finally:
				# Release the connection even if the read fails.
				f.close()
			write_cache(appid, pagesrc)

		soup = BeautifulSoup(pagesrc)

		price = '?'
		priceel = soup.select('div.price')
		if priceel:
			pricestr = priceel[0].text.strip()
			if pricestr.startswith('Free'):
				# Same convention as getsearchpage: free titles cost 0.
				price = 0
			elif pricestr:
				try:
					price = float(pricestr[1:])
				except ValueError:
					# Not a plain "<symbol><number>" price; keep the raw
					# text rather than aborting.
					price = pricestr

		ismac = bool(soup.select('span.platform_img.mac'))
		iswin = bool(soup.select('span.platform_img.win'))
		islin = bool(soup.select('span.platform_img.linux'))

		return {'price': price, 'mac': ismac, 'win': iswin, 'lin': islin}

	# Captures the run of digits that follows "app/" in a link.
	idre = re.compile(r'app/([0-9]+)')

	# Lazily built appid -> ASCII name table; None until the first lookup.
	iddict = None
	def id2name(appid):
		"""Return the ASCII-encoded name for ``appid``, or ``None`` if unknown.

		The lookup table is loaded from ``steam.json`` in the current
		directory on the first call and reused afterwards. Characters in a
		name that cannot be encoded as ASCII are dropped.
		"""
		global iddict
		# Compare against None: an empty table is falsy, and would otherwise
		# make every call re-read steam.json.
		if iddict is None:
			with open('steam.json') as f:
				data = json.load(f)
			# Build the table fully before publishing it, so a malformed
			# entry cannot leave a half-filled table behind.
			iddict = dict(
				(item['appid'], item['name'].encode('ascii', errors='ignore'))
				for item in data['applist']['apps']['app'])
		return iddict.get(appid)


	def getsearchpage(page=1):
		"""Yield one dict per app row on page ``page`` of the store search.

		The raw result page is taken from the cache (key ``'page<N>'``) when
		present; otherwise it is downloaded and cached.

		Rows whose link has no ``app/<digits>`` part, or that have no usable
		``h4`` title, are skipped. Each yielded dict has the keys:
		  * ``'appid'`` -- int parsed from the row link.
		  * ``'name'`` -- title encoded to ASCII, unencodable characters
		    dropped.
		  * ``'price'`` -- ``''`` when no price text is available; ``0`` when
		    the text starts with ``'Free'``; otherwise the text with its first
		    character dropped (presumably a currency symbol) parsed as a
		    float, or the raw text if that fails.
		  * ``'win'``, ``'lin'``, ``'mac'`` -- booleans, true when the matching
		    ``span.platform_img.*`` element is present in the row.
		  * ``'metascore'`` -- int, or ``None`` when absent or not numeric.
		"""
		cachekey = 'page%d' % page
		pagesrc = read_cache(cachekey)
		if not pagesrc:
			baseurl = 'http://store.steampowered.com/search/results'
			query = urllib.urlencode([('sort_by', 'Name'), ('sort_order', 'ASC'), ('category1', '99'),
					('cc', 'us'), ('v5', '1'), ('page', str(page))])
			url = '%s?%s' % (baseurl, query)
			response = urllib.urlopen(url)
			try:
				pagesrc = response.read()
			finally:
				# Release the connection even if the read fails.
				response.close()
			write_cache(cachekey, pagesrc)

		soup = BeautifulSoup(pagesrc)
		for row in soup.select('a.search_result_row'):
			matches = idre.search(row['href'])
			if not matches:
				# Not a link to an app page.
				continue
			appid = int(matches.group(1))

			# Guard the [0] lookups: one oddly shaped row must not abort
			# the whole scrape with an IndexError.
			name_elms = row.select('h4')
			name = name_elms[0].string if name_elms else None
			if not name:
				continue
			name = name.encode('ascii', errors='ignore')

			price = ''
			price_elms = row.select('div.search_price')
			pricestr = price_elms[0].string if price_elms else None
			if pricestr:
				pricestr = pricestr.strip()
			if pricestr:
				if pricestr.startswith('Free'):
					price = 0
				else:
					try:
						price = float(pricestr[1:])
					except ValueError:
						# Not a plain "<symbol><number>" price; keep the text.
						price = pricestr

			iswin = bool(row.select('span.platform_img.win'))
			islin = bool(row.select('span.platform_img.linux'))
			ismac = bool(row.select('span.platform_img.mac'))

			metascore = None
			metascore_elm = row.select('div.search_metascore')
			if metascore_elm:
				metascore_str = metascore_elm[0].text.strip()
				if metascore_str:
					try:
						metascore = int(metascore_str)
					except ValueError:
						# Non-numeric score text is treated as "no score".
						metascore = None

			yield {'price': price, 'win': iswin, 'lin': islin, 'mac': ismac,
				'metascore': metascore, 'appid': appid, 'name': name}

	if __name__ == '__main__':
	fname = None
	if len(sys.argv) >= 2:
	fname = sys.argv[1]

	if not fname:
	#print getdata(int(fname))
	page = 1
	writer = csv.writer(sys.stdout)
	writer.writerow(['Name', 'Mac', 'Win', 'Lin', 'AppId', 'Score', 'Price'])
	timeszero = 0
	while True:
	count = 0
	for item in getsearchpage(page):
	writer.writerow([item['name'], item['mac'], item['win'], item['lin'],
	item['appid'], item['metascore'], item['price']])
	count = count + 1

	# sometimes pages have only things we don't care about. if
	# that happens too much then we must have reached the end
	if count == 0:
	timeszero = timeszero + 1
	else:
	timeszero = 0

	if timeszero == 3: break

	page = page + 1

	elif os.path.exists(fname):
	printer(extractor(json.loads(open(fname).read())))
	else:
	print 'dont know what to do'