Skip to content

Instantly share code, notes, and snippets.

@puhitaku
Created April 14, 2015 09:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save puhitaku/553f7cd1b3000752bb8c to your computer and use it in GitHub Desktop.
Save puhitaku/553f7cd1b3000752bb8c to your computer and use it in GitHub Desktop.
# BROOKS Coffee price parser: scrapes product and price data from www.brooks.co.jp.
import re #regex
import requests as req
from bs4 import BeautifulSoup as bs
def get_text(url):
    """Download *url* and return the response body as raw bytes.

    The bytes are returned undecoded so that the HTML parser can work out
    the page's character encoding on its own.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``bytes``.
    """
    src = req.get(url)
    # Use the payload exactly as received.  Decoding it to text and encoding
    # it again with ``src.encoding`` fails with TypeError when the response
    # carries no encoding (``src.encoding`` is None) and can fail or lose
    # data when the body contains bytes that are invalid in that encoding.
    return src.content
def get_price(product_url):
    """Print and return the price shown on a product detail page.

    The price text is taken from the first child of the first ``<b>``
    element inside the first ``<td class="sp">`` cell of the page.

    Args:
        product_url: URL of the product detail page.

    Returns:
        The leading run of digits of the price text, with commas removed,
        as a string.  The string is empty when the text does not start with
        a digit.

    Raises:
        ValueError: If the page has no ``<td class="sp">`` cell holding a
            non-empty ``<b>`` element.
    """
    soup = bs(get_text(product_url))
    cell = soup.find('td', {'class': 'sp'})
    # Fail with a clear message instead of an AttributeError/IndexError on
    # None when the page layout is not the expected one.
    if cell is None or cell.b is None or not cell.b.contents:
        raise ValueError(
            'price cell not found on page: {}'.format(product_url))

    # Drop thousands separators so the digits form one contiguous run.
    raw = cell.b.contents[0].replace(',', '')
    # '[0-9]*' always matches at the start (possibly the empty string).
    price = re.match('^[0-9]*', raw).group()
    print(price)
    return price
def get_products(category_url):
    """Scrape the product entries listed on a category page.

    Args:
        category_url: URL of the category listing page.

    Returns:
        A list with one dict per product, each holding:
        ``'kind'`` (``'豆'`` or ``'挽'``), ``'name'`` (str),
        ``'gram'`` (the digits before ``g``, as a string) and
        ``'price'`` (int, parsed from the text before ``円(税抜)``).

    Raises:
        AttributeError: If the text of a matched element does not fit the
            corresponding detail pattern (``match`` returns None).
        ValueError: If a price text is not a valid integer once commas are
            removed.
    """
    # Raw strings so that regex escapes such as \s and \( are not treated
    # as (invalid) string escape sequences.  Compiled once, used per item.
    link_pattern = re.compile(r'(豆|挽)\s(.*)\s(.*[0-9]*)g')
    detail_pattern = re.compile(r'\n.*(豆|挽)\s(.*)\s([0-9]*)g')
    price_pattern = re.compile(r'(.*)円\(税抜\)')

    # NOTE(review): no parser is named, so bs4 picks one from those
    # installed; results may differ between environments.
    soup = bs(get_text(category_url))

    # <a> tags whose text looks like a product entry.  next_element steps
    # from each tag to the node parsed right after it, i.e. its text.
    link_texts = [a.next_element
                  for a in soup.find_all('a', text=link_pattern)]
    # Split each entry into (kind, name, gram).
    prods = [detail_pattern.match(text).group(1, 2, 3)
             for text in link_texts]

    # <b> tags holding a price; same tag -> text step as above.
    price_texts = [b.next_element
                   for b in soup.find_all('b', text=price_pattern)]
    # Keep the amount, drop thousands separators, convert to int.
    prices = [int(price_pattern.match(text).group(1).replace(',', ''))
              for text in price_texts]

    # NOTE(review): products and prices are collected independently and
    # paired purely by position; zip() silently truncates to the shorter
    # list if the counts differ.
    return [{'kind': kind, 'name': name, 'gram': gram, 'price': price}
            for (kind, name, gram), price in zip(prods, prices)]
# Single-product variant, kept for reference:
# get_price('http://www.brooks.co.jp/refer/syosai.php?SHNCOD=29205')

# Fetch every product of category 241 and print one record per product,
# each record followed by a blank line.
info = get_products('http://www.brooks.co.jp/refer/ichiran.php?CATEGORY=241')
for product in info:
    fields = (
        ('Kind:', product['kind']),
        ('Name:', product['name']),
        ('Grams:', product['gram'], '[g]'),
        ('Price:', product['price'], '[Yen]'),
    )
    for field in fields:
        print(*field)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment