Skip to content

Instantly share code, notes, and snippets.

@AkhilRD
Last active August 11, 2022 06:43
Show Gist options
  • Save AkhilRD/473022207bcc386b99e89f907b16e8ef to your computer and use it in GitHub Desktop.
Save AkhilRD/473022207bcc386b99e89f907b16e8ef to your computer and use it in GitHub Desktop.
A scraper that collects gins from The Whisky Exchange website and builds a DataFrame for further analysis.
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
# Site root (prepended to product hrefs further down) and the gin category page.
baseurl = "https://www.thewhiskyexchange.com/"
url = "https://www.thewhiskyexchange.com/c/338/gin"

# Download the category page and parse the raw bytes with the lxml backend.
source = requests.get(url)
soup = BeautifulSoup(source.content, features='lxml')

# Collect the <li> elements carrying the product-grid__item class and dump
# them so the markup can be inspected by eye.
productlist = soup.find_all('li', class_='product-grid__item')
print(productlist)
# Walk the paginated gin listing and collect the URL of every product found.
product_links = []
for page in range(1, 13):  # listing pages 1 through 12
    # The call and its argument must form one expression: a bare
    # `requests.get` followed by a parenthesised string on the next line only
    # binds the function object and never performs the request.
    source = requests.get(
        f'https://www.thewhiskyexchange.com/c/338/gin?pg={page}&psize=60',
        timeout=30,  # do not hang forever on a stalled connection
    )
    soup = BeautifulSoup(source.content, 'lxml')
    productlist = soup.find_all('li', {'class': 'product-grid__item'})
    for item in productlist:
        for link in item.find_all("a", href=True):
            # baseurl ends with "/"; normalise both sides so the joined URL
            # has exactly one slash whether or not the href starts with one.
            product_links.append(
                baseurl.rstrip('/') + '/' + link['href'].lstrip('/')
            )
print(product_links)
print(len(product_links))
# Building a structure (Bombay Sapphire Gin as an example)
# One known product page is fetched to work out which elements hold each field.
r = requests.get("https://www.thewhiskyexchange.com/p/2249/bombay-sapphire-london-dry-gin")
soup = BeautifulSoup(r.content, features='lxml')

name = soup.find("h1", class_="product-main__name").text

# The review block is looked up separately so a page without one yields 'N/A'.
rating_temp = soup.find("div", class_="review-overview")
if rating_temp:
    rating = rating_temp.text
else:
    rating = 'N/A'

price = soup.find('p', class_="product-action__price").text.strip()
price_ltr = soup.find('p', class_='product-action__unit-price').text
abv = soup.find('p', class_='product-main__data').text.strip()
country = soup.find(class_="product-facts__data").text.strip()
description = soup.find(class_='product-main__description').text.strip()

# Text of every flavour-profile label found on the page.
notes = [
    label.get_text()
    for label in soup.find_all('span', class_='flavour-profile__label')
]

# One record per product; the keys become the DataFrame columns later on.
Gins = {
    "name": name,
    "cl/abv": abv,
    "price": price,
    "price per ltr": price_ltr,
    'rating': rating,
    "Notes": notes,
    "country": country,
    "description": description,
}
print(Gins)
# Building the core of our scraper
# `headers` is passed to every request below but was never defined in this
# script; a User-Agent is supplied here so the name resolves.
headers = {"User-Agent": "Mozilla/5.0 (compatible; gin-scraper)"}


def _find_text(soup, *args, default='N/A', **kwargs):
    """Return the stripped text of the first element matching the query.

    ``args``/``kwargs`` are forwarded to ``soup.find``. When nothing matches,
    ``default`` is returned instead of raising ``AttributeError``.
    """
    node = soup.find(*args, **kwargs)
    return node.text.strip() if node is not None else default


Gin_List = []
for link in product_links:
    try:
        r = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        print("Skipping (request failed): ", link, exc)
        continue
    soup = BeautifulSoup(r.content, 'lxml')

    name = _find_text(soup, "h1", class_="product-main__name", default=None)
    price = _find_text(soup, 'p', class_="product-action__price", default=None)
    if name is None or price is None:
        # The price is later cast to float, so a row without one is unusable.
        # Skipping also avoids re-recording values left over from the previous
        # product, which is what the former bare `except` silently did.
        print("Skipping (name/price not found): ", link)
        continue

    # The remaining fields are optional: each one falls back to 'N/A' on its
    # own instead of inheriting the previous product's value.
    Gins = {
        "name": name,
        "cl/abv": _find_text(soup, 'p', class_='product-main__data'),
        "price": price,
        "price per ltr": _find_text(soup, 'p', class_='product-action__unit-price'),
        'rating': _find_text(soup, "div", class_="review-overview"),
        "Notes": [
            x.get_text()
            for x in soup.find_all('span', attrs={'class': 'flavour-profile__label'})
        ],
        "country": _find_text(soup, class_="product-facts__data"),
        "description": _find_text(soup, class_='product-main__description'),
    }
    Gin_List.append(Gins)
    print("Saving: ", Gins['name'])
# The scraped records must be loaded into a DataFrame before any cleaning:
# `df` was used below without ever being created.
df = pd.DataFrame(Gin_List)
#Removing format in the reviews column
# Every newline (or run of three) in any text cell is replaced by one space.
df = df.replace('\n\n\n|\n', " ", regex = True)
#Writing a function to extract only numeric objects using regex library
def number(column):
    """Return the digit groups found in ``column`` joined by dots.

    ``"£24.95"`` becomes ``"24.95"`` and ``"70cl / 40%"`` becomes ``"70.40"``.
    A string without digits yields ``""``.

    Single-digit groups are kept, so ``"£5.95"`` gives ``"5.95"`` rather than
    ``"95"``. A comma between two digits is treated as a thousands separator
    and removed, so ``"£1,250"`` gives ``"1250"`` rather than ``"1.250"``.
    """
    # Drop thousands separators only when they sit between two digits.
    cleaned = re.sub(r'(?<=\d),(?=\d)', '', column)
    num = re.findall(r'\d+', cleaned)
    return ".".join(num)
#Extracting numbers using a combination of above function and converting them to integer or float dtypes
# `number` is passed directly to apply; wrapping it in a lambda added nothing.
df['price in £'] = df['price'].apply(number).astype(float)
df['price per ltr in £'] = df['price per ltr'].apply(number)
# number() joins the digit groups with dots; characters 3-4 of that string are
# kept as the abv (e.g. "70.40" -> "40").
df['abv'] = df['cl/abv'].apply(number).str[3:5]
# Regex literals are raw strings so that \d, \. and \( are not flagged as
# invalid string escapes; the compiled patterns are unchanged.
# score: the leading digit plus any ".digits" pieces, concatenated per row.
df['score'] = df['rating'].str.extractall(r'(^\d|\.\d+)').unstack().fillna('').sum(axis=1).astype(float)
# Number of reviews: the digits that follow an opening parenthesis.
df['Number of reviews'] = df['rating'].str.extractall(r'(\((\d+))')[1].unstack().fillna('').sum(axis=1).astype(int)
#Dropping the columns we do not need
df.drop(['cl/abv','price','price per ltr','rating'], axis=1, inplace=True) # dropping all the columns we got data from
def flavours(series):
    """Flatten a Series whose cells are lists into one Series of their items.

    Items keep their original order: all items of the first cell, then the
    second, and so on. Empty lists contribute nothing.
    """
    flattened = []
    for notes in series:
        flattened.extend(notes)
    return pd.Series(flattened)
# Relative frequency of the 20 most common flavour notes, drawn as a
# horizontal bar chart.
flavour_profiles = flavours(df['Notes']).value_counts(normalize= True).head(20)
flavour_profiles.plot(kind = 'barh')
#Converting the notes column into a separate dataframe and renaming the first 3 notes
# Reusing df's index keeps each row of notes attached to its own gin when the
# two frames are concatenated, even if df no longer has a default 0..n-1 index.
split_df = pd.DataFrame(df['Notes'].tolist(), index=df.index)
split_df.rename(columns={0:'Primary',1:'Secondary',2:'Tertiary'}, inplace=True)
#Attaching the new dataframe to our existing dataframe
table = pd.concat([df, split_df], axis=1)
#A simple function to pick gins based on their notes. (The note has to exist in the top 3)
def pick_gin(Primary_note,Secondary_note,Tertiary_note = None):
    """Return the gins in ``table`` that match any of the given notes.

    A row is selected when its 'Primary' column equals ``Primary_note``, or
    its 'Secondary' column equals ``Secondary_note``, or its 'Tertiary' column
    equals ``Tertiary_note``. With the default ``Tertiary_note=None`` the
    third comparison matches no row.

    Returns the first nine columns of the selected rows.
    """
    # The mask is built inside one parenthesised expression: splitting
    # `table.loc` from its `[...]` indexer across lines left `combo` bound to
    # the bare indexer object, which has no `.iloc`.
    matches = (
        (table['Primary'] == Primary_note)
        | (table['Secondary'] == Secondary_note)
        | (table['Tertiary'] == Tertiary_note)
    )
    combo = table.loc[matches]
    return combo.iloc[:,0:9]
#Finding the best bottles
# Keep rows scoring above 4 with more than 20 reviews; show the first nine columns.
high_score = table['score'] > 4
many_reviews = table['Number of reviews'] > 20
table.loc[high_score & many_reviews].iloc[:, 0:9]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment