Last active
August 11, 2022 06:43
-
-
Save AkhilRD/473022207bcc386b99e89f907b16e8ef to your computer and use it in GitHub Desktop.
A scraper that collects gins from The Whisky Exchange website and builds a DataFrame for further analysis of the findings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import re | |
import pandas as pd | |
# Site root, used to turn the hrefs found on listing pages into full URLs.
baseurl = "https://www.thewhiskyexchange.com/"
url = "https://www.thewhiskyexchange.com/c/338/gin"

# Fetch the first listing page once and print its product-grid items so the
# markup can be inspected.
source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')
productlist = soup.find_all('li', {'class': 'product-grid__item'})
print(productlist)

# Walk listing pages 1-12 (60 products requested per page) and collect the
# URL of every anchor found inside a product-grid item.
product_links = []
for page in range(1, 13):
    # The argument list must sit inside the call's parentheses: putting it on
    # its own line after a bare `requests.get` binds `source` to the function
    # itself and the following `source.content` raises AttributeError.
    source = requests.get(
        f'https://www.thewhiskyexchange.com/c/338/gin?pg={page}&psize=60'
    )
    soup = BeautifulSoup(source.content, 'lxml')
    productlist = soup.find_all('li', {'class': 'product-grid__item'})
    for item in productlist:
        for link in item.find_all("a", href=True):
            # hrefs are presumably site-relative ("/p/..."); strip the
            # duplicate slash so the result does not contain "//p/...".
            product_links.append(
                baseurl.rstrip('/') + '/' + link['href'].lstrip('/')
            )

print(product_links)
print(len(product_links))
# Building a structure (Bombay Sapphire Gin as an example)
# One product page is fetched and every field of interest is pulled out, to
# settle the shape of the record before scraping all product pages.
r = requests.get("https://www.thewhiskyexchange.com/p/2249/bombay-sapphire-london-dry-gin")
soup = BeautifulSoup(r.content, 'lxml')

name = soup.find("h1", class_="product-main__name").text

# The review block may be absent from a page; fall back to 'N/A' in that case.
rating_temp = soup.find("div", class_="review-overview")
if rating_temp:
    rating = rating_temp.text
else:
    rating = 'N/A'

price = soup.find('p', class_="product-action__price").text.strip()
price_ltr = soup.find('p', class_='product-action__unit-price').text
abv = soup.find('p', class_='product-main__data').text.strip()
country = soup.find(class_="product-facts__data").text.strip()
description = soup.find(class_='product-main__description').text.strip()

# Collect the text of every flavour-profile label on the page.
notes = []
for x in soup.find_all('span', attrs={'class': 'flavour-profile__label'}):
    notes.append(x.get_text())

Gins = {
    "name": name,
    "cl/abv": abv,
    "price": price,
    "price per ltr": price_ltr,
    'rating': rating,
    "Notes": notes,
    "country": country,
    "description": description,
}
print(Gins)
# Building the core of our scraper

# Request headers sent with every product-page request. The loop below always
# passed `headers=headers`, but the name was never defined, so the first
# request raised NameError. A generic browser-like User-Agent is used here;
# adjust it if the site requires something else.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/104.0 Safari/537.36"
    )
}


def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or 'N/A' when it is missing."""
    return tag.text.strip() if tag else 'N/A'


# One dict per successfully parsed product page, in visiting order.
Gin_List = []
for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    # A record without a name or a price cannot be analysed later, so such
    # pages are skipped explicitly. Previously a bare `except` swallowed the
    # failure and the values of the previous product were appended again.
    name_tag = soup.find("h1", class_="product-main__name")
    price_tag = soup.find('p', class_="product-action__price")
    if name_tag is None or price_tag is None:
        print("Skipping (no name/price found): ", link)
        continue

    # Key order is kept as before because later steps slice the resulting
    # DataFrame by column position.
    Gins = {
        "name": name_tag.text.strip(),
        "cl/abv": _text_or_na(soup.find('p', class_='product-main__data')),
        "price": price_tag.text.strip(),
        "price per ltr": _text_or_na(
            soup.find('p', class_='product-action__unit-price')
        ),
        'rating': _text_or_na(soup.find("div", class_="review-overview")),
        "Notes": [
            x.get_text()
            for x in soup.find_all(
                'span', attrs={'class': 'flavour-profile__label'}
            )
        ],
        "country": _text_or_na(soup.find(class_="product-facts__data")),
        # The call must follow `soup.find` directly: with the argument list
        # on its own line, `description` was bound to the method object
        # instead of the description text.
        "description": _text_or_na(
            soup.find(class_='product-main__description')
        ),
    }
    Gin_List.append(Gins)
    print("Saving: ", Gins['name'])
# Build the DataFrame from the scraped records (one row per dict in
# Gin_List). Nothing in the script created `df` before it was used, so the
# clean-up below raised NameError.
df = pd.DataFrame(Gin_List)
#Removing format in the reviews column
# Newlines (single, or runs of three) left over from the HTML are replaced by
# a single space in every string cell.
df = df.replace('\n\n\n|\n', " ", regex=True)
#Writing a function to extract only numeric objects using regex library
def number(column):
    """Return the digit groups found in a text value, joined by '.'.

    Despite its name, ``column`` is a single string (one cell), e.g.
    ``"£28.95"`` -> ``"28.95"`` or ``"70cl / 40%"`` -> ``"70.40"``.

    Every run of one or more digits is kept. The earlier pattern required at
    least two digits per run, so ``"£9.95"`` came back as ``"95"``. Commas
    that sit between two digits (thousands separators) are removed first so
    ``"£1,250"`` yields ``"1250"`` rather than being split.

    Returns an empty string when the text contains no digits.
    """
    without_separators = re.sub(r'(?<=\d),(?=\d)', '', column)
    return ".".join(re.findall(r'\d+', without_separators))
#Extracting numbers using a combination of above function and converting them to integer or float dtypes
# pd.to_numeric(..., errors='coerce') turns cells without a usable number
# into NaN instead of aborting the whole conversion.
df['price in £'] = pd.to_numeric(df['price'].apply(number), errors='coerce')
df['price per ltr in £'] = pd.to_numeric(
    df['price per ltr'].apply(number), errors='coerce'
)
# Take the number directly in front of the '%' sign rather than slicing fixed
# character positions, which only worked for two-digit bottle sizes.
# NOTE(review): assumes the 'cl/abv' text looks like "70cl / 40%" - confirm.
df['abv'] = pd.to_numeric(
    df['cl/abv'].str.extract(r'(\d+(?:\.\d+)?)\s*%')[0], errors='coerce'
)
# The score is the number the rating text starts with; rows whose rating is
# 'N/A' become NaN.
df['score'] = pd.to_numeric(
    df['rating'].str.extract(r'^\s*(\d+(?:\.\d+)?)')[0], errors='coerce'
)
# The review count is the integer that follows the opening parenthesis.
df['Number of reviews'] = pd.to_numeric(
    df['rating'].str.extract(r'\((\d+)')[0], errors='coerce'
)
#Dropping the columns we do not need
df.drop(['cl/abv', 'price', 'price per ltr', 'rating'], axis=1, inplace=True)  # dropping all the columns we got data from
def flavours(series):
    """Flatten an iterable of note lists into a single pandas Series.

    Each element of ``series`` is itself iterated and its items are appended
    in order, so the result holds one entry per individual note.
    """
    flattened = []
    for note_list in series:
        flattened.extend(note_list)
    return pd.Series(flattened)
# Relative frequency of every individual note across all gins; only the 20
# most common notes are kept and drawn as a horizontal bar chart.
note_shares = flavours(df['Notes']).value_counts(normalize=True)
flavour_profiles = note_shares.head(20)
flavour_profiles.plot(kind='barh')
#Converting the notes column into a separate dataframe and renaming the first 3 notes
# Each list in 'Notes' becomes one row; positions 0, 1 and 2 get readable
# column names, any further positions keep their integer labels.
split_df = pd.DataFrame(df['Notes'].tolist()).rename(
    columns={0: 'Primary', 1: 'Secondary', 2: 'Tertiary'}
)
#Attaching the new dataframe to our existing dataframe
table = pd.concat((df, split_df), axis='columns')
#A simple function to pick gins based on their notes. (The note has to exist in the top 3)
def pick_gin(Primary_note,Secondary_note,Tertiary_note = None):
    """Return the gins from ``table`` that match any of the given notes.

    Matching is positional: ``Primary_note`` is compared with the 'Primary'
    column, ``Secondary_note`` with 'Secondary' and ``Tertiary_note`` with
    'Tertiary'. A row is selected when at least one comparison is true.

    Args:
        Primary_note: value to look for in the 'Primary' column.
        Secondary_note: value to look for in the 'Secondary' column.
        Tertiary_note: optional value to look for in the 'Tertiary' column;
            when omitted (``None``) the 'Tertiary' column is not consulted.

    Returns:
        The matching rows of the module-level ``table``, limited to its
        first nine columns.
    """
    mask = (table['Primary'] == Primary_note) | (table['Secondary'] == Secondary_note)
    if Tertiary_note is not None:
        mask = mask | (table['Tertiary'] == Tertiary_note)
    # The indexer has to follow `.loc` directly; with the brackets on a
    # separate line `combo` was the `.loc` accessor itself and `combo.iloc`
    # raised AttributeError.
    combo = table.loc[mask]
    return combo.iloc[:, 0:9]
# Finding the best bottles: score above 4 backed by more than 20 reviews,
# shown with the first nine columns only.
table[table['score'].gt(4) & table['Number of reviews'].gt(20)].iloc[:, :9]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment