# AkhilRD/gin-scraping.py

## gin-scraping.py
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Site root; prepended to each href collected from the listing pages.
baseurl = "https://www.thewhiskyexchange.com/"
url = "https://www.thewhiskyexchange.com/c/338/gin"

# Fetch the first listing page once and print its product grid items.
source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')

productlist = soup.find_all('li', {'class': 'product-grid__item'})
print(productlist)

# Walk listing pages 1-12 (psize=60 per request) and collect every link
# found inside a product grid item.
product_links = []
for page in range(1, 13):
    # The argument list has to open on the same line as `requests.get`;
    # with it on the next line, `source` was bound to the function itself
    # and `source.content` raised AttributeError.
    source = requests.get(
        f'https://www.thewhiskyexchange.com/c/338/gin?pg={page}&psize=60')
    soup = BeautifulSoup(source.content, 'lxml')
    productlist = soup.find_all('li', {'class': 'product-grid__item'})

    for item in productlist:
        for link in item.find_all("a", href=True):
            # NOTE(review): baseurl ends with "/"; if the hrefs also start
            # with "/", the result contains "//" - confirm the site accepts it.
            product_links.append(baseurl + link['href'])

print(product_links)
print(len(product_links))

# Prototype on one product page (Bombay Sapphire) to work out which element
# holds each field before looping over every collected link.

r = requests.get("https://www.thewhiskyexchange.com/p/2249/bombay-sapphire-london-dry-gin")
soup = BeautifulSoup(r.content, 'lxml')

name = soup.find("h1", {"class": "product-main__name"}).text

# find() returns None when there is no review block; fall back to 'N/A'.
rating_temp = soup.find("div", {"class": "review-overview"})
if rating_temp:
    rating = rating_temp.text
else:
    rating = 'N/A'

price = soup.find('p', {"class": "product-action__price"}).text.strip()
price_ltr = soup.find('p', {"class": 'product-action__unit-price'}).text
abv = soup.find('p', {"class": 'product-main__data'}).text.strip()
country = soup.find(attrs={"class": "product-facts__data"}).text.strip()
description = soup.find(attrs={"class": 'product-main__description'}).text.strip()

# One string per flavour-profile label on the page.
notes = [
    label.get_text()
    for label in soup.find_all('span', attrs={'class': 'flavour-profile__label'})
]

# Record layout reused for every product.
Gins = {
    "name": name,
    "cl/abv": abv,
    "price": price,
    "price per ltr": price_ltr,
    'rating': rating,
    "Notes": notes,
    "country": country,
    "description": description,
}
print(Gins)

# Building the core of our scraper: request every collected product link and
# store one record (dict) per product in Gin_List.
Gin_List = []

# `headers` was passed to requests.get() below but never defined in this
# file, which raised NameError on the first request.
# NOTE(review): the User-Agent value is a placeholder - confirm what is needed.
headers = {"User-Agent": "Mozilla/5.0"}

for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    try:
        name = soup.find("h1", class_="product-main__name").text.strip()
        price = soup.find('p', class_="product-action__price").text.strip()
        price_ltr = soup.find('p', class_='product-action__unit-price').text
        # find() returns None when there is no review block; use 'N/A' then.
        rating_temp = soup.find("div", class_="review-overview")
        rating = rating_temp.text.strip() if rating_temp else 'N/A'
        abv = soup.find('p', class_='product-main__data').text.strip()
        notes = [x.get_text() for x in soup.find_all(
            'span', attrs={'class': 'flavour-profile__label'})]
        country = soup.find(class_="product-facts__data").text.strip()
        # The argument list must open on the same line as `soup.find`;
        # split across lines it was an IndentationError.
        description = soup.find(
            class_='product-main__description').text.strip()
    except AttributeError:
        # A required element was missing (find() returned None). Skip the
        # page: the old bare `except` fell through and appended whatever
        # values were left over from the previous product.
        print("Skipping (missing fields): ", link)
        continue

    # "price per ltr" uses a single space so it matches the column name
    # used when the DataFrame is processed later in this file.
    Gins = {"name": name, "cl/abv": abv, "price": price,
            "price per ltr": price_ltr, 'rating': rating, "Notes": notes,
            "country": country, "description": description}

    Gin_List.append(Gins)
    print("Saving: ", Gins['name'])

#Removing format in the reviews column

# Build the DataFrame from the scraped records first: `df` was used here
# without ever being created.
df = pd.DataFrame(Gin_List)

# Replace each run of three newlines, and any remaining single newline,
# with one space.
df = df.replace('\n\n\n|\n', " ", regex=True)

#Writing a function to extract only numeric objects using regex library

def number(column):
    """Return the digit groups found in *column*, joined with ".".

    Args:
        column: Text to scan, e.g. "£25.95".

    Returns:
        A string such as "25.95"; an empty string when no digits are found.
    """
    # Remove thousands separators ("1,250" -> "1250") so the two halves are
    # not later joined as if the comma were a decimal point.
    cleaned = re.sub(r'(?<=\d),(?=\d{3}\b)', '', column)
    # `\d+` also keeps single-digit groups; the previous `\d[0-9]+` needed
    # at least two digits and turned "£9.95" into "95".
    return ".".join(re.findall(r'\d+', cleaned))

#Extracting numbers using a combination of above function and converting them to integer or float dtypes

# Regex patterns are raw strings: `\d`, `\.` and `\(` inside ordinary string
# literals are invalid escape sequences that Python warns about.
df['price in £'] = df['price'].apply(number).astype(float)
# NOTE(review): this column stays as strings (no astype) - confirm intended.
df['price per ltr in £'] = df['price per ltr'].apply(number)
# number() joins digit groups with "."; [3:5] keeps characters 3-4 of that
# string - presumably the ABV digits after a two-digit cl value, TODO confirm.
df['abv'] = df['cl/abv'].apply(number).str[3:5]
# Concatenate the leading digit and the ".digits" match into one float.
df['score'] = df['rating'].str.extractall(r'(^\d|\.\d+)').unstack().fillna('').sum(axis=1).astype(float)
# Inner capture group (column 1) holds the digits that follow "(".
df['Number of reviews'] = df['rating'].str.extractall(r'(\((\d+))')[1].unstack().fillna('').sum(axis=1).astype(int)

#Dropping the columns we do not need

df.drop(['cl/abv', 'price', 'price per ltr', 'rating'], axis=1, inplace=True)  # dropping all the columns we got data from

def flavours(series):
    """Flatten an iterable of lists into a single pandas Series.

    Args:
        series: Iterable whose elements are themselves iterables of values.

    Returns:
        A pd.Series holding every inner value, in the original order.
    """
    flattened = []
    for inner in series:
        flattened.extend(inner)
    return pd.Series(flattened)

# Relative frequency of every note across all rows; keep the first 20 and
# draw them as a horizontal bar chart.
flavour_profiles = flavours(df['Notes']).value_counts(normalize=True).iloc[:20]
flavour_profiles.plot(kind='barh')

#Converting the notes column into a separate dataframe and renaming the first 3 notes

split_df = pd.DataFrame(list(df['Notes'])).rename(
    columns={0: 'Primary', 1: 'Secondary', 2: 'Tertiary'})

#Attaching the new dataframe to our existing dataframe

table = pd.concat((df, split_df), axis='columns')

#A simple function to pick gins based on their notes. (The note has to exist in the top 3)

def pick_gin(Primary_note, Secondary_note, Tertiary_note=None):
    """Select rows of the module-level `table` by their first three notes.

    A row is kept when its 'Primary' column equals Primary_note, or its
    'Secondary' column equals Secondary_note, or its 'Tertiary' column
    equals Tertiary_note.

    Args:
        Primary_note: Value compared against the 'Primary' column.
        Secondary_note: Value compared against the 'Secondary' column.
        Tertiary_note: Value compared against the 'Tertiary' column.

    Returns:
        The first nine columns of the matching rows.
    """
    matches = (
        (table['Primary'] == Primary_note)
        | (table['Secondary'] == Secondary_note)
        | (table['Tertiary'] == Tertiary_note)
    )
    # The indexer must follow `table.loc` directly; on its own indented
    # line it was an IndentationError.
    combo = table.loc[matches]
    return combo.iloc[:, 0:9]

#Finding the best bottles: score above 4 with more than 20 reviews
table[(table['score'] > 4) & (table['Number of reviews'] > 20)].iloc[:, :9]
	from bs4 import BeautifulSoup
	import requests
	import re
	import pandas as pd

	# Site root; prepended to each href collected from the listing pages.
	baseurl = "https://www.thewhiskyexchange.com/"
	url = "https://www.thewhiskyexchange.com/c/338/gin"

	# Fetch the first listing page once and print its product grid items.
	source = requests.get(url)
	soup = BeautifulSoup(source.content, 'lxml')

	productlist = soup.find_all('li', {'class': 'product-grid__item'})
	print(productlist)

	# Walk listing pages 1-12 (psize=60 per request) and collect every link
	# found inside a product grid item. Loop bodies are indented again; they
	# had been flattened to the same level as the `for` lines.
	product_links = []
	for page in range(1, 13):
		# The argument list has to open on the same line as `requests.get`,
		# otherwise `source` is bound to the function itself.
		source = requests.get(
			f'https://www.thewhiskyexchange.com/c/338/gin?pg={page}&psize=60')
		soup = BeautifulSoup(source.content, 'lxml')
		productlist = soup.find_all('li', {'class': 'product-grid__item'})

		for item in productlist:
			for link in item.find_all("a", href=True):
				product_links.append(baseurl + link['href'])

	print(product_links)
	print(len(product_links))

	# Prototype on one product page (Bombay Sapphire) to work out which
	# element holds each field before looping over every collected link.

	r = requests.get("https://www.thewhiskyexchange.com/p/2249/bombay-sapphire-london-dry-gin")
	soup = BeautifulSoup(r.content, 'lxml')

	name = soup.find("h1", {"class": "product-main__name"}).text

	# find() returns None when there is no review block; fall back to 'N/A'.
	rating_temp = soup.find("div", {"class": "review-overview"})
	rating = 'N/A' if not rating_temp else rating_temp.text

	price = soup.find('p', {"class": "product-action__price"}).text.strip()
	price_ltr = soup.find('p', {"class": 'product-action__unit-price'}).text
	abv = soup.find('p', {"class": 'product-main__data'}).text.strip()
	country = soup.find(attrs={"class": "product-facts__data"}).text.strip()
	description = soup.find(attrs={"class": 'product-main__description'}).text.strip()

	# One string per flavour-profile label on the page.
	notes = [
		label.get_text()
		for label in soup.find_all('span', attrs={'class': 'flavour-profile__label'})
	]

	# Record layout reused for every product.
	Gins = {
		"name": name,
		"cl/abv": abv,
		"price": price,
		"price per ltr": price_ltr,
		'rating': rating,
		"Notes": notes,
		"country": country,
		"description": description,
	}
	print(Gins)

	# Building the core of our scraper: request every collected product link
	# and store one record (dict) per product in Gin_List.
	Gin_List = []

	# `headers` was passed to requests.get() below but never defined in this
	# file, which raised NameError on the first request.
	# NOTE(review): the User-Agent value is a placeholder - confirm what is needed.
	headers = {"User-Agent": "Mozilla/5.0"}

	for link in product_links:
		r = requests.get(link, headers=headers)
		soup = BeautifulSoup(r.content, 'lxml')

		try:
			name = soup.find("h1", class_="product-main__name").text.strip()
			price = soup.find('p', class_="product-action__price").text.strip()
			price_ltr = soup.find('p', class_='product-action__unit-price').text
			# find() returns None when there is no review block; use 'N/A' then.
			rating_temp = soup.find("div", class_="review-overview")
			rating = rating_temp.text.strip() if rating_temp else 'N/A'
			abv = soup.find('p', class_='product-main__data').text.strip()
			notes = [x.get_text() for x in soup.find_all(
				'span', attrs={'class': 'flavour-profile__label'})]
			country = soup.find(class_="product-facts__data").text.strip()
			# The argument list must open on the same line as `soup.find`.
			description = soup.find(
				class_='product-main__description').text.strip()
		except AttributeError:
			# A required element was missing (find() returned None). Skip the
			# page instead of appending values left over from the previous
			# product, which is what the bare `except` allowed.
			print("Skipping (missing fields): ", link)
			continue

		Gins = {"name": name, "cl/abv": abv, "price": price,
				"price per ltr": price_ltr, 'rating': rating, "Notes": notes,
				"country": country, "description": description}

		Gin_List.append(Gins)
		print("Saving: ", Gins['name'])

	#Removing format in the reviews column

	# Build the DataFrame from the scraped records first: `df` was used here
	# without ever being created.
	df = pd.DataFrame(Gin_List)

	# Replace each run of three newlines, and any remaining single newline,
	# with one space. The pipe is a plain regex alternation; written as `\|`
	# it only matched a literal "|" character.
	df = df.replace('\n\n\n|\n', " ", regex=True)

	#Writing a function to extract only numeric objects using regex library

	def number(column):
		"""Return the digit groups found in *column*, joined with ".".

		Args:
			column: Text to scan, e.g. "£25.95".

		Returns:
			A string such as "25.95"; an empty string when no digits are found.
		"""
		# Remove thousands separators ("1,250" -> "1250") so the two halves
		# are not later joined as if the comma were a decimal point.
		cleaned = re.sub(r'(?<=\d),(?=\d{3}\b)', '', column)
		# `\d+` also keeps single-digit groups; the previous `\d[0-9]+`
		# needed at least two digits and turned "£9.95" into "95".
		return ".".join(re.findall(r'\d+', cleaned))

	#Extracting numbers using a combination of above function and converting them to integer or float dtypes

	# Regex patterns are raw strings, and the score pattern uses a plain `|`
	# alternation: written as `\|` it matched a literal pipe, so the leading
	# digit and the decimals were never captured separately.
	df['price in £'] = df['price'].apply(number).astype(float)
	# NOTE(review): this column stays as strings (no astype) - confirm intended.
	df['price per ltr in £'] = df['price per ltr'].apply(number)
	# number() joins digit groups with "."; [3:5] keeps characters 3-4 of that
	# string - presumably the ABV digits after a two-digit cl value, TODO confirm.
	df['abv'] = df['cl/abv'].apply(number).str[3:5]
	# Concatenate the leading digit and the ".digits" match into one float.
	df['score'] = df['rating'].str.extractall(r'(^\d|\.\d+)').unstack().fillna('').sum(axis=1).astype(float)
	# Inner capture group (column 1) holds the digits that follow "(".
	df['Number of reviews'] = df['rating'].str.extractall(r'(\((\d+))')[1].unstack().fillna('').sum(axis=1).astype(int)

	#Dropping the columns we do not need

	df.drop(['cl/abv', 'price', 'price per ltr', 'rating'], axis=1, inplace=True)  # dropping all the columns we got data from

	def flavours(series):
		"""Flatten an iterable of lists into a single pandas Series.

		Args:
			series: Iterable whose elements are themselves iterables of values.

		Returns:
			A pd.Series holding every inner value, in the original order.
		"""
		# The body is indented under the `def` again; at the same level as
		# the header it was an IndentationError.
		return pd.Series([x for _list in series for x in _list])

	# Relative frequency of every note across all rows; keep the first 20 and
	# draw them as a horizontal bar chart.
	flavour_profiles = flavours(df['Notes']).value_counts(normalize=True).iloc[:20]
	flavour_profiles.plot(kind='barh')

	#Converting the notes column into a separate dataframe and renaming the first 3 notes

	split_df = pd.DataFrame(list(df['Notes'])).rename(
		columns={0: 'Primary', 1: 'Secondary', 2: 'Tertiary'})

	#Attaching the new dataframe to our existing dataframe

	table = pd.concat((df, split_df), axis='columns')

	#A simple function to pick gins based on their notes. (The note has to exist in the top 3)

	def pick_gin(Primary_note, Secondary_note, Tertiary_note=None):
		"""Select rows of the module-level `table` by their first three notes.

		A row is kept when its 'Primary' column equals Primary_note, or its
		'Secondary' column equals Secondary_note, or its 'Tertiary' column
		equals Tertiary_note.

		Args:
			Primary_note: Value compared against the 'Primary' column.
			Secondary_note: Value compared against the 'Secondary' column.
			Tertiary_note: Value compared against the 'Tertiary' column.

		Returns:
			The first nine columns of the matching rows.
		"""
		# Plain `|` combines the masks; the backslash-escaped form was a
		# syntax error outside a string literal.
		matches = (
			(table['Primary'] == Primary_note)
			| (table['Secondary'] == Secondary_note)
			| (table['Tertiary'] == Tertiary_note)
		)
		# The indexer must follow `table.loc` directly on the same line.
		combo = table.loc[matches]
		return combo.iloc[:, 0:9]

	#Finding the best bottles: score above 4 with more than 20 reviews
	table[(table['score'] > 4) & (table['Number of reviews'] > 20)].iloc[:, :9]