Skip to content

Instantly share code, notes, and snippets.

@khodjaevsh
Last active February 5, 2020 15:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save khodjaevsh/88273c2c5b84cb6904d2b0eb54844d27 to your computer and use it in GitHub Desktop.
Save khodjaevsh/88273c2c5b84cb6904d2b0eb54844d27 to your computer and use it in GitHub Desktop.
import pandas as pd
from bs4 import BeautifulSoup
import requests
from time import sleep
import datetime
def clean_string(column):
    """Strip layout whitespace from every string in a pandas Series.

    For each value the first two newline characters are removed, then every
    space character is removed. Missing values (``None``/``NaN``) stay
    missing instead of raising ``AttributeError``.

    Args:
        column: pandas Series holding strings (missing values allowed).

    Returns:
        A new Series with the same index containing the cleaned strings.
    """
    # regex=False keeps both patterns literal; n=2 limits the newline
    # removal to the first two occurrences, like str.replace(old, new, 2).
    without_newlines = column.str.replace("\n", "", n=2, regex=False)
    # NOTE(review): dropping every space also joins the words of multi-word
    # text together -- confirm that this is intended for review bodies.
    return without_newlines.str.replace(" ", "", regex=False)
def scrape_reviews(PATH, n_pages, sleep_time = 0.3, timeout = 10):
    """Scrape review listing pages into a pandas DataFrame.

    For every page index ``p`` in ``range(n_pages)`` the URL
    ``f'{PATH}{p}&stars=1&stars=5'`` is downloaded and parsed. For each
    review found on the page, the reviewer's profile page on
    ``https://www.trustpilot.com`` is downloaded as well to read the
    reviewer's location.

    Args:
        PATH: URL prefix to which the page index is appended
            (presumably ends with something like ``page=`` -- confirm
            against the caller).
        n_pages: Number of listing pages to request; the indices sent are
            ``0 .. n_pages - 1``.
        sleep_time: Seconds to pause before each listing-page request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        DataFrame with the columns ``Name``, ``Header``, ``Review``,
        ``Rating`` (integer), ``Date`` (datetime) and ``Location``. The
        frame is empty when no reviews were found.

    Raises:
        requests.RequestException: If a request fails or times out, or a
            listing page answers with an HTTP error status.
    """
    rows = []
    # One session reuses the underlying connection for the many requests
    # that are sent to the same host.
    with requests.Session() as session:
        for p in range(n_pages):
            sleep(sleep_time)
            http = session.get(f'{PATH}{p}&stars=1&stars=5', timeout=timeout)
            # Fail loudly rather than parsing an error page as "no reviews".
            http.raise_for_status()
            bsoup = BeautifulSoup(http.text, 'html.parser')
            review_containers = bsoup.find_all('div', class_ = 'review-info__body')
            user_containers = bsoup.find_all('div', class_ = 'consumer-info__details')
            rating_container = bsoup.find_all('div', class_ = "review-info__header__verified")
            date_container = bsoup.find_all('div', class_ = "header__verified__date")
            profile_link_containers = bsoup.find_all('aside', class_ = 'content-section__consumer-info')
            # Walk the per-review containers in lockstep. (len(bsoup) is the
            # number of top-level nodes of the document, not the number of
            # reviews.) zip stops at the shortest list, so only reviews for
            # which every piece was found are recorded.
            for review_c, reviewer, rating, date, prof in zip(
                    review_containers, user_containers, rating_container,
                    date_container, profile_link_containers):
                header = review_c.h2.a.text
                review = review_c.p.text
                name = reviewer.h3.text
                # Character 12 of the second CSS class -- presumably the
                # class looks like 'star-rating-N' and this is the star
                # digit; TODO confirm against the page markup.
                stars = rating.div.attrs['class'][1][12]
                # Only the leading YYYY-MM-DD part of the attribute is used.
                day = datetime.datetime.strptime(
                    date.time.attrs['datetime'][0:10], '%Y-%m-%d').date()
                location = _scrape_location(session, prof.a['href'], timeout)
                rows.append((name, header, review, stars, day, location))
    rev_df = pd.DataFrame(
        rows,
        columns = ['Name', 'Header', 'Review', 'Rating', 'Date', 'Location'])
    for text_column in ('Review', 'Name', 'Location'):
        rev_df[text_column] = clean_string(rev_df[text_column])
    # Keep what follows the first comma; values without a comma are kept whole.
    rev_df['Location'] = rev_df['Location'].apply(lambda x: x.split(',', 1)[-1])
    rev_df['Rating'] = rev_df['Rating'].astype('int')
    rev_df['Date'] = pd.to_datetime(rev_df['Date'])
    return rev_df


def _scrape_location(session, profile_href, timeout):
    """Return the location text from a reviewer profile page, or '' if absent."""
    link = 'https://www.trustpilot.com' + profile_href
    c_profile = session.get(link, timeout=timeout)
    csoup = BeautifulSoup(c_profile.text, 'html.parser')
    cust_container = csoup.find('div', class_ = 'user-summary-location')
    # find() returns None when the profile shows no location element.
    if cust_container is None:
        return ''
    return cust_container.text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment