Created
October 27, 2023 03:04
-
-
Save ryandhubbard/c8b38c71aad364c0bca13fe6dc29fea6 to your computer and use it in GitHub Desktop.
This script automatically scrapes app reviews from the iOS App Store and the Android (Google Play) store. It emails you the results and shows any changes within the last 7 days to the top reviews displayed in each app store.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
from IPython.display import HTML | |
import smtplib | |
from email.mime.multipart import MIMEMultipart | |
from email.mime.application import MIMEApplication | |
from email.mime.text import MIMEText | |
import smtplib | |
from smtplib import SMTPException | |
import xlwt | |
from xlwt import Workbook | |
from datetime import datetime, timedelta | |
def find_between(s, first, last):
    """Return the part of ``s`` enclosed by ``first`` and ``last``.

    The match starts right after the first occurrence of ``first`` and stops
    at the first occurrence of ``last`` found after that point.  When either
    delimiter is missing an empty string is returned.
    """
    try:
        begin = s.index(first)
        begin += len(first)
        stop = s.index(last, begin)
    except ValueError:
        # str.index signals "not found" with ValueError.
        return ""
    return s[begin:stop]
def find_between_r(s, first, last):
    """Return the part of ``s`` enclosed by the rightmost delimiters.

    The match starts right after the last occurrence of ``first`` and stops
    at the last occurrence of ``last`` located at or after that point.  When
    either delimiter is missing an empty string is returned.
    """
    try:
        begin = s.rindex(first)
        begin += len(first)
        stop = s.rindex(last, begin)
    except ValueError:
        # str.rindex signals "not found" with ValueError.
        return ""
    return s[begin:stop]
def google_reviews(url, timeout=30):
    """Scrape the reviews shown on a Google Play listing page.

    Args:
        url: Address of the Google Play page to download.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        A ``(reviews, current_review_rating, url)`` tuple where ``reviews``
        is a list of dicts with the keys ``app``, ``name``, ``date``,
        ``message``, ``helpful_count``, ``score`` and ``agg_score``.  Fields
        whose markers are not found in the page are empty strings.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): the class names below are hard-coded and presumably tied
    # to the current Play Store markup -- verify them if results come back empty.
    current_review_rating = find_between(str(soup), '<div class="jILTFe">', '</div>')

    reviews = []
    for review_div in soup.find_all("div", {"class": "EGFGHd"}):
        text = str(review_div)
        reviews.append({
            'app': url,
            'name': find_between(text, '<div class="X5PpBb">', '</div>'),
            'date': find_between(text, '<span class="bp9Aid">', '</span>'),
            'message': find_between(text, '<div class="h3YV2d">', '</div>'),
            'helpful_count': find_between(text, '<div class="AJTPZc" jsname="J0d7Yd">', '</div>'),
            'score': find_between(text, 'aria-label="Rated', ' stars out of five stars'),
            'agg_score': current_review_rating,
        })
    return reviews, current_review_rating, url
def apple_reviews(url, timeout=30):
    """Scrape the reviews shown on an App Store listing page.

    Args:
        url: Address of the App Store listing; ``?see-all=reviews`` is
            appended before the page is downloaded.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        A ``(reviews, current_review_rating, url)`` tuple where ``url`` is
        the address actually fetched (including the appended query string)
        and ``reviews`` is a list of dicts with the keys ``app``, ``name``,
        ``date``, ``message``, ``developer_response``, ``subject``,
        ``score``, ``store`` and ``agg_score``.  Fields whose markers are not
        found in the page are empty strings.
    """
    url = url + '?see-all=reviews'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    current_review_rating = find_between(
        str(soup),
        '<span class="we-customer-ratings__averages__display">',
        '</span>',
    )

    reviews = []
    for review_div in soup.find_all("div", {"class": "we-customer-review"}):
        text = str(review_div)

        # find() returns None when the review has no title element; fall back
        # to an empty subject instead of failing on ``None.text``.
        title_tag = review_div.find("h3", {"class": "we-customer-review__title"})
        if title_tag is None:
            subjects = ""
        else:
            subjects = find_between(title_tag.text, '\n ', '\n')

        # The developer reply is wrapped in its own blockquote; isolate it
        # first, then pull the paragraph text out of it.
        developer_response = find_between(text, 'Developer Response', '<!-- --></blockquote>')

        reviews.append({
            'app': url,
            'name': find_between(text, '<span class="we-truncate we-truncate--single-line we-customer-review__user" dir="ltr">\n ', '\n</span>'),
            'date': find_between(text, 'datetime="', '">'),
            'message': find_between(text, '<p data-test-bidi="" dir="false">', '</p>'),
            'developer_response': find_between(developer_response, '<p data-test-bidi="" dir="false">', '</p>'),
            'subject': subjects,
            'score': find_between(text, '<span class="we-star-rating-stars we-star-rating-stars-', '"></span>'),
            'store': 'ios',
            'agg_score': current_review_rating,
        })
    return reviews, current_review_rating, url
# App Store listing pages to scrape.
ios = [
    'https://apps.apple.com/us/app/temu-shop-like-a-billionaire/id1641486558',
    'https://apps.apple.com/us/app/chatgpt/id6448311069',
    'https://apps.apple.com/us/app/google/id284815942',
]

# Google Play listing pages to scrape.
android = [
    'https://play.google.com/store/apps/details?id=com.zhiliaoapp.musically&hl=en_US&gl=US',
    'https://play.google.com/store/apps/details?id=com.google.android.apps.subscriptions.red&hl=en_US&gl=US',
    'https://play.google.com/store/apps/details?id=com.espn.score_center&hl=en_US&gl=US',
]

# Accumulators filled by get_ios_reviews() / get_android_reviews().
ios_reviews = list()
android_review = list()
def get_ios_reviews():
    """Scrape every URL in the module-level ``ios`` list.

    Blank entries are skipped.  The tuple returned by ``apple_reviews`` for
    each remaining URL is appended to the module-level ``ios_reviews`` list.
    """
    for url in ios:
        if url:
            ios_reviews.append(apple_reviews(url))
def get_android_reviews():
    """Scrape every URL in the module-level ``android`` list.

    Blank entries are skipped.  The tuple returned by ``google_reviews`` for
    each remaining URL is appended to the module-level ``android_review`` list.
    """
    for url in android:
        if url:
            android_review.append(google_reviews(url))
get_ios_reviews()
get_android_reviews()
# %%
# Each scraper result is a tuple whose first element is the list of review
# dicts; flatten those lists into one table per store.
df2 = pd.DataFrame([review for app_reviews in android_review for review in app_reviews[0]])
df3 = pd.DataFrame([review for app_reviews in ios_reviews for review in app_reviews[0]])
# %%
# Index by app / aggregate score / reviewer name; the index is written to the
# sheet because to_excel is called without index=False.
df5 = df2.set_index(['app', 'agg_score', 'name'])
df6 = df3.set_index(['app', 'agg_score', 'name'])
# The context manager saves and closes the workbook even if a sheet fails to write.
with pd.ExcelWriter('Top_App_Reviews.xlsx', engine='xlsxwriter') as writer:
    df5.to_excel(writer, sheet_name='Android')
    df6.to_excel(writer, sheet_name='iOS')
# %%
# Reviews dated after this moment count as "recent".
cutoff = datetime.today() - timedelta(days=7)
# df6 holds the iOS reviews (dates parsed with an explicit timestamp format);
# df5 holds the Android reviews (no format given, so pandas infers it).
recent_ios = df6[pd.to_datetime(df6['date'], format='%Y-%m-%dT%H:%M:%S.%fZ') > cutoff]
recent_Android = df5[pd.to_datetime(df5['date']) > cutoff]
# %%
result = recent_ios.to_html()
result2 = recent_Android.to_html()
message = """<span STYLE="font-weight:bold">Recent iOS Reviews: </span></br>"""
if recent_ios.empty:
    message += 'No New Top iOS Reviews in last 7 Days'
else:
    message += result
# Emit the same bold Android heading whether or not the iOS table is empty.
message += ' </br> </br> <span STYLE="font-weight:bold">Recent Android Reviews:</span></br>'
if recent_Android.empty:
    message += 'No Recent Top Android Reviews in last 7 Days'
else:
    message += result2
message += """
</br> </br></br> </br></br> </br>This is an automated report.
"""
# Build the email: HTML body plus the Excel workbook as an attachment.
recipients = ['example@example.com']
msg = MIMEMultipart()
msg['Subject'] = 'Top App Reviews'
msg['From'] = 'reviews@example.com'
msg['To'] = ', '.join(recipients)
# Add body to email
body_part = MIMEText(message, 'html')
msg.attach(body_part)
# Open and read the workbook in binary mode
with open('Top_App_Reviews.xlsx', 'rb') as file:
    attachment = MIMEApplication(file.read(), Name='Top_App_Reviews.xlsx')
# Mark the part as a downloadable attachment and give it a file name.
attachment.add_header('Content-Disposition', 'attachment', filename='Top_App_Reviews.xlsx')
msg.attach(attachment)
try:
    # The context manager issues QUIT and closes the connection even when
    # sending fails.
    with smtplib.SMTP('smtp.domain.com') as smtp_obj:
        # Convert the message to a string and send it; the envelope
        # recipients are passed as a list, one address per entry.
        smtp_obj.sendmail(msg['From'], recipients, msg.as_string())
except SMTPException as e:
    # Only SMTPResponseException subclasses carry smtp_code / smtp_error;
    # fall back to the exception text for the other SMTP errors.
    error_code = getattr(e, 'smtp_code', None)
    error_message = getattr(e, 'smtp_error', str(e))
    print(error_message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment