-
-
Save zero731/b02476d54c14745978b96f393a02e230 to your computer and use it in GitHub Desktop.
## Loop through apps to get reviews.
## Assumes the following are defined earlier in the file (outside this chunk):
##   - app_names, app_ids: equal-length iterables of display names / Play Store ids
##   - reviews, Sort: from google_play_scraper
##   - get_localzone: from tzlocal
##   - review_collection: a pymongo collection, e.g.
##       review_collection = app_proj_db['review_collection']
for app_name, app_id in zip(app_names, app_ids):
    # Get start time
    start = dt.datetime.now(tz=get_localzone())
    # BUGFIX: the original format "%m/%d/%y - %T %p" mixed 24-hour time (%T)
    # with an AM/PM marker, printing e.g. "22:40:47 PM". Use a real 12-hour clock.
    fmt = "%m/%d/%y - %I:%M:%S %p"

    # Print starting output for app
    print('---' * 20)
    print('---' * 20)
    print(f'***** {app_name} started at {start.strftime(fmt)}')
    print()

    # Reviews accumulated since the last DB insert
    app_reviews = []
    # Number of reviews to scrape per batch (Play Store API page size)
    count = 200
    # To keep track of how many batches have been completed
    batch_num = 0

    # Retrieve first batch of reviews (and continuation_token)
    rvws, token = reviews(
        app_id,            # found in app's url
        lang='en',         # defaults to 'en'
        country='us',      # defaults to 'us'
        sort=Sort.NEWEST,  # start with most recent
        count=count        # batch size
    )

    # Tag each review dict with the app it came from
    for r in rvws:
        r['app_name'] = app_name  # add key for app's name
        r['app_id'] = app_id      # add key for app's id

    # Add the list of review dicts to overall list
    app_reviews.extend(rvws)

    # Increase batch count by one
    batch_num += 1
    print(f'Batch {batch_num} completed.')

    # Wait 1 to 5 seconds to start next batch (be polite to the API)
    time.sleep(random.randint(1, 5))

    # Record review IDs seen so far, to detect when a batch adds nothing new
    pre_review_ids = []
    for rvw in app_reviews:
        pre_review_ids.append(rvw['reviewId'])

    # Loop through at most max number of batches
    for batch in range(4999):
        rvws, token = reviews(  # store continuation_token
            app_id,
            lang='en',
            country='us',
            sort=Sort.NEWEST,
            count=count,
            # using token obtained from previous batch
            continuation_token=token
        )

        # Collect this batch's review IDs and tag each dict with the app
        new_review_ids = []
        for r in rvws:
            new_review_ids.append(r['reviewId'])
            r['app_name'] = app_name  # add key for app's name
            r['app_id'] = app_id      # add key for app's id

        # Add the list of review dicts to main app_reviews list
        app_reviews.extend(rvws)

        # Increase batch count by one
        batch_num += 1

        # Break loop and stop scraping for current app if most recent batch
        # did not add any unique reviews
        all_review_ids = pre_review_ids + new_review_ids
        if len(set(pre_review_ids)) == len(set(all_review_ids)):
            print(f'No reviews left to scrape. Completed {batch_num} batches.\n')
            break

        # all_review_ids becomes pre_review_ids to check against for next batch
        pre_review_ids = all_review_ids

        # At every 100th batch, flush accumulated reviews to the collection
        if batch_num % 100 == 0:
            # print update on number of batches
            print(f'Batch {batch_num} completed.')

            # BUGFIX: guard against an empty list — pymongo's insert_many
            # raises InvalidOperation on an empty document list
            if app_reviews:
                review_collection.insert_many(app_reviews)

            # print update about num reviews inserted
            store_time = dt.datetime.now(tz=get_localzone())
            print(f"""
            Successfully inserted {len(app_reviews)} {app_name}
            reviews into collection at {store_time.strftime(fmt)}.\n
            """)

            # empty our list for next round of 100 batches
            app_reviews = []

        # Wait 1 to 5 seconds to start next batch
        time.sleep(random.randint(1, 5))

    # Print update when max number of batches has been reached
    # OR when last batch didn't add any unique reviews
    print(f'Done scraping {app_name}.')
    print(f'Scraped a total of {len(set(pre_review_ids))} unique reviews.\n')

    # Insert remaining reviews into collection (guarded: insert_many
    # raises on an empty list when an app yields no new reviews)
    if app_reviews:
        review_collection.insert_many(app_reviews)

    # Get end time
    end = dt.datetime.now(tz=get_localzone())

    # Print ending output for app
    print(f"""
    Successfully inserted all {app_name} reviews into collection
    at {end.strftime(fmt)}.\n
    """)
    print(f'Time elapsed for {app_name}: {end - start}')
    print('---' * 20)
    print('---' * 20)
    print('\n')

    # Wait 1 to 5 seconds to start scraping next app
    time.sleep(random.randint(1, 5))
Does not work at all.
I'd be happy to try and troubleshoot, as this has worked for me and others. Are you running into any specific error messages?
Trying to download reviews of this
https://play.google.com/store/apps/details?id=in.gov.umang.negd.g2c&hl=en_IN&gl=US
modified the script for
app_names = ("UMANG")
app_ids = ("in.gov.umang.negd.g2c")
Errors out into
***** A started at 05/20/21 - 22:40:47 PM
Batch 1 completed.
No reviews left to scrape. Completed 2 batches.
Done scraping A.
Scraped a total of 0 unique reviews.
Traceback (most recent call last):
File "cool.py", line 155, in
review_collection.insert_many(app_reviews)
NameError: name 'review_collection' is not defined
@graizada You forgot to define review_collection. Add the following line before the review-scraping loop:
review_collection = app_proj_db['review_collection']
Also note that app_names = ("UMANG") and app_ids = ("in.gov.umang.negd.g2c") are plain strings, not tuples — parentheses alone don't create a tuple, so zip() iterates over their individual characters. Add a trailing comma: app_names = ("UMANG",) and app_ids = ("in.gov.umang.negd.g2c",).
For a complete guide, follow this link: https://python.plainenglish.io/scraping-storing-google-play-app-reviews-with-python-5640c933c476
Does not work at all.