-
-
Save BernardOng/0fd229d579a4900a5d1a54601b97094f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
------------------------------------------------------------------------------ | |
Creator: Bernard Ong | |
Created: Aug 2016 | |
Project: Web Scraping Project | |
Purpose: Scraper Code for Amazon Kindle eBook Site | |
------------------------------------------------------------------------------ | |
''' | |
# import libraries | |
import pandas as pd | |
from rc import ReviewCorpus | |
from amazon import AmazonKindle | |
import sys | |
# get the command line argument for the Amazon search | |
query = sys.argv[1] | |
# initialize the variables to capture price/rating/review (prr) and review corpus | |
prr = [] | |
rc = ReviewCorpus() | |
titles = [] | |
# instantiate class for the Amazon Kindle ebook list based on keywords | |
amz = AmazonKindle(query) | |
# execute the class iterator method | |
# build out the prr collection | |
for a in amz: | |
if not a: pass | |
else: | |
title, price, rating, review = a | |
rc.add(title, rating) | |
prr.append([price, rating, review]) | |
titles.append(rc.cleanupTitle(title)) | |
# build out the ebook list with frequency count of each rating | |
rev_corp = [] | |
for key, val in rc.corpus.iteritems(): | |
key = rc.cleanupTitle(key) | |
if key.strip() != "": rev_corp.append([key] + val) | |
# Convert to dataframe and Export Data to csv files | |
df_revcorp = pd.DataFrame(rev_corp) | |
df_prr = pd.DataFrame(prr) | |
df_titles = pd.DataFrame(titles) | |
# convert dataset to csv format for export readiness | |
rc_data = df_revcorp.to_csv(index=False, header=['title', 'tr1', 'tr2', 'tr3', 'tr4', 'tr5']) | |
pr_data = df_prr.to_csv(index=False, header=['price', 'rating', 'review']) | |
titles_data = df_titles.to_csv(index=False) | |
# opne the data files for writing | |
csv_revcorp = open(query.replace(' ', '_') + '_revcorp.csv', 'w') | |
csv_prr = open(query.replace(' ', '_') + '_prr.csv', 'w') | |
csv_titles = open(query.replace(' ', '_') + '_titles.csv', 'w') | |
# write the entire dataset to file | |
csv_revcorp.write(rc_data) | |
csv_prr.write(pr_data) | |
csv_titles.write(titles_data) | |
# close the files | |
csv_revcorp.close() | |
csv_prr.close() | |
csv_titles.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment