Last active
January 14, 2019 12:13
-
-
Save ShaikeA/926bcc05b0ad526c9f3f904c4d43fea4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Insert every scraped article into the `articles` table.
#
# The statement is parameterized: the values are handed to the DB driver
# separately from the SQL text instead of being spliced in with str.format().
# This keeps titles/abstracts containing quotes from breaking the statement and
# prevents SQL injection through scraped text.
# NOTE(review): `%s` is the placeholder style of the common MySQL drivers
# (PyMySQL, mysqlclient, mysql-connector) - confirm against the driver that
# created `cursor`.
#
# INSERT IGNORE means that duplicates will not be inserted into the DB
# (checked against doi_link).
insert_article_sql = (
    "INSERT IGNORE INTO articles "
    "(doi_link, title, abstract, publication_date, citations) "
    "VALUES (%s, %s, %s, %s, %s);"
)

try:
    # all_articles maps doi_link -> sequence of
    # (title, abstract, publication_date, citations), as used by the indexing below.
    for key, item in all_articles.items():
        cursor.execute(
            insert_article_sql,
            (key, item[0], item[1], item[2], int(item[3])),
        )
    # The loop above could be replaced by a single
    #     cursor.executemany(insert_article_sql, rows)
    # where `rows` is a list of the same 5-tuples. It is kept as a loop so that
    # the rows preceding a malformed record are still executed.
# Catch data-shape errors and log them to the log file.
# ValueError is included because int(item[3]) raises it for a non-numeric
# citation count.
except (KeyError, IndexError, TypeError, ValueError) as err:
    logger.error("There was an error during articles insertion. The error: {}".format(err))
    # Save the scraped articles to a csv file for future inspection.
    df = pd.DataFrame(list(all_articles.values()), index=list(all_articles.keys()))
    df.to_csv("Articles.csv", sep=',')
    # This is for debugging.
    # steps is a dict whose keys are the steps in the scraping process and
    # whose values are 1 or 0 (succeeded or failed).
    steps['Articles'] = 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment