Skip to content

Instantly share code, notes, and snippets.

@ShaikeA
Last active January 14, 2019 12:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ShaikeA/926bcc05b0ad526c9f3f904c4d43fea4 to your computer and use it in GitHub Desktop.
try:
    # Insert all articles into the articles table.
    # INSERT IGNORE means duplicates are silently skipped by MySQL
    # (duplicates are detected against the doi_link key).
    #
    # Use DB-API parameterized placeholders (%s) instead of str.format:
    # string-built SQL is injection-prone and breaks on any title/abstract
    # that contains a quote character. executemany sends all rows in one
    # batch, replacing the per-row execute loop.
    insert_sql = """INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
                    VALUES (%s, %s, %s, %s, %s);"""
    cursor.executemany(
        insert_sql,
        [(key, item[0], item[1], item[2], int(item[3]))
         for key, item in all_articles.items()],
    )
# Catch data-shape errors (missing keys/indices, non-numeric citations)
# and log them to the logs file.
except (KeyError, IndexError, TypeError) as err:
    logger.error("There was an error during articles insertion. The error: {}".format(err))
    # Save the problematic articles to a CSV file for future inspection.
    df = pd.DataFrame(list(all_articles.values()), index=list(all_articles.keys()))
    df.to_csv("Articles.csv", sep=',')
    # steps is a dict whose keys are the steps in the scraping process and
    # whose values are 1 or 0 (succeeded or failed); mark this step failed.
    steps['Articles'] = 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment