Skip to content

Instantly share code, notes, and snippets.

@JarrydWannenburg
Last active September 3, 2022 17:07
Show Gist options
  • Save JarrydWannenburg/382d872826d2f37697ac1f4be7ab1a00 to your computer and use it in GitHub Desktop.
Google_News_Extraction_Article
# The newsAPI pull object carries the cleanest published date, so collect
# those up front; slicing to the first 10 characters keeps just YYYY-MM-DD.
wells_fargo_publishedAt = [entry['publishedAt'][:10] for entry in wells_fargo]
# Create our function that takes a list of Article objects and returns one dataframe
def get_details(article_list, published_dates=None):
    """Flatten a list of newspaper ``Article`` objects into one DataFrame.

    Parameters
    ----------
    article_list : list
        Parsed newspaper ``Article`` objects; each must expose ``title``,
        ``authors``, ``url``, ``text``, ``meta_data``, ``meta_keywords``,
        ``keywords``, ``meta_description`` and ``summary``.
    published_dates : list of str, optional
        Publication dates (YYYY-MM-DD), one per article, in the same order.
        Defaults to the module-level ``wells_fargo_publishedAt`` list for
        backward compatibility with the original script.

    Returns
    -------
    pandas.DataFrame
        One row per article with columns Title, Description, Published,
        Keywords, Content, Authors and URL.
    """
    # Fall back to the global list only when the caller did not supply dates;
    # the explicit parameter removes the hidden global coupling for new callers.
    publishedAt = wells_fargo_publishedAt if published_dates is None else published_dates

    titles = []
    urls = []
    authors = []
    keywords = []
    descriptions = []
    content = []

    for article in article_list:
        titles.append(article.title)
        authors.append(article.authors)
        urls.append(article.url)
        content.append(article.text)

        # Keywords live in three places; 'news_keywords' meta tag is the best
        # source, then meta_keywords, then NLP-extracted keywords.
        # .get() guards against meta_data lacking the key (newspaper's
        # defaultdict hides this, but a plain dict would raise KeyError).
        news_kw = article.meta_data.get('news_keywords', '')
        if len(news_kw) > 0:
            keywords.append(news_kw.split(', '))
        elif len(article.meta_keywords[0]) > 0:
            # index 0 because "no meta keywords" is [''] and len(['']) == 1
            keywords.append(article.meta_keywords)
        elif len(article.keywords) > 0:
            keywords.append(article.keywords)
        else:
            keywords.append([])

        # Description also has three candidate sources; meta_description is
        # preferred, then the 'description' meta tag, then the NLP summary.
        if len(article.meta_description) > 0:
            descriptions.append(article.meta_description)
        elif len(article.meta_data.get('description', '')) > 0:
            descriptions.append(article.meta_data['description'])
        elif len(article.summary) > 0:
            descriptions.append(article.summary)
        else:
            descriptions.append('')

    # Assemble the per-column lists into a single DataFrame.
    article_dict = {'Title': titles,
                    'Description': descriptions,
                    'Published': publishedAt,
                    'Keywords': keywords,
                    'Content': content,
                    'Authors': authors,
                    'URL': urls}
    return pd.DataFrame(article_dict)
# Run the extraction over the pulled articles and preview the result.
wells_fargo_df = get_details(wells_fargo_articles)
wells_fargo_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment