Last active
September 3, 2022 17:07
-
-
Save JarrydWannenburg/382d872826d2f37697ac1f4be7ab1a00 to your computer and use it in GitHub Desktop.
Google_News_Extraction_Article
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The newsAPI pull objects carry the cleanest published date, so collect
# those up front; slicing [:10] keeps only the YYYY-MM-DD portion.
wells_fargo_publishedAt = [entry['publishedAt'][:10] for entry in wells_fargo]
# Create our function that takes a list of Article objects and returns one dataframe | |
def get_details(article_list, published_dates=None):
    """Collect details from parsed Article objects into one DataFrame.

    Parameters
    ----------
    article_list : list
        Parsed Article objects (newspaper3k-style) exposing ``.title``,
        ``.authors``, ``.url``, ``.text``, ``.meta_data``,
        ``.meta_keywords``, ``.keywords``, ``.meta_description`` and
        ``.summary``.
    published_dates : list of str, optional
        Publication dates (one per article, ``YYYY-MM-DD``). Defaults to
        the module-level ``wells_fargo_publishedAt`` list, preserving the
        original script's behavior.

    Returns
    -------
    pandas.DataFrame
        One row per article with Title, Description, Published, Keywords,
        Content, Authors and URL columns.
    """
    if published_dates is None:
        # Backward-compatible fallback to the list built from the newsAPI pull.
        published_dates = wells_fargo_publishedAt

    # Initialize empty lists used to assemble the dataframe columns.
    titles = []
    urls = []
    authors = []
    keywords = []
    descriptions = []
    content = []

    for article in article_list:
        titles.append(article.title)
        authors.append(article.authors)
        urls.append(article.url)
        content.append(article.text)

        # Keywords live in three places; the 'news_keywords' meta tag is the
        # best source, then meta_keywords, then NLP-derived keywords.
        # .get() avoids a KeyError when the meta tag is missing entirely.
        news_kw = article.meta_data.get('news_keywords', '')
        if len(news_kw) > 0:
            keywords.append(news_kw.split(', '))
        # Guard the [0] index: an empty meta_keywords list would otherwise
        # raise IndexError. "No meta keywords" usually comes back as [''],
        # whose first element has length 0.
        elif len(article.meta_keywords) > 0 and len(article.meta_keywords[0]) > 0:
            keywords.append(article.meta_keywords)
        elif len(article.keywords) > 0:
            keywords.append(article.keywords)
        else:
            keywords.append([])

        # Descriptions also have three sources; preference order is
        # meta_description, then the meta_data 'description' entry, then
        # the NLP summary. Again .get() guards against a missing key.
        if len(article.meta_description) > 0:
            descriptions.append(article.meta_description)
        elif len(article.meta_data.get('description', '')) > 0:
            descriptions.append(article.meta_data['description'])
        elif len(article.summary) > 0:
            descriptions.append(article.summary)
        else:
            descriptions.append('')

    # Assemble the per-article columns into a dataframe.
    article_dict = {'Title': titles,
                    'Description': descriptions,
                    'Published': published_dates,
                    'Keywords': keywords,
                    'Content': content,
                    'Authors': authors,
                    'URL': urls}
    return pd.DataFrame(article_dict)
# Run the extraction over the Wells Fargo articles and preview the result.
wells_fargo_df = get_details(wells_fargo_articles)
wells_fargo_df.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment