Last active
September 7, 2020 12:27
-
-
Save PandaWhoCodes/05ef3fd7d0607865e90084192651d5b5 to your computer and use it in GitHub Desktop.
Get all items in an RSS feed. You can set the MAX ID higher to get more items from the feeds
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests | |
import feedparser | |
import time | |
import requests | |
def parse_rss_feed(url):
    """Fetch an RSS feed and flatten its entries into a DataFrame.

    Parameters
    ----------
    url : str
        The feed URL to download and parse.

    Returns
    -------
    pandas.DataFrame
        One row per feed entry (``pd.json_normalize`` of ``entries``).

    Raises
    ------
    RuntimeError
        If the feed cannot be parsed after all retry attempts.
    """
    news_feed = None
    # Retry up to 3 times (the original looped 4 times despite its comment,
    # and used a bare `except:` that swallowed everything including
    # KeyboardInterrupt).
    for attempt in range(3):
        try:
            news_feed = feedparser.parse(url)
            break
        except Exception as exc:
            print("ERROR calling URL:", url, "iter: ", (attempt + 1), "-", exc)
    if news_feed is None:
        # Original code fell through to a NameError here when every attempt
        # failed; fail explicitly instead so callers see a clear error.
        raise RuntimeError("Failed to parse feed after 3 attempts: {}".format(url))
    # Flatten the (possibly nested) entry dicts into tabular form.
    return pd.json_normalize(news_feed.entries)
def get_all_posts(max_id, rss_url):
    """Collect items from a paged RSS feed into a single DataFrame.

    Pages ``{rss_url}/?paged=1/`` .. ``{rss_url}/?paged=max_id/`` are fetched
    in order; the loop stops early at the first non-200 response. Set
    ``max_id`` higher to pull more items from the feed.

    Parameters
    ----------
    max_id : int
        Maximum number of pages to fetch.
    rss_url : str
        Base feed URL (without the ``?paged=`` query).

    Returns
    -------
    pandas.DataFrame
        All entries concatenated, newest-fetched pages first. Post links are
        available under ``df["link"]``; print ``df.columns`` to see all
        available columns.
    """
    final_df = pd.DataFrame()
    for page in range(1, max_id + 1):
        # Build the URL once instead of formatting it twice per iteration.
        page_url = "{0}/?paged={1}/".format(rss_url, page)
        if requests.get(page_url).status_code != 200:
            # Bug fix: report the page we actually stopped at, not max_id.
            print("got 404, count at: {}".format(page))
            break
        print("Got page number: {}".format(page))
        df = parse_rss_feed(page_url)
        # Prepend so later pages end up below earlier ones, as before.
        final_df = pd.concat([df, final_df], ignore_index=True)
    return final_df
if __name__ == "__main__":
    # Fetch up to 10 pages of the feed and print every post's link.
    page_limit = 10
    all_posts = get_all_posts(page_limit, "https://www.doraithodla.com/feed")
    print(all_posts["link"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment