Get all items in an RSS feed. You can set max_id higher to get more items from the feed.
import pandas as pd
import requests
import feedparser
def parse_rss_feed(url):
    """Fetch and parse the feed XML, retrying up to 3 times on error."""
    news_feed = None
    for i in range(3):
        try:
            news_feed = feedparser.parse(url)
            break
        except Exception:
            print("ERROR calling URL:", url, "iter:", i + 1)
    if news_feed is None:
        return pd.DataFrame()
    # Flatten the feed entries into one row per item
    return pd.json_normalize(news_feed.entries)
def get_all_posts(max_id, rss_url):
    """
    Get all items in an RSS feed. Set max_id higher to fetch more pages
    from the feed. The output is a pandas DataFrame; links to the posts
    are under df["link"]. Print df.columns to see all available columns.
    """
    final_df = pd.DataFrame()
    for x in range(1, max_id + 1):
        # WordPress-style paged feed URL, e.g. <rss_url>/?paged=2/
        page_url = "{0}/?paged={1}/".format(rss_url, x)
        if requests.get(page_url).status_code == 200:
            print("Got page number: {}".format(x))
            df = parse_rss_feed(page_url)
            final_df = pd.concat([df, final_df], ignore_index=True)
        else:
            print("Got a non-200 response, stopping at page: {}".format(x))
            break
    return final_df
if __name__ == "__main__":
    num_of_pages = 10
    posts = get_all_posts(num_of_pages, "https://www.doraithodla.com/feed")
    print(posts["link"])