@cyberandy
Last active January 5, 2024 19:48
A super-simple Python script to read Google News RSS feeds and store the data in a CSV file.
# Expected use >> python read-GNews.py -q [query] -l [language] -p [country]
# The following command will search for the latest news written in German from Austria about "Redbull"
# python read-GNews.py -q Redbull -l de -p AT
#
# Queries can be provided as strings using quotation marks >> python read-GNews.py -q "Redbull Media House" -l de
# Multiple queries can be executed at once >> python read-GNews.py -q "Redbull Media House" -q Redbull -l de -p at -p de
# The script will save a CSV file containing Title, Link, pubDate, Description, Source and Alexa Traffic Rank.
import feedparser
import time
import sys
import pandas as pd
import re
import urllib.parse
import urllib.request as ur
import argparse
import bs4
# Feed URL
base_url = 'https://news.google.com/rss/search?q='
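# Illustrative example of a composed feed URL (built further below):
#   https://news.google.com/rss/search?q=%22Redbull%22&hl=de&ceid=AT%3Ade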
# Get the parameters
parser = argparse.ArgumentParser()
parser.add_argument('-q', action='append', dest='queries', nargs='+',
                    default=[],
                    help='Add all queries')
parser.add_argument('-l', action='store', dest='language',
                    default="en",
                    help='Store language')
parser.add_argument('-p', action='append', dest='locations', nargs='+',
                    default=[],
                    help='Add all places')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
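# Note on the resulting namespace: because -q and -p use action='append' with nargs='+',
# e.g. python read-GNews.py -q "Redbull Media House" -q Redbull -l de -p at -p de
# yields queries=[['Redbull Media House'], ['Redbull']], language='de', locations=[['at'], ['de']],
# which is why the loops below join each inner list back into a single string.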
# Get Alexa Rank - the lookup only worked reliably from the USA (use a proxy elsewhere),
# and the data.alexa.com service has since been retired, so this will often return None
def getMetrics(url):
    cleanDomain = '/'.join(url.split('/')[:3])  # currently unused
    try:
        alexa_rank = bs4.BeautifulSoup(
            ur.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url),
            "xml").find("REACH")["RANK"]
    except Exception:
        alexa_rank = None
    return alexa_rank
# HTML cleanup function
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
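# e.g. cleanhtml('<a href="https://example.com">Example title</a>') returns 'Example title'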
# Prepare the list that will store the items (converted to a data frame at the end)
d = []
# Access the feed and store data in d
def readFeed(url, query):
    feed = feedparser.parse(url)
    # Loop items in the feed
    for post in feed.entries:
        title = post.title
        link = post.link
        # Converting published date to yyyy/mm/dd
        pubDate = "%d/%02d/%02d" % (post.published_parsed.tm_year,
                                    post.published_parsed.tm_mon,
                                    post.published_parsed.tm_mday)
        description = cleanhtml(post.summary)
        source = post.source.title
        # Get Alexa Rank
        alexa_rank = getMetrics(link)
        d.append((title, link, pubDate, description, source, query, alexa_rank))
        print(d)
        # Add delay between calls
        time.sleep(2)
    return d
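# Each tuple appended to d is (Title, Link, pubDate, Description, Source, Query, Alexa Rank),
# matching the columns of the data frame built at the end of the script.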
# Parse the command-line arguments
args = parser.parse_args()
# Set the language (default = "en")
language = args.language.lower()
# Make sure there is at least one query
if len(args.queries) == 0:
    print("Please add at least one query using the -q parameter")
    sys.exit(1)
# Loop over the different combinations of queries and places,
# provided at least one place was given
if len(args.locations) > 0:
    # Looping queries and places
    for a in args.queries:
        for b in args.locations:
            query = ' '.join(map(str, a))
            # URL encode the query and add quotes around it
            encoded_query = '"' + urllib.parse.quote_plus(query) + '"'
            # ceid takes the form COUNTRY:language, e.g. AT:de
            place = urllib.parse.quote_plus(''.join(map(str, b)).upper() + ":" + language)
            # Compose the URL
            url = base_url + encoded_query + "&hl=" + language + "&ceid=" + place
            print("Reading now: ", url)
            # Read the Feed
            readFeed(url, query)
else:
    # Just use the query(ies)
    for a in args.queries:
        query = ' '.join(map(str, a))
        # URL encode the query and add quotes around it
        encoded_query = '"' + urllib.parse.quote_plus(query) + '"'
        # Compose the URL (no place given, so only the language is set)
        url = base_url + encoded_query + "&hl=" + language
        print("Reading now: ", url)
        # Read the Feed
        readFeed(url, query)
# Set the file name
# (based on the last query processed)
cleanQuery = re.sub(r'\W+', '', query)
file_name = cleanQuery + ".csv"
df = pd.DataFrame(d, columns=('Title', 'Link', 'pubDate', 'Description', 'Source', 'Query', 'Alexa Rank'))
# Remove all rows with the same link - you might want to comment this out when using different keywords
df.drop_duplicates(subset="Link", keep=False, inplace=True)
# Store data to CSV
df.to_csv(file_name, encoding='utf-8', index=False)
print(len(df), "Articles saved to", file_name)
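As a quick sanity check (the file name below is only an example; it depends on the query you ran), the saved CSV can be loaded back with pandas:

import pandas as pd

# Hypothetical output of a previous run such as: python read-GNews.py -q Redbull -l de -p AT
articles = pd.read_csv("Redbull.csv")
print(articles[['Title', 'pubDate', 'Source']].head())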