Skip to content

Instantly share code, notes, and snippets.

@eliasdabbas
Created March 14, 2022 18:19
Show Gist options
  • Save eliasdabbas/4bd1a96240f2be4f82755aea50b01fb5 to your computer and use it in GitHub Desktop.
Save eliasdabbas/4bd1a96240f2be4f82755aea50b01fb5 to your computer and use it in GitHub Desktop.
import datetime
import advertools as adv
import pandas as pd
# Tokens excluded from the word/phrase counts: common English function
# words, dash characters that appear as stand-alone tokens in headlines,
# and the year string '2022'.
stopwords = [
    'to', 'of', 'the', 'in', 'for', 'and', 'on', 'a', 'as', 'with',
    'from', 'over', 'is', 'at',
    '—', '-',
    'be', '2022',
    '–',
    'it', 'by',
    'we', 'why', 'but', 'my', 'how', 'not', 'an', 'are', 'no', 'go',
    'your', 'up', 'his',
]
def news_sitemap_wordcount(news_sitemap, name, phrase_len=1, showtop=30,
                           filter_func=lambda df: df):
    """Build a styled table of the most frequent words in news titles.

    The sitemap is loaded with ``adv.sitemap_to_df``, passed through
    ``filter_func``, and its ``news_title`` column is counted with
    ``adv.word_frequency`` (ignoring the module-level ``stopwords``).

    Parameters
    ----------
    news_sitemap : str
        Location of the news sitemap, handed to ``adv.sitemap_to_df``.
    name : str
        Publication name shown in the table caption.
    phrase_len : int, default 1
        Number of words per counted phrase (1 = single words, 2 = bigrams).
    showtop : int, default 30
        Maximum number of rows to keep from the top of the frequency table.
    filter_func : callable, default identity
        Receives the sitemap DataFrame and returns the (possibly reduced)
        DataFrame whose titles should be counted.

    Returns
    -------
    pandas.io.formats.style.Styler
        The top rows, indexed from 1, captioned with ``name`` and the
        current UTC date, with a bar drawn on the ``abs_freq`` column.
    """
    sitemap_df = filter_func(adv.sitemap_to_df(news_sitemap))
    word_freq = adv.word_frequency(
        sitemap_df['news_title'],
        rm_words=stopwords,
        phrase_len=phrase_len,
    )

    top = word_freq[:showtop]
    # A small or heavily filtered sitemap can yield fewer than `showtop`
    # rows; size the 1-based rank index from the rows actually present so
    # set_index does not fail with a length mismatch.
    top = top.set_index(pd.Index(range(1, len(top) + 1)))

    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted date is the same.
    now = datetime.datetime.now(datetime.timezone.utc).strftime('%d %b, %Y')

    return (top.style
            .set_caption(f'<h2>{name} news topics</h2><h5>{now}</h5>')
            .bar(subset=['abs_freq'], color='lightgray'))
# One entry per news outlet: (sitemap URL, display name, row filter).
# The filter receives the sitemap DataFrame and returns the rows to analyse;
# `lambda x: x` keeps every row.
news_sitemap_urls = [
    ('https://www.ft.com/sitemaps/news.xml', 'FT', lambda x: x),
    ('https://www.nytimes.com/sitemaps/new/news.xml.gz', 'NYTimes',
     lambda df: df[df['loc'].str.contains('/2022/')]),
    ('https://www.bbc.com/sitemaps/https-index-com-news.xml', 'BBC',
     lambda df: df[df['publication_name'].eq('BBC News')]),
    ('https://www.economist.com/googlenews.xml', 'Economist', lambda x: x),
    ('https://www.bloomberg.com/feeds/bbiz/sitemap_news.xml', 'Bloomberg',
     lambda x: x),
    ('https://news.sky.com/sitemap/sitemap-news.xml', 'SKY', lambda x: x),
    ('https://www.washingtonpost.com/arcio/news-sitemap/', 'Wash.Post',
     lambda x: x),
    ('https://www.foxnews.com/sitemap.xml?type=news', 'FOX', lambda x: x)
]

sitemaps_df = pd.DataFrame(
    news_sitemap_urls,
    columns=['url', 'name', 'filter_func'],
)

# Collect one styled top-20 table per (outlet, phrase length) pair:
# single words first, then two-word phrases, outlet by outlet.
final_dfs = []
for sitemap, name, filterfunc in sitemaps_df.itertuples(index=False, name=None):
    for ngram in (1, 2):
        df = news_sitemap_wordcount(
            sitemap,
            name,
            phrase_len=ngram,
            showtop=20,
            filter_func=filterfunc,
        )
        final_dfs.append(df)
@eliasdabbas
Copy link
Author

The script was run twice for each sitemap, once with 1-grams and once with 2-grams, to give an additional perspective on the topics:

Screen Shot 2022-03-14 at 7 17 51 PM

Screen Shot 2022-03-14 at 7 18 06 PM

Screen Shot 2022-03-14 at 7 18 28 PM

Screen Shot 2022-03-14 at 7 18 41 PM

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment