Skip to content

Instantly share code, notes, and snippets.

@conormm
Last active June 12, 2018 23:20
Show Gist options
  • Save conormm/0fec9f28ac3d8ab693302191739abd71 to your computer and use it in GitHub Desktop.
Save conormm/0fec9f28ac3d8ab693302191739abd71 to your computer and use it in GitHub Desktop.
import pandas as pd
import textacy
import re
import newspaper
import textacy as tcy
import matplotlib.pyplot as plt
from collections import defaultdict
# Home-page URLs of the two news sites. Presumably meant to be passed to
# get_articles()/article_extractor(); nothing in the visible code references
# them directly -- TODO confirm against the calling script.
the_guardian = "https://www.theguardian.com"
breitbart = "http://www.breitbart.com"
def article_extractor(newspaper_url, title_topic=None):
    """Download the articles of a news site and return them as a DataFrame.

    Args:
        newspaper_url: Home-page URL handed to ``newspaper.build``.
        title_topic: Optional keyword. When given, only articles whose URL
            contains the keyword are downloaded; the comparison ignores
            case. ``None`` keeps every article the source lists.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title`` and ``text``, one
        row per article that was downloaded and parsed successfully. Both
        columns are present even when no article qualified.
    """
    source = newspaper.build(newspaper_url)
    urls = [article.url for article in source.articles]

    if title_topic is not None:
        # The filter runs on the URL (before anything is downloaded). URL
        # slugs are usually lower-case, so a case-sensitive test for a
        # capitalised keyword such as "Trump" would match nothing.
        needle = title_topic.lower()
        urls = [url for url in urls if needle in url.lower()]

    titles = []
    texts = []
    for url in urls:
        art = newspaper.build_article(url)
        try:
            art.download()
            art.parse()
        except newspaper.ArticleException:
            # A single unreachable page must not abort the whole scrape.
            continue
        titles.append(art.title)
        texts.append(art.text)

    # Explicit keys guarantee the two columns exist for an empty result.
    return pd.DataFrame({"title": titles, "text": texts})
def get_articles(*newspaper_url, **kwargs):
    """Collect articles from several news sites into one DataFrame.

    Args:
        *newspaper_url: Home-page URLs; each one is passed to
            ``article_extractor``.
        **kwargs: Only ``title_topic`` is read. It is forwarded to
            ``article_extractor`` and defaults to ``"Trump"``. Any other
            keyword is ignored.

    Returns:
        The concatenation of the per-site frames with an extra ``paper``
        column holding the URL each row came from. Row labels are kept from
        the per-site frames, so they can repeat across sites. When no URL is
        given, an empty frame with the columns ``title``, ``text`` and
        ``paper`` is returned.
    """
    title_topic = kwargs.get("title_topic", "Trump")

    results = []
    for url in newspaper_url:
        articles = article_extractor(url, title_topic=title_topic)
        articles["paper"] = url
        results.append(articles)

    if not results:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=["title", "text", "paper"])
    return pd.concat(results)
def clean_text(string):
    """Strip scraping boilerplate from an article's text.

    Args:
        string: Raw article text.

    Returns:
        The text with the fixed phrases ``SIGN UP FOR OUR NEWSLETTER``,
        ``Read more here`` and ``REUTERS`` removed, every ``?`` replaced by
        an apostrophe, and each run of newlines collapsed to a single space.
    """
    for boilerplate in (
        "SIGN UP FOR OUR NEWSLETTER",
        "Read more here",
        "REUTERS",
    ):
        string = string.replace(boilerplate, "")

    # NOTE(review): this turns *every* question mark into an apostrophe,
    # presumably to repair mis-decoded curly quotes -- confirm that losing
    # genuine question marks is acceptable.
    string = string.replace("?", "'")

    # Join lines with a space; deleting the newline outright would glue the
    # last word of one line to the first word of the next.
    string = re.sub(r"\n+", " ", string)
    return string
def preprocess_articles(articles):
    """Run textacy's text preprocessing over a collection of article texts.

    Args:
        articles: Iterable of article text strings.

    Returns:
        A list with one preprocessed string per input, in input order. The
        flags ask textacy to fix unicode, lower-case the text and handle
        currency symbols, numbers and URLs; the exact substitutions are
        defined by ``textacy.preprocess.preprocess_text``.
    """
    return [
        tcy.preprocess.preprocess_text(
            art,
            fix_unicode=True,
            lowercase=True,
            no_currency_symbols=True,
            no_numbers=True,
            no_urls=True,
        )
        for art in articles
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment