Last active
June 12, 2018 23:20
-
-
Save conormm/0fec9f28ac3d8ab693302191739abd71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import textacy | |
import re | |
import newspaper | |
import textacy as tcy | |
import matplotlib.pyplot as plt | |
from collections import defaultdict | |
# Base URLs of the two news sites this script works with.
# NOTE(review): presumably these are the values passed to get_articles();
# no call site is visible in this file — confirm.
the_guardian = "https://www.theguardian.com"
breitbart = "http://www.breitbart.com"
def article_extractor(newspaper_url, title_topic=None):
    """Download a news site's articles and return them as a DataFrame.

    A ``newspaper`` source is built for *newspaper_url*, the URLs of the
    articles it lists are optionally filtered by *title_topic*, and every
    remaining article is downloaded and parsed.

    Args:
        newspaper_url: URL of the news site; passed to ``newspaper.build``.
        title_topic: Optional substring filter. NOTE: despite its name it is
            matched, case-sensitively, against each article's URL rather
            than its parsed title. ``None`` keeps every article.

    Returns:
        A ``pandas.DataFrame`` with one row per article and the columns
        ``"title"`` and ``"text"``. Both columns exist even when no article
        matched, so callers can always select them.

    Exceptions raised by ``download()`` or ``parse()`` are not caught here;
    a single failing article aborts the whole extraction.
    """
    source = newspaper.build(newspaper_url)
    urls = [article.url for article in source.articles]
    if title_topic is not None:
        urls = [url for url in urls if title_topic in url]

    titles = []
    texts = []
    for url in urls:
        article = newspaper.build_article(url)
        article.download()
        article.parse()
        titles.append(article.title)
        texts.append(article.text)

    # Build from explicit columns so an empty result still has the schema.
    return pd.DataFrame({"title": titles, "text": texts})
def get_articles(*newspaper_url, **kwargs):
    """Collect articles from several news sites into a single DataFrame.

    Args:
        *newspaper_url: URLs of the news sites to scrape; each one is passed
            to ``article_extractor``.
        **kwargs: Only ``title_topic`` is used. It is forwarded to
            ``article_extractor`` and defaults to ``"Trump"``, the value
            that used to be hard-coded. Any other keyword argument is
            accepted and ignored, as before.

    Returns:
        The per-site frames concatenated with ``pd.concat``, each carrying
        an extra ``"paper"`` column holding the site URL it came from. Row
        labels are not renumbered, so they may repeat across sites. When no
        URL is given, an empty frame with the columns ``"title"``,
        ``"text"`` and ``"paper"`` is returned.
    """
    title_topic = kwargs.get("title_topic", "Trump")

    results = []
    for url in newspaper_url:
        articles = article_extractor(url, title_topic=title_topic)
        articles["paper"] = url
        results.append(articles)

    # pd.concat raises ValueError on an empty list; return an empty frame.
    if not results:
        return pd.DataFrame(columns=["title", "text", "paper"])
    return pd.concat(results)
def clean_text(string):
    """Strip known boilerplate from article text and flatten it to one line.

    The substitutions run in the order listed: three fixed boilerplate
    phrases are removed, every ``?`` becomes an apostrophe, and newline
    characters are deleted (no space is inserted in their place).

    Args:
        string: The article text to clean.

    Returns:
        The cleaned text.
    """
    substitutions = (
        (r"SIGN UP FOR OUR NEWSLETTER", ""),
        (r"Read more here", ""),
        (r"REUTERS", ""),
        # NOTE(review): presumably a workaround for apostrophes that were
        # mis-encoded as "?"; it also rewrites genuine question marks.
        (r"\?", "'"),
        (r"\n", ""),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def preprocess_articles(articles):
    """Normalise each article text with textacy's ``preprocess_text``.

    Every text is passed through ``tcy.preprocess.preprocess_text`` with the
    ``fix_unicode``, ``lowercase``, ``no_currency_symbols``, ``no_numbers``
    and ``no_urls`` options switched on.

    Args:
        articles: Iterable of article texts.

    Returns:
        A list with the preprocessed texts, in input order.
    """
    options = {
        "fix_unicode": True,
        "lowercase": True,
        "no_currency_symbols": True,
        "no_numbers": True,
        "no_urls": True,
    }
    return [
        tcy.preprocess.preprocess_text(raw_text, **options)
        for raw_text in articles
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment