# conormm/articles_extraction_preprocess_funcs.py

## articles_extraction_preprocess_funcs.py
import pandas as pd
import textacy
import re
import newspaper
import textacy as tcy
import matplotlib.pyplot as plt
from collections import defaultdict

# Home-page URLs of two news sites. Nothing in the visible code references
# these names; presumably callers pass them to get_articles() -- TODO confirm.
the_guardian = "https://www.theguardian.com"
breitbart = "http://www.breitbart.com"

def article_extractor(newspaper_url, title_topic=None):
    """Download articles from a news site and return their titles and text.

    Builds a newspaper source for ``newspaper_url``, collects the URL of
    every article it lists, optionally keeps only the URLs containing
    ``title_topic``, then downloads and parses each remaining article.

    Args:
        newspaper_url: Site URL handed to ``newspaper.build``.
        title_topic: Optional substring filter. When given, only articles
            whose *URL* contains it (case-sensitive ``in`` test) are
            fetched. Despite the name, the article title is not inspected.

    Returns:
        A DataFrame with one row per fetched article and the columns
        ``"title"`` and ``"text"``. Both columns are present even when no
        article matched, so callers can index them unconditionally.
    """
    source = newspaper.build(newspaper_url)
    urls = [article.url for article in source.articles]
    if title_topic is not None:
        urls = [url for url in urls if title_topic in url]

    # Explicit lists (rather than a defaultdict) keep the column set stable
    # when ``urls`` is empty.
    titles = []
    texts = []
    for url in urls:
        art = newspaper.build_article(url)
        art.download()
        art.parse()
        titles.append(art.title)
        texts.append(art.text)
    return pd.DataFrame({"title": titles, "text": texts})

def get_articles(*newspaper_url, title_topic="Trump", **kwargs):
    """Collect articles from several news sites into a single DataFrame.

    Args:
        *newspaper_url: Site URLs; each one is passed to
            ``article_extractor``.
        title_topic: Substring filter forwarded to ``article_extractor``.
            Defaults to ``"Trump"``, the value that was previously
            hard-coded, so existing calls behave as before.
        **kwargs: Accepted for backward compatibility and ignored.

    Returns:
        The per-site frames joined with ``pd.concat``; every row carries
        the site URL it came from in a ``"paper"`` column. When no frames
        were produced (no URLs given) an empty frame with the columns
        ``"title"``, ``"text"`` and ``"paper"`` is returned instead of
        letting ``pd.concat`` raise on an empty list.
    """
    frames = []
    for url in newspaper_url:
        articles = article_extractor(url, title_topic=title_topic)
        articles["paper"] = url
        frames.append(articles)

    if not frames:
        return pd.DataFrame(columns=["title", "text", "paper"])
    return pd.concat(frames)

def clean_text(string):
    """Strip boilerplate phrases and newlines from a piece of article text.

    The substitutions below are applied one after another, in order, with
    ``re.sub``; the result of the last one is returned.
    """
    substitutions = (
        (r"SIGN UP FOR OUR NEWSLETTER", ""),
        (r"Read more here", ""),
        (r"REUTERS", ""),
        # Every literal "?" is turned into an apostrophe.
        (r"\?", "'"),
        # Newlines are removed outright (no space is inserted).
        (r"\n", ""),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def preprocess_articles(articles):
    """Run every article through textacy's ``preprocess_text``.

    Each item of ``articles`` is processed with unicode fixing, lowercasing,
    and currency-symbol, number and URL replacement all switched on.

    Returns a list of the processed texts, in input order.
    """
    options = dict(
        fix_unicode=True,
        lowercase=True,
        no_currency_symbols=True,
        no_numbers=True,
        no_urls=True,
    )
    return [
        tcy.preprocess.preprocess_text(text, **options)
        for text in articles
    ]
	import pandas as pd
	import textacy
	import re
	import newspaper
	import textacy as tcy
	import matplotlib.pyplot as plt
	from collections import defaultdict

	# Home-page URLs of two news sites (second, tab-indented copy of these
	# constants). Not referenced by the visible code -- TODO confirm usage.
	the_guardian = "https://www.theguardian.com"
	breitbart = "http://www.breitbart.com"

	def article_extractor(newspaper_url, title_topic=None):
	    """Download articles from a news site; return titles and text.

	    Builds a newspaper source for ``newspaper_url``, gathers the URL of
	    each listed article, optionally keeps only URLs containing
	    ``title_topic``, then downloads and parses what remains.

	    Args:
	        newspaper_url: Site URL handed to ``newspaper.build``.
	        title_topic: Optional case-sensitive substring matched against
	            each article URL (not its title). ``None`` keeps them all.

	    Returns:
	        A DataFrame with columns ``"title"`` and ``"text"``, one row
	        per fetched article; the columns exist even with zero rows.
	    """
	    source = newspaper.build(newspaper_url)
	    urls = [article.url for article in source.articles]
	    if title_topic is not None:
	        urls = [url for url in urls if title_topic in url]

	    # Plain lists keep the column set stable when nothing matched.
	    titles = []
	    texts = []
	    for url in urls:
	        art = newspaper.build_article(url)
	        art.download()
	        art.parse()
	        titles.append(art.title)
	        texts.append(art.text)
	    return pd.DataFrame({"title": titles, "text": texts})

	def get_articles(newspaper_url, *kwargs, title_topic="Trump"):
	    """Collect articles from several news sites into one DataFrame.

	    Args:
	        newspaper_url: Iterable of site URLs; each one is passed to
	            ``article_extractor``.
	        *kwargs: Extra positional arguments; accepted and ignored.
	        title_topic: Keyword-only substring filter forwarded to
	            ``article_extractor``. Defaults to ``"Trump"``, the value
	            that was previously hard-coded.

	    Returns:
	        The per-site frames joined with ``pd.concat``, each row tagged
	        with its site URL in a ``"paper"`` column. If the iterable
	        yields no URLs, an empty frame with columns ``"title"``,
	        ``"text"`` and ``"paper"`` is returned rather than letting
	        ``pd.concat`` raise on an empty list.
	    """
	    frames = []
	    for url in newspaper_url:
	        articles = article_extractor(url, title_topic=title_topic)
	        articles["paper"] = url
	        frames.append(articles)

	    if not frames:
	        return pd.DataFrame(columns=["title", "text", "paper"])
	    return pd.concat(frames)

	def clean_text(string):
	    """Strip boilerplate phrases and newlines from article text.

	    Applies, in order: removal of "SIGN UP FOR OUR NEWSLETTER",
	    "Read more here" and "REUTERS"; replacement of every "?" with an
	    apostrophe; removal of newline characters (no space is inserted).

	    Returns the cleaned string.
	    """
	    string = re.sub(r"SIGN UP FOR OUR NEWSLETTER", "", string)
	    string = re.sub(r"Read more here", "", string)
	    string = re.sub(r"REUTERS", "", string)
	    string = re.sub(r"\?", "'", string)
	    string = re.sub(r"\n", "", string)
	    return string

	def preprocess_articles(articles):
	    """Run every article through textacy's ``preprocess_text``.

	    Each item of ``articles`` is processed with fix_unicode, lowercase,
	    no_currency_symbols, no_numbers and no_urls all enabled.

	    Returns a list of the processed texts, in input order.
	    """
	    clean_arts = []
	    for art in articles:
	        clean_art = tcy.preprocess.preprocess_text(
	            art,
	            fix_unicode=True,
	            lowercase=True,
	            no_currency_symbols=True,
	            no_numbers=True,
	            no_urls=True,
	        )
	        clean_arts.append(clean_art)
	    return clean_arts