Jarryd Wannenburg JarrydWannenburg

## TextBlob.py
# Create functions to get polarity and subjectivity using TextBlob
def getPolarity(text):
    """Return the polarity of ``text`` as scored by TextBlob, rounded to 4 decimal places."""
    sentiment = TextBlob(text).sentiment
    # Polarity is the first field of TextBlob's (polarity, subjectivity) sentiment tuple
    return round(sentiment.polarity, 4)
def getSubjectivity(text):
    """Return the subjectivity of ``text`` as scored by TextBlob, rounded to 4 decimal places."""
    sentiment = TextBlob(text).sentiment
    # Subjectivity is the second field of TextBlob's (polarity, subjectivity) sentiment tuple
    return round(sentiment.subjectivity, 4)

## applying_functions.py
# Entity counts are computed from the Content column
wells_fargo_df['Top_ORGs'] = wells_fargo_df['Content'].apply(get_org_counts)
wells_fargo_df['Top_PERSONs'] = wells_fargo_df['Content'].apply(get_person_counts)

# Sentiment scores come from the TextBlob-based helpers
# NOTE(review): polarity is computed from Description while subjectivity is
# computed from Content -- confirm this difference is intentional
wells_fargo_df['Polarity'] = wells_fargo_df['Description'].apply(getPolarity)
wells_fargo_df['Subjectivity'] = wells_fargo_df['Content'].apply(getSubjectivity)

## get_person_counts.py
# Create a function that returns the top n people mentioned in the article, along with how often each is mentioned
def get_person_counts(text):
    # Replace line breaks in the text with a " paragraph break " marker
    text = text.replace("\n"," paragraph break ")
    doc = nlp(text)

    # Loop through the doc object and extract PERSON (people) entities
    res = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON':

## get_org_counts.py
# Create a function that returns the top n organizations mentioned in the article, along with how often each is mentioned
def get_org_counts(text):
    # Replace line breaks in the text with a " paragraph break " marker
    text = text.replace("\n"," paragraph break ")
    doc = nlp(text)

    # Loop through the doc object and extract ORG (organization) entities
    res = []
    for ent in doc.ents:
        if ent.label_ == 'ORG':

## get_details_function.py
# The newsAPI pull object carries the best-formatted published date, so collect those dates in a list.
# Slicing with [:10] keeps just the YYYY-MM-DD part of each value.
wells_fargo_publishedAt = [article['publishedAt'][:10] for article in wells_fargo]

# Create our function that takes a list of Article objects and returns one dataframe
def get_details(article_list):
    # Initialize empty lists for dictionaries later on
    titles = []
    urls = []
    authors = []
    keywords =[]

## newspaper3k_extraction.py
# Keep only the 'articles' entry of the newsAPI response in wells_fargo
wells_fargo = wells_fargo['articles']  # 100 is the max length of articles to return

# Collect the URL of every article returned by newsAPI
wells_fargo_urls = [article['url'] for article in wells_fargo]

# Using newspaper3k, create a function to return an article given its URL
# See https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html for more detail
def get_article(url):
    article = Article(url, fetch_images=False, memoize_articles = False)

## newsAPI_pull.py
# Initialize the NewsAPI client
newsapi = NewsApiClient(api_key=key)

# Pull articles with "Wells Fargo" in the title. Exclude forbes.com because it requires a subscription.
# See https://newsapi.org/docs/endpoints/everything for more details
wells_fargo = newsapi.get_everything(qintitle='Wells Fargo',
                                     exclude_domains='forbes.com',
                                     language='en',
                                     from_param='2022-08-29', # Date Range (YYYY-MM-DD)
                                     to='2022-09-03', # Date Range (YYYY-MM-DD)

## setup.py
# installations
## Spacy
pip install -U pip setuptools wheel
pip install -U spacy
python -m spacy download en_core_web_sm

## NewsAPI
pip install newsapi-python

## Newspaper3k

## modeling.py
# Separate the predictors from the target column
X = df.drop(columns=['Transported'])
y = df.loc[:, ['Transported']]  # Target variable, kept as a single-column DataFrame

# Convert the boolean target values into their string labels ('True' / 'False')
y.replace({True: 'True', False: 'False'}, inplace=True)

# Split into train and test sets, with test_size=0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Flatten the single-column target with ravel before fitting; per the original
# note this only silences a user warning and does not change the result
full_pipe.fit(X_train, y_train.values.ravel())
y_pred = full_pipe.predict(X_test)

## Processor_and_pipe.py
# Use ColumnTransformer with Selector to route columns by dtype:
# the columns chosen by Selector(dtype_exclude=np.number) go to cat_pipe,
# the columns chosen by Selector(dtype_include=np.number) go to num_pipe
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, Selector(dtype_exclude=np.number)),
        ('num', num_pipe, Selector(dtype_include=np.number)),
    ]
)

# Combining the processor with a model for a happy ending :)
full_pipe = Pipeline(steps=[
    ('features', preprocessor),
    ('model', SVC())
	# Create functions to get polarity and subjectivity using TextBlob
	def getPolarity(text):
	return round(TextBlob(text).sentiment[0],4)
	def getSubjectivity(text):
	return round(TextBlob(text).sentiment[1],4)
	wells_fargo_df['Top_ORGs'] = wells_fargo_df.Content.apply(get_org_counts)
	wells_fargo_df['Top_PERSONs'] = wells_fargo_df.Content.apply(get_person_counts)
	wells_fargo_df['Polarity'] = wells_fargo_df.Description.apply(getPolarity)
	wells_fargo_df['Subjectivity'] = wells_fargo_df.Content.apply(getSubjectivity)
	# Create a function to get a count of the top n people mentioned in the article with counts
	def get_person_counts(text):
	# Remove linebreaks from the text
	text = text.replace("\n"," paragraph break ")
	doc = nlp(text)

	# Loop through the doc object and extract PERSON (people) entities
	res = []
	for ent in doc.ents:
	if ent.label_ == 'PERSON':
	# Create a function to get a count of the top n organizations mentioned in the article with counts
	def get_org_counts(text):
	# Remove linebreaks from the text
	text = text.replace("\n"," paragraph break ")
	doc = nlp(text)

	# Loop through the doc object and extract ORG (organization) entities
	res = []
	for ent in doc.ents:
	if ent.label_ == 'ORG':
	# Best format for published date actually comes from the newsAPI pull object, so let's make a list of those dates
	wells_fargo_publishedAt = [i['publishedAt'][0:10] for i in wells_fargo] # [0:10] returns just YYYY-MM-DD

	# Create our function that takes a list of Article objects and returns one dataframe
	def get_details(article_list):
	# Initialize empty lists for dictionaries later on
	titles = []
	urls = []
	authors = []
	keywords =[]
	# Assign just the information on the articles to our wells_fargo obj
	wells_fargo = wells_fargo['articles'] # 100 is the max length of articles to return

	# Extract the urls for each article returned by newsAPI
	wells_fargo_urls = [i['url'] for i in wells_fargo]

	# Using newspaper3k, create a function to return an article given its URL
	# See https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html for more detail
	def get_article(url):
	article = Article(url, fetch_images=False, memoize_articles = False)
	# Init
	newsapi = NewsApiClient(api_key=key)

	# Pull articles with "Wells Fargo" in title. Exclude forbes because of subscription req.
	# See https://newsapi.org/docs/endpoints/everything for more details
	wells_fargo = newsapi.get_everything(qintitle='Wells Fargo',
	exclude_domains='forbes.com',
	language='en',
	from_param='2022-08-29', # Date Range (YYYY-MM-DD)
	to='2022-09-03', # Date Range (YYYY-MM-DD)
	# installations
	## Spacy
	pip install -U pip setuptools wheel
	pip install -U spacy
	python -m spacy download en_core_web_sm

	## NewsAPI
	pip install newsapi-python

	## Newspaper3k
	X,y = df.drop(columns=['Transported']), df.loc[:, ['Transported']] #Target Variable

	# Replacing the predicted true/false from a string to a boolean
	y.replace({True:'True', False:'False'}, inplace=True)

	X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)

	# Using ravel to silence a user warning. No big deal though. Doesn't change anything.
	full_pipe.fit(X_train, y_train.values.ravel());
	y_pred = full_pipe.predict(X_test)
	# Using ColumnTransformer and Selector to choose which columns to use
	preprocessor = ColumnTransformer([
	('cat', cat_pipe, Selector(dtype_exclude=np.number)),
	('num', num_pipe, Selector(dtype_include=np.number))
	])

	# Combining the processor with a model for a happy ending :)
	full_pipe = Pipeline(steps=[
	('features', preprocessor),
	('model', SVC())