Skip to content

Instantly share code, notes, and snippets.

View JarrydWannenburg's full-sized avatar
🤔
Learning

Jarryd Wannenburg JarrydWannenburg

🤔
Learning
View GitHub Profile
@JarrydWannenburg
JarrydWannenburg / TextBlob.py
Created September 3, 2022 16:42
Google_News_Extraction_Article
# Helpers that score the polarity and subjectivity of text using TextBlob
def getPolarity(text):
    """Return the TextBlob polarity score of *text*, rounded to 4 decimals."""
    sentiment = TextBlob(text).sentiment
    return round(sentiment.polarity, 4)
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of *text*, rounded to 4 decimals."""
    sentiment = TextBlob(text).sentiment
    return round(sentiment.subjectivity, 4)
@JarrydWannenburg
JarrydWannenburg / applying_functions.py
Last active September 3, 2022 16:47
Google_News_Extraction_Article
# Derive the entity-count and sentiment columns, one row-wise apply per new column
wells_fargo_df['Top_ORGs'] = wells_fargo_df['Content'].apply(get_org_counts)
wells_fargo_df['Top_PERSONs'] = wells_fargo_df['Content'].apply(get_person_counts)
# NOTE(review): polarity is scored on Description while subjectivity is scored on Content — confirm this is intended
wells_fargo_df['Polarity'] = wells_fargo_df['Description'].apply(getPolarity)
wells_fargo_df['Subjectivity'] = wells_fargo_df['Content'].apply(getSubjectivity)
@JarrydWannenburg
JarrydWannenburg / get_person_counts.py
Created September 3, 2022 15:59
Google_News_Extraction_Article
# Create a function to get a count of the top n people mentioned in the article with counts
def get_person_counts(text):
# Remove linebreaks from the text
text = text.replace("\n"," paragraph break ")
doc = nlp(text)
# Loop through the doc object and extract PERSON (people) entities
res = []
for ent in doc.ents:
if ent.label_ == 'PERSON':
@JarrydWannenburg
JarrydWannenburg / get_org_counts.py
Created September 3, 2022 15:58
Google_News_Extraction_Article
# Create a function to get a count of the top n organizations mentioned in the article with counts
def get_org_counts(text):
# Remove linebreaks from the text
text = text.replace("\n"," paragraph break ")
doc = nlp(text)
# Loop through the doc object and extract ORG (organization) entities
res = []
for ent in doc.ents:
if ent.label_ == 'ORG':
@JarrydWannenburg
JarrydWannenburg / get_details_function.py
Last active September 3, 2022 17:07
Google_News_Extraction_Article
# The newsAPI pull object carries the best-formatted published date, so collect those dates here
# Only the first 10 characters of each timestamp are kept, i.e. the YYYY-MM-DD part
wells_fargo_publishedAt = [item['publishedAt'][:10] for item in wells_fargo]
# Create our function that takes a list of Article objects and returns one dataframe
def get_details(article_list):
# Initialize empty lists for dictionaries later on
titles = []
urls = []
authors = []
keywords =[]
@JarrydWannenburg
JarrydWannenburg / newspaper3k_extraction.py
Created September 3, 2022 15:15
Google_News_Extraction_Article
# Keep only the 'articles' entry of the newsAPI response in wells_fargo
wells_fargo = wells_fargo['articles']  # 100 is the max length of articles to return
# Collect the URL of every article returned by newsAPI
wells_fargo_urls = [entry['url'] for entry in wells_fargo]
# Using newspaper3k, create a function to return an article given its URL
# See https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html for more detail
def get_article(url):
article = Article(url, fetch_images=False, memoize_articles = False)
@JarrydWannenburg
JarrydWannenburg / newsAPI_pull.py
Last active September 3, 2022 15:15
Google_News_Extraction_Article
# Init: create the NewsAPI client that the article query below is issued through
# (`key` must already be defined — presumably the NewsAPI API key; verify)
newsapi = NewsApiClient(api_key=key)
# Pull articles with "Wells Fargo" in title. Exclude forbes because of subscription req.
# See https://newsapi.org/docs/endpoints/everything for more details
wells_fargo = newsapi.get_everything(qintitle='Wells Fargo',
exclude_domains='forbes.com',
language='en',
from_param='2022-08-29', # Date Range (YYYY-MM-DD)
to='2022-09-03', # Date Range (YYYY-MM-DD)
@JarrydWannenburg
JarrydWannenburg / setup.py
Last active September 3, 2022 17:01
Google_News_Extraction_Article
# installations
# NOTE: the lines below are shell commands, not Python statements
## Spacy
# Upgrade the packaging tools first, then install/upgrade spaCy itself
pip install -U pip setuptools wheel
pip install -U spacy
# Download the en_core_web_sm spaCy model
python -m spacy download en_core_web_sm
## NewsAPI
pip install newsapi-python
## Newspaper3k
@JarrydWannenburg
JarrydWannenburg / modeling.py
Created August 26, 2022 19:35
Scalable_Pipeline_Article
# Split the frame into features (every column except 'Transported') and the target column
X,y = df.drop(columns=['Transported']), df.loc[:, ['Transported']] #Target Variable
# Replace the boolean target values True/False with the strings 'True'/'False'
# NOTE(review): y is selected out of df and then modified in place — confirm this does not raise pandas' SettingWithCopy warning
y.replace({True:'True', False:'False'}, inplace=True)
# Hold out 33% of the rows for testing; no random_state is passed, so presumably the split differs between runs
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)
# y_train is a one-column frame; .values.ravel() flattens it to 1-D, which silences a user warning. Doesn't change the fit.
full_pipe.fit(X_train, y_train.values.ravel());
# Predict the target for the held-out rows
y_pred = full_pipe.predict(X_test)
@JarrydWannenburg
JarrydWannenburg / Processor_and_pipe.py
Created August 26, 2022 19:31
Scalable_Pipeline_Article
# Using ColumnTransformer and Selector to choose which columns to use:
# cat_pipe receives the columns whose dtype is not numeric, num_pipe receives the numeric ones
# NOTE(review): Selector is presumably sklearn's make_column_selector imported under an alias — confirm
preprocessor = ColumnTransformer([
('cat', cat_pipe, Selector(dtype_exclude=np.number)),
('num', num_pipe, Selector(dtype_include=np.number))
])
# Combining the processor with a model for a happy ending :)
full_pipe = Pipeline(steps=[
('features', preprocessor),
('model', SVC())