This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports for model evaluation (metrics, ROC) and pipeline construction.
# NOTE: trailing "| |" scrape artifacts removed — they made these lines invalid Python.
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    auc,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.compose import ColumnTransformer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.impute import SimpleImputer | |
from sklearn.pipeline import Pipeline | |
from sklearn.svm import SVC | |
# Define the names of the text and numerical features.
# `text_features` is the single raw-text column; `numerical_features` lists
# the engineered numeric columns (word count and mean word length).
text_features = "text"
numerical_features = ["n_words", "mean_word_length"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import GridSearchCV | |
# params is a dictionary; the keys are the hyperparameters and the values are a list of values
# to search over.
params = [ | |
{ | |
"transform__txt__max_features": [None, 100, 10], | |
"transform__num__selector__attribute_names": [ | |
["n_words"], | |
["mean_word_length"], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.base import BaseEstimator, TransformerMixin | |
class CountWords(BaseEstimator, TransformerMixin): | |
# Creates a dataframe from a series of text documents by creating a new column named n_words, | |
# that contains the number of words in each document | |
def __init__(self, new_col_name): | |
self.new_col_name = new_col_name | |
def fit(self, series, y=None): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from instascrape import Post, Profile | |
def get_post_data(post_object, | |
attributes=['caption', | |
'upload_date', | |
'location', | |
'likes', | |
'comments', | |
'id']): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from instascrape import Hashtag

# Substitute 'ad' with the word you want to search for (as a string).
hashtag = Hashtag('ad')
# Scrape the hashtag page (the original comment said "profile", but this
# object is a Hashtag, not a Profile).
hashtag.scrape()
# Get a list of the recent posts under this hashtag.
recents = hashtag.get_recent_posts()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the profile and get its recent posts.
natgeo = Profile('natgeo')
natgeo.scrape()
recents = natgeo.get_recent_posts()
# Filter the list to separate images from videos: keep only non-video posts.
recent_photos = [post for post in recents if not post.is_video]
#Save photos in a loop | |
for i, post in enumerate(recent_photos): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List, Tuple | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
from matplotlib.axes import Axes | |
from textacy import extract, make_spacy_doc | |
def decompose_keyterms(keyterm_list: List[str]) -> Tuple: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import count | |
import matplotlib.pyplot as plt | |
import networkx as nx | |
import numpy as np | |
import pandas as pd | |
import textacy | |
# Load the whole article and collapse it to a single line by removing newlines.
# NOTE(review): no encoding is specified, so the platform default is used —
# consider encoding="utf-8" if the article may contain non-ASCII text.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from textacy import extract, make_spacy_doc | |
# Load the entire article text, collapsing it to one line by removing newlines.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")
# Normalize non-breaking spaces (U+00A0) to regular spaces.
article = data.replace(u"\xa0", u" ")
# Create doc object |
OlderNewer