Oliver Batey oliver-batey

## sampling.py
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    auc,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

## pipeline.py
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Define the names of the text and numerical features
text_features = "text"
numerical_features = ["n_words", "mean_word_length"]

## gridsearchcv.py
from sklearn.model_selection import GridSearchCV

# params is a list of dictionaries; in each one the keys are hyperparameter names and the
# values are lists of values to search over.
params = [
    {
        "transform__txt__max_features": [None, 100, 10],
        "transform__num__selector__attribute_names": [
            ["n_words"],
            ["mean_word_length"],

## custom_transformers.py
from sklearn.base import BaseEstimator, TransformerMixin


class CountWords(BaseEstimator, TransformerMixin):
    # Creates a dataframe from a series of text documents by creating a new column named n_words,
    # that contains the number of words in each document
    def __init__(self, new_col_name):
        self.new_col_name = new_col_name

    def fit(self, series, y=None):

## get_post_info.py
import pandas as pd
from instascrape import Post, Profile

def get_post_data(post_object,
             attributes=['caption',
                        'upload_date',
                        'location',
                        'likes',
                        'comments',
                        'id']):

## hashtag_posts.py
from instascrape import Hashtag
# Build a Hashtag object for the tag to look up.
# Substitute 'ad' with the word you want to search for (as a string).
hashtag = Hashtag('ad')

# Scrape the hashtag (this object is a Hashtag, not a profile)
hashtag.scrape()

# Get the list of recent posts for this hashtag
recents = hashtag.get_recent_posts()

## download_images.py
# Scrape the 'natgeo' profile and get its recent posts.
# NOTE(review): Profile is not imported in this snippet; presumably it comes
# from `from instascrape import Post, Profile` — confirm.
natgeo = Profile('natgeo')
natgeo.scrape()
recents = natgeo.get_recent_posts()

# Keep only the posts that are not videos (those whose is_video is falsy)
recent_photos = [post for post in recents if not post.is_video]

#Save photos in a loop
for i, post in enumerate(recent_photos):

## plot_keyterms.py
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from textacy import extract, make_spacy_doc


def decompose_keyterms(keyterm_list: List[str]) -> Tuple:

## subject_dependencies.py
from itertools import count

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import textacy

# Read the whole article into a single string. Newlines are removed outright
# (replaced with "", not a space), so text on adjacent lines is joined directly.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")

## filter_language_structures.py
import numpy as np
import pandas as pd
from textacy import extract, make_spacy_doc

# Load the entire article text as a single string. Newlines are removed outright
# (replaced with "", not a space), so text on adjacent lines is joined directly.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")
# Replace non-breaking spaces (U+00A0) with regular spaces
article = data.replace(u"\xa0", u" ")

# Create doc object
	import pandas as pd
	from sklearn.metrics import (
	accuracy_score,
	auc,
	precision_score,
	recall_score,
	roc_curve,
	)
	from sklearn.model_selection import StratifiedShuffleSplit
	from sklearn.pipeline import Pipeline
	from sklearn.compose import ColumnTransformer
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.impute import SimpleImputer
	from sklearn.pipeline import Pipeline
	from sklearn.svm import SVC

	# Define the names of the text and numerical features
	text_features = "text"
	numerical_features = ["n_words", "mean_word_length"]
	from sklearn.model_selection import GridSearchCV

	# params is a list of dictionaries; in each one the keys are hyperparameter names and the
	# values are lists of values to search over.
	params = [
	{
	"transform__txt__max_features": [None, 100, 10],
	"transform__num__selector__attribute_names": [
	["n_words"],
	["mean_word_length"],
	from sklearn.base import BaseEstimator, TransformerMixin


	class CountWords(BaseEstimator, TransformerMixin):
	# Creates a dataframe from a series of text documents by creating a new column named n_words,
	# that contains the number of words in each document
	def __init__(self, new_col_name):
	self.new_col_name = new_col_name

	def fit(self, series, y=None):
	import pandas as pd
	from instascrape import Post, Profile

	def get_post_data(post_object,
	attributes=['caption',
	'upload_date',
	'location',
	'likes',
	'comments',
	'id']):
	from instascrape import Hashtag
	#Substitute 'ad' with the word you
	#want to search for (as a string)
	hashtag = Hashtag('ad')

	#Scrape the hashtag
	hashtag.scrape()

	#Get list of the recent posts
	recents = hashtag.get_recent_posts()
	#Scrape profile and get recent posts
	natgeo = Profile('natgeo')
	natgeo.scrape()
	recents = natgeo.get_recent_posts()

	#Filter list to separate images from videos
	recent_photos = [post for post in recents if not post.is_video]

	#Save photos in a loop
	for i, post in enumerate(recent_photos):
	from typing import List, Tuple

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	from matplotlib.axes import Axes
	from textacy import extract, make_spacy_doc


	def decompose_keyterms(keyterm_list: List[str]) -> Tuple:
	from itertools import count

	import matplotlib.pyplot as plt
	import networkx as nx
	import numpy as np
	import pandas as pd
	import textacy

	with open("news_article.txt", "r") as file:
	data = file.read().replace("\n", "")