Skip to content

Instantly share code, notes, and snippets.

View oliver-batey's full-sized avatar

Oliver Batey oliver-batey

  • Sunderland, United Kingdom
View GitHub Profile
@oliver-batey
oliver-batey / sampling.py
Last active March 22, 2022 11:48
Building a sampling distribution
import pandas as pd
from sklearn.metrics import (
accuracy_score,
auc,
precision_score,
recall_score,
roc_curve,
)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
@oliver-batey
oliver-batey / pipeline.py
Last active March 22, 2022 16:40
Pipeline example
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
# Define the names of the text and numerical features.
# NOTE(review): text_features is a bare string rather than a one-element list;
# presumably it is used as a ColumnTransformer column selector, where a scalar
# string hands the transformer a 1-D column (what CountVectorizer expects)
# — confirm against the pipeline definition.
text_features = "text"
numerical_features = ["n_words", "mean_word_length"]
@oliver-batey
oliver-batey / gridsearchcv.py
Last active March 22, 2022 13:16
Example of a gridsearch in scikit-learn
from sklearn.model_selection import GridSearchCV
# params is a list of dictionaries (one per parameter grid); in each dictionary the keys are
# hyperparameter names and the values are lists of candidate values to search over.
params = [
{
"transform__txt__max_features": [None, 100, 10],
"transform__num__selector__attribute_names": [
["n_words"],
["mean_word_length"],
from sklearn.base import BaseEstimator, TransformerMixin
class CountWords(BaseEstimator, TransformerMixin):
# Creates a dataframe from a series of text documents by creating a new column named n_words,
# that contains the number of words in each document
def __init__(self, new_col_name):
self.new_col_name = new_col_name
def fit(self, series, y=None):
@oliver-batey
oliver-batey / get_post_info.py
Created December 22, 2020 20:22
Instagram Post Info
import pandas as pd
from instascrape import Post, Profile
def get_post_data(post_object,
attributes=['caption',
'upload_date',
'location',
'likes',
'comments',
'id']):
from instascrape import Hashtag
#Substitute 'ad' with the word you
#want to search for (as a string)
hashtag = Hashtag('ad')
#Scrape the hashtag page
hashtag.scrape()
#Get list of the recent posts
recents = hashtag.get_recent_posts()
@oliver-batey
oliver-batey / download_images.py
Created December 23, 2020 20:16
Download instagram images
#Scrape profile and get recent posts
natgeo = Profile('natgeo')
natgeo.scrape()
recents = natgeo.get_recent_posts()
#Filter list to separate images from videos
recent_photos = [post for post in recents if not post.is_video]
#Save photos in a loop
for i, post in enumerate(recent_photos):
@oliver-batey
oliver-batey / plot_keyterms.py
Last active November 29, 2021 11:45
Plot keyterms of a document
from typing import List, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from textacy import extract, make_spacy_doc
def decompose_keyterms(keyterm_list: List[str]) -> Tuple:
@oliver-batey
oliver-batey / subject_dependencies.py
Last active November 28, 2021 19:55
Calculate the distance between nodes of dependency network
from itertools import count
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import textacy
with open("news_article.txt", "r") as file:
data = file.read().replace("\n", "")
@oliver-batey
oliver-batey / filter_language_structures.py
Last active November 28, 2021 18:29
Get relevant SVO patterns and sentences
import numpy as np
import pandas as pd
from textacy import extract, make_spacy_doc
# Load the entire article text
with open("news_article.txt", "r") as file:
data = file.read().replace("\n", "")
article = data.replace(u"\xa0", u" ")
# Create doc object