Prashanth Rao (prrao87)
prrao87 / transform_stance.py
Created January 12, 2019 23:40
classification task-head for transformer
import numpy as np

def transform_stance(X1):
    # Input transform for the classification task-head
    # (encoder, n_ctx, max_len and clf_token are assumed to be defined in the surrounding training script)
    n_batch = len(X1)
    xmb = np.zeros((n_batch, 1, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, 1, n_ctx), dtype=np.float32)
    start = encoder['_start_']
    for i, x1 in enumerate(X1):
        # Wrap each encoded tweet with the start and classification tokens, truncating to max_len
        x12 = [start] + x1[:max_len] + [clf_token]
        l12 = len(x12)
        xmb[i, 0, :l12, 0] = x12
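The preview cuts off above; a hedged sketch of how such a transform typically finishes in OpenAI-style finetuning code (n_vocab and n_special are assumed vocabulary and special-token counts, not taken from the gist):

        mmb[i, 0, :l12] = 1  # mark real (non-padding) positions in the mask
    # Append position indices after the token + special-token vocabulary
    xmb[:, :, :, 1] = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)
    return xmb, mmb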
prrao87 / clean_stance.py
Created January 13, 2019 00:03
clean input tweet data to only have ASCII characters
import pandas as pd

def _stance(path, topic=None):
    def clean_ascii(text):
        # Remove non-ASCII characters from the raw tweet text
        return ''.join(i for i in text if ord(i) < 128)
    orig = pd.read_csv(path, delimiter='\t', header=0, encoding="latin-1")
    orig['Tweet'] = orig['Tweet'].apply(clean_ascii)
    df = orig
    # Keep only those tweets that pertain to a single topic in the training data
    if topic is not None:
        df = df.loc[df['Target'] == topic]
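The preview ends before the return; a possible continuation (a sketch, assuming the SemEval-2016 Task 6 label set of AGAINST / NONE / FAVOR):

    # Map the stance labels to integers and return (tweets, labels)
    stances = ['AGAINST', 'NONE', 'FAVOR']
    X = df['Tweet'].values
    Y = df['Stance'].apply(stances.index).values
    return X, Y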
prrao87 / split_input_stance.py
Created January 13, 2019 00:04
split tweet data into training, validation and test sets for the transformer
from pathlib import Path
from sklearn.model_selection import train_test_split

def stance(data_dir, topic=None):
    # `seed` and `_stance` are assumed to come from the surrounding data-loading module
    path = Path(data_dir)
    trainfile = 'semeval2016-task6-trainingdata.txt'
    testfile = 'SemEval2016-Task6-subtaskA-testdata.txt'
    X, Y = _stance(path/trainfile, topic=topic)
    teX, _ = _stance(path/testfile, topic=topic)
    # Hold out 20% of the training tweets as a validation set
    tr_text, va_text, tr_sent, va_sent = train_test_split(X, Y, test_size=0.2, random_state=seed)
    trX = []
    trY = []
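The preview stops mid-function; one plausible continuation (a sketch, not the gist's actual code) collects the splits into plain lists and returns them:

    for t, s in zip(tr_text, tr_sent):
        trX.append(t)
        trY.append(s)
    vaX, vaY = list(va_text), list(va_sent)
    return (trX, trY), (vaX, vaY), teX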
prrao87 / tree2tabular.py
Last active August 26, 2019 15:58
Convert SST-5 tree data to tabular form
# Load the SST-5 data as parse trees using the pytreebank library (pip install pytreebank)
import pytreebank
import sys
import os

out_path = os.path.join(sys.path[0], 'sst_{}.txt')
dataset = pytreebank.load_sst('./raw_data')

# Store train, dev and test in separate files
for category in ['train', 'test', 'dev']:
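The loop body is cut off in the preview; a hedged sketch of what it might do, using pytreebank's to_labeled_lines() to flatten each tree into (label, text) pairs with the full sentence first (the tab-separated output layout is an assumption):

    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            # Keep only the root-level (full-sentence) label for each tree
            label, sentence = item.to_labeled_lines()[0]
            outfile.write("{}\t{}\n".format(label, sentence))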
prrao87 / base_utils.py
Created August 26, 2019 20:25
Base utilities class for all classifiers
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score


class Base:
    """Base class that houses common utilities for reading in test data
    and calculating model accuracy and F1 scores.
    """
    def __init__(self) -> None:
        pass
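The preview only shows the constructor; below is a minimal sketch of the utilities the docstring describes, using the pandas and scikit-learn imports shown above (the method names, the tab-separated file layout, and the 'truth'/'text'/'pred' column names are illustrative assumptions, not taken from the gist):

    def read_data(self, fname: str, lower_case: bool = False) -> pd.DataFrame:
        # Assumed layout: one "label<TAB>text" pair per line
        df = pd.read_csv(fname, sep='\t', header=None, names=['truth', 'text'])
        if lower_case:
            df['text'] = df['text'].str.lower()
        return df

    def accuracy(self, df: pd.DataFrame) -> None:
        # Compare predicted vs. true labels with sklearn metrics
        acc = accuracy_score(df['truth'], df['pred']) * 100
        f1 = f1_score(df['truth'], df['pred'], average='macro') * 100
        print("Accuracy: {:.2f}%\nMacro F1-score: {:.2f}".format(acc, f1))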
prrao87 / example_sentiment_class.py
Last active August 29, 2019 14:51
Example sentiment predictor class
class ExampleSentiment(Base):
    """Predict sentiment scores using X classifier"""
    def __init__(self, model_file: str=None) -> None:
        super().__init__()  # Inherit methods from Base class

    def score(self, text: str) -> int:
        """Return a sentiment score on sample text, an integer in the range [1, 2, 3, 4, 5]"""
        # Apply some sentiment scoring technique here

    def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:


class TextBlobSentiment(Base):
    """Predict fine-grained sentiment classes using TextBlob."""
    def __init__(self, model_file: str=None) -> None:
        super().__init__()

    def score(self, text: str) -> float:
        # pip install textblob
        from textblob import TextBlob
        return TextBlob(text).sentiment.polarity


class VaderSentiment(Base):
    """Predict fine-grained sentiment classes using Vader."""
    def __init__(self, model_file: str=None) -> None:
        super().__init__()
        # pip install nltk, then download the 'vader_lexicon' resource
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        self.vader = SentimentIntensityAnalyzer()

    def score(self, text: str) -> float:
        return self.vader.polarity_scores(text)['compound']


class LogisticRegressionSentiment(Base):
    """Predict fine-grained sentiment scores using a sklearn Logistic Regression pipeline."""
    def __init__(self, model_file: str=None) -> None:
        super().__init__()
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.linear_model import LogisticRegression
        from sklearn.pipeline import Pipeline
        self.pipeline = Pipeline(
            [
                ('vect', CountVectorizer()),


class SVMSentiment(Base):
    """Predict fine-grained sentiment scores using a sklearn
    linear Support Vector Machine (SVM) pipeline."""
    def __init__(self, model_file: str=None) -> None:
        super().__init__()
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.linear_model import SGDClassifier
        from sklearn.pipeline import Pipeline
        self.pipeline = Pipeline(
            [
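A hypothetical usage sketch for the rule-based scorers defined above (assumes textblob and nltk are installed and Base is importable; the sample sentence is illustrative):

import nltk
nltk.download('vader_lexicon', quiet=True)  # lexicon required by SentimentIntensityAnalyzer

vader = VaderSentiment()
textblob = TextBlobSentiment()
sample = "The movie was surprisingly good!"
print(vader.score(sample))     # VADER compound score in [-1.0, 1.0]
print(textblob.score(sample))  # TextBlob polarity in [-1.0, 1.0]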