Vikesh Singh Baghel (vikeshsingh37)
# Train a simple bag-of-words classifier on the augmented training set
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

train_text = df_train_augmented.text.tolist()
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_text)
clf = LogisticRegression(solver="lbfgs")
clf.fit(X=X_train, y=df_train_augmented.label.values)
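To sanity-check the trained classifier, one option is to score it on a held-out frame. A minimal sketch, assuming a df_test DataFrame with the same text and label columns (not shown in the gist):

from sklearn.metrics import accuracy_score

# Reuse the vectorizer fitted on the training text to transform the test text
X_test = vectorizer.transform(df_test.text.tolist())
preds = clf.predict(X_test)
print("Test accuracy:", accuracy_score(df_test.label.values, preds))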
import re

from snorkel.slicing import slicing_function

@slicing_function()
def short_link(x):
    """Return whether text matches common pattern for shortened ".ly" links."""
    return int(bool(re.search(r"\w+\.ly", x.text)))
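Slicing functions are applied much like labeling functions. A minimal sketch using Snorkel's PandasSFApplier, assuming a df_test frame like the one used in the labeling snippets (not shown in the gist):

from snorkel.slicing import PandasSFApplier

# Build the slice-membership matrix for the test set
sf_applier = PandasSFApplier([short_link])
S_test = sf_applier.apply(df_test)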
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

# Apply the synonym-replacement TF twice per example, keeping the originals as well
tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train)
import random
import nltk
from nltk.corpus import wordnet as wn
from snorkel.augmentation import transformation_function
nltk.download("wordnet", quiet=True)
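The gist preview cuts off before the transformation function itself. A minimal sketch of what tf_replace_word_with_synonym (used by the PandasTFApplier snippet above) could look like with these imports; the WordNet-based replacement below is an assumption, not the author's exact code:

@transformation_function()
def tf_replace_word_with_synonym(x):
    """Replace one randomly chosen word with a WordNet synonym, if one exists."""
    words = x.text.split()
    if not words:
        return None
    idx = random.randrange(len(words))
    synsets = wn.synsets(words[idx])
    if synsets:
        synonym = synsets[0].lemma_names()[0].replace("_", " ")
        x.text = " ".join(words[:idx] + [synonym] + words[idx + 1:])
        return x
    # Returning None tells the applier the transformation did not apply
    return None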
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")
from snorkel.labeling import PandasLFApplier
# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity]
# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
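Before fitting a label model, it is useful to inspect how the labeling functions behave. A minimal sketch using Snorkel's LFAnalysis:

from snorkel.labeling import LFAnalysis

# Coverage, overlap and conflict statistics for each labeling function
print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())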
from snorkel.labeling import labeling_function
from textblob import TextBlob
import re

# Label constants used by the labeling functions (Snorkel convention)
ABSTAIN = -1
SPAM = 1

@labeling_function()
def lf_keyword_my(x):
    """Many spam comments talk about 'my channel', 'my video', etc."""
    return SPAM if "my" in x.text.lower() else ABSTAIN
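The preview truncates before the remaining labeling functions referenced in the lfs list above (lf_regex_check_out, lf_short_comment, lf_textblob_polarity). Minimal sketches of what they could look like under the same SPAM/ABSTAIN convention; the bodies below are assumptions, not the author's exact code:

HAM = 0  # non-spam label, following the same convention as SPAM and ABSTAIN

@labeling_function()
def lf_regex_check_out(x):
    """Spam often asks readers to 'check out' a channel or link."""
    return SPAM if re.search(r"check.*out", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def lf_short_comment(x):
    """Very short comments are weak evidence of non-spam."""
    return HAM if len(x.text.split()) < 5 else ABSTAIN

@labeling_function()
def lf_textblob_polarity(x):
    """Strongly positive sentiment is weak evidence of non-spam."""
    return HAM if TextBlob(x.text).sentiment.polarity > 0.9 else ABSTAIN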
@vikeshsingh37
vikeshsingh37 / read_large_csvfile.py
Last active March 30, 2019 08:13
A sample script to process a huge CSV file in Python that otherwise cannot be processed in one go due to memory limitations
import pandas as pd

# Split the data into smaller chunks and process it in parts
chunk_size = 100000
required_data = pd.DataFrame()
for data in pd.read_csv(myfile, chunksize=chunk_size):
    # Convert the epoch-seconds timestamp to a timezone-aware datetime in IST
    data["datetime"] = pd.to_datetime(data["timestamp"], unit="s")
    data["datetime"] = data["datetime"].dt.tz_localize("UTC").dt.tz_convert("Asia/Kolkata")
    data["date"] = data["datetime"].dt.date
    data["week"] = data["datetime"].dt.week
    data["hour"] = data["datetime"].dt.hour
@vikeshsingh37
vikeshsingh37 / stack_model_example.py
Created February 23, 2019 16:17
Example of how to use stack_model.py
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from stack_model import stack_model

# Base learners whose predictions feed the meta-model
rf_model = RandomForestRegressor()
gbm_model = GradientBoostingRegressor()
en_model = ElasticNet()
base_models = [rf_model, gbm_model, en_model]

# Meta-model that learns how to combine the base-model predictions
meta_model = RandomForestRegressor(n_estimators=100)
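The preview stops before the models are actually used. A minimal sketch of the next step, assuming the stack_model constructor shown in stack_model.py below and hypothetical train_X / train_y training data:

# train_X and train_y are hypothetical feature and target data
stacker = stack_model(base_models, meta_model, train_X, train_y)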
@vikeshsingh37
vikeshsingh37 / stack_model.py
Created February 23, 2019 16:11
A stacking meta-estimator for regressors
# Author: Vikesh Singh Baghel
# Date: 16-Feb-2019
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
import pandas as pd

class stack_model:
    def __init__(self, base_models, meta_model, train_X, train_y):
        self.base_models = base_models
        self.meta_model = meta_model
        self.train_X = train_X
        self.train_y = train_y
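    # (The gist preview ends here. A minimal sketch of how the rest of the class
    # could work, based on the imports above: out-of-fold predictions from each
    # base model become features for the meta-model. Method names and details
    # are assumptions, not the author's original code.)
    def fit(self, n_splits=5):
        import numpy as np  # local import to keep this sketch self-contained
        X = self.train_X.values if hasattr(self.train_X, "values") else self.train_X
        y = self.train_y.values if hasattr(self.train_y, "values") else self.train_y
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        meta_features = np.zeros((len(X), len(self.base_models)))
        for j, model in enumerate(self.base_models):
            for train_idx, val_idx in kf.split(X):
                model.fit(X[train_idx], y[train_idx])
                meta_features[val_idx, j] = model.predict(X[val_idx])
            # Refit each base model on the full training data for later prediction
            model.fit(X, y)
        self.meta_model.fit(meta_features, y)
        return self

    def predict(self, X_new):
        import numpy as np
        X_new = X_new.values if hasattr(X_new, "values") else X_new
        base_preds = np.column_stack([m.predict(X_new) for m in self.base_models])
        return self.meta_model.predict(base_preds)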