Skip to content

Instantly share code, notes, and snippets.

Mohd Sanad Zaki Rizvi mohdsanadzakirizvi

Block or report user

Report or block mohdsanadzakirizvi

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View bert_acc.py
from sklearn.metrics import accuracy_score

# Report validation accuracy of the classifier trained on BERT features.
bert_accuracy = accuracy_score(y_val, pred_bert)
print(bert_accuracy)
View bert_model.py
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the BERT sentence embeddings.
# `fit` returns the fitted estimator, so construction and training chain.
model_bert = LogisticRegression().fit(X_tr_bert, y_tr)

# Score the held-out validation embeddings.
pred_bert = model_bert.predict(X_val_bert)
View bert_embd.py
from bert_serving.client import BertClient

# Connect to the running bert-as-service server; replace with its IP address.
bc = BertClient(ip="YOUR_SERVER_IP")

# Encode the train and validation texts into fixed-size BERT sentence vectors.
X_tr_bert = bc.encode(X_tr.tolist())
X_val_bert = bc.encode(X_val.tolist())
View bert_split.py
from sklearn.model_selection import train_test_split

# Hold out 25% of the cleaned texts for validation; the fixed seed makes the
# split reproducible across runs.
X_tr, X_val, y_tr, y_val = train_test_split(
    train.clean_text,
    train.label,
    test_size=0.25,
    random_state=42,
)
print('X_tr shape:', X_tr.shape)
View bert_clean.py
import re


# clean text from noise
def clean_text(text):
    """Strip noise from ``text``, keeping only ASCII letters and apostrophes.

    Every disallowed character is replaced by a space, so word boundaries
    are preserved (the result may contain runs of spaces).

    Args:
        text: raw input string.

    Returns:
        The cleaned string.
    """
    # filter to allow only alphabets (and apostrophes); everything else
    # becomes a space so words do not get glued together
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    # remove any leftover non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # BUG FIX: the original snippet never returned, so callers received None.
    return text
View bert_data.py
import pandas as pd
import numpy as np

# load training data (latin-1 / ISO-8859-1 encoded CSV)
TRAIN_CSV = 'BERT_proj/train_E6oV3lV.csv'
train = pd.read_csv(TRAIN_CSV, encoding='iso-8859-1')
# inspect (rows, columns)
train.shape
View connect_bert.py
from bert_serving.client import BertClient

# Open a connection to the bert-as-service server; omit `ip` entirely when
# the server runs on this same machine.
bc = BertClient(ip="SERVER_IP_HERE")

# Embed a single sentence and sanity-check the vector shape — it should
# come back as 1x768.
embedding = bc.encode(["I love data science and analytics vidhya."])
print(embedding.shape)
View lm_infer.py
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Grow ``seed_text`` by generating ``n_chars`` characters from ``model``.

    ``mapping`` is indexed by character, so it is presumably a
    char -> integer-id dict built during preprocessing.

    NOTE(review): this snippet is truncated in the source — the prediction
    step and the return statement are missing after the final comment, so
    the function as shown does not yet produce output.
    """
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict character
View lm_model.py
# Character-level language model: embedding -> GRU -> softmax over the vocab.
model = Sequential([
    Embedding(vocab, 50, input_length=30, trainable=True),
    GRU(150, recurrent_dropout=0.1, dropout=0.1),
    Dense(vocab, activation='softmax'),
])
print(model.summary())

# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model (the fit() call itself is truncated in this snippet)
View lm_val.py
from sklearn.model_selection import train_test_split

# vocabulary size: one output class per distinct character in the mapping
vocab = len(mapping)

sequences = np.array(sequences)
# Each row predicts its final character from all the preceding ones.
X = sequences[:, :-1]
y = sequences[:, -1]
# one-hot encode the targets
y = to_categorical(y, num_classes=vocab)
# create train and validation sets (the split call is truncated in this snippet)
You can’t perform that action at this time.