Skip to content

Instantly share code, notes, and snippets.

Mohd Sanad Zaki Rizvi mohdsanadzakirizvi

Block or report user

Report or block mohdsanadzakirizvi

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
from sklearn.metrics import accuracy_score
# Report the validation accuracy of the BERT-embedding classifier.
bert_val_accuracy = accuracy_score(y_val, pred_bert)
print(bert_val_accuracy)
from sklearn.linear_model import LogisticRegression
# LR model: a simple linear classifier trained on top of the fixed
# (pre-computed) BERT sentence embeddings.
model_bert = LogisticRegression()
# train on the BERT embeddings of the training split.
# BUG FIX: the original line was truncated to "model_bert =, y_tr)";
# restored the fit() call (fit returns the estimator, so the
# reassignment is harmless and preserves the original intent).
model_bert = model_bert.fit(X_tr_bert, y_tr)
# predict labels for the validation split
pred_bert = model_bert.predict(X_val_bert)
from bert_serving.client import BertClient
# make a connection with the BERT server using its ip address
# NOTE(review): this blocks until the server responds — requires a
# running bert-as-service instance; replace the placeholder with a real IP.
bc = BertClient(ip="YOUR_SERVER_IP")
# get the embedding for train and val sets
# (assumes X_tr / X_val are pandas Series of strings — tolist() yields
# the list-of-sentences input the client expects; encode returns one
# fixed-size vector per sentence)
X_tr_bert = bc.encode(X_tr.tolist())
X_val_bert = bc.encode(X_val.tolist())
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation; fixed seed for reproducibility.
X_tr, X_val, y_tr, y_val = train_test_split(
    train.clean_text,
    train.label,
    test_size=0.25,
    random_state=42,
)
print('X_tr shape:', X_tr.shape)
import re
# clean text from noise
def clean_text(text):
    """Strip noise from a raw text string.

    Keeps only ASCII letters and apostrophes — every other character is
    replaced with a space — then removes any remaining non-ASCII
    characters, and returns the cleaned string.
    """
    # filter to allow only alphabets (and apostrophes); everything else
    # becomes a single space
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    # remove Unicode (non-ASCII) characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # BUG FIX: the original snippet never returned the cleaned text,
    # so callers received None
    return text
import pandas as pd
import numpy as np
# load training data
# (iso-8859-1 decodes any byte, so the read tolerates non-UTF-8
# characters in the raw CSV)
train = pd.read_csv('BERT_proj/train_E6oV3lV.csv', encoding='iso-8859-1')
from bert_serving.client import BertClient
# make a connection with the BERT server using its ip address; do not give any ip if same computer
bc = BertClient(ip="SERVER_IP_HERE")
# get the embedding for a single demo sentence
embedding = bc.encode(["I love data science and analytics vidhya."])
# check the shape of embedding, it should be 1x768
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Generate n_chars characters from a trained character-level model.

    NOTE(review): this snippet appears truncated — the loop encodes and
    pads the running text, but the prediction step, the append of the
    predicted character, and the return statement are missing from this
    view. `mapping` is assumed to be a char -> integer-index dict;
    `pad_sequences` is assumed imported elsewhere (keras preprocessing).
    """
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict character
# define model: character-level language model
# (Sequential/Embedding/GRU/Dense are assumed imported from keras elsewhere)
model = Sequential()
# learn 50-dim embeddings over a vocabulary of `vocab` symbols;
# input sequences are 30 tokens long
model.add(Embedding(vocab, 50, input_length=30, trainable=True))
# single recurrent layer with dropout for regularization
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
# softmax over the vocabulary — a next-character probability distribution
model.add(Dense(vocab, activation='softmax'))
# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model
# NOTE(review): the fit() call that this comment introduces appears to
# have been truncated out of this snippet.
from sklearn.model_selection import train_test_split
# vocabulary size (mapping is assumed to be the char -> index dict
# built earlier — TODO confirm against the missing preprocessing code)
vocab = len(mapping)
# `sequences` is assumed to be a list of equal-length integer-encoded
# sequences; convert to an array so 2-D slicing works below
sequences = np.array(sequences)
# create X and y: all but the last token is the input, the last token
# is the prediction target
X, y = sequences[:,:-1], sequences[:,-1]
# one hot encode y (to_categorical assumed imported from keras elsewhere)
y = to_categorical(y, num_classes=vocab)
# create train and validation sets
You can’t perform that action at this time.