Skip to content

Instantly share code, notes, and snippets.

@joshua-taylor
Created November 24, 2019 13:16
Show Gist options
  • Save joshua-taylor/3abad7e105e25cfb46f3c0f56ad16c85 to your computer and use it in GitHub Desktop.
Save joshua-taylor/3abad7e105e25cfb46f3c0f56ad16c85 to your computer and use it in GitHub Desktop.
BERT vectors and TFIDF
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import IPython
# Ask spaCy to run on the GPU; the returned flag is used below to decide
# whether torch should also default to CUDA tensors.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make newly created torch tensors CUDA float tensors by default.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

# Transformer pipeline that produces the per-token vectors (doc.tensor).
nlp = spacy.load("en_trf_bertbaseuncased_lg")

# Fit TF-IDF on the raw question texts. `df` is not defined in this script
# and must already exist with a `questionText` column.
vectorizer = TfidfVectorizer(min_df=0.0, lowercase=True)
tfidf = vectorizer.fit(df.questionText.values)
tkn = tfidf.build_tokenizer()  # not used further in this script

print('creating a lookup dictionary') #this speeds up the script significantly...
# Map every vocabulary term directly to its IDF weight so that the per-token
# loop needs one dict access instead of a vocabulary_ -> idf_ double lookup.
tfidf_lookup = {
    term: tfidf.idf_[column] for term, column in tfidf.vocabulary_.items()
}
import numpy as np
from tqdm import tqdm

# Weight applied to tokens that are missing from the TF-IDF vocabulary.
DEFAULT_WEIGHT = 0.5
# Length of the zero placeholder stored for documents that cannot be embedded.
EMBEDDING_DIM = 768

# Build one vector per question: every row of doc.tensor is scaled by the IDF
# weight of the matching token's lowercased text, then the rows are averaged.
vect = []
for doc in tqdm(nlp.pipe(df.questionText.values, batch_size=5000)):
    if len(doc) == 0:
        # Blank item: there are no tokens to average, so store the zero
        # placeholder directly instead of relying on an exception.
        vect.append(np.zeros(EMBEDDING_DIM))
        continue
    try:
        for cnt, wrd_vec in enumerate(doc.tensor):
            word = doc[cnt].text
            # Unknown words get a fixed fallback weight instead of failing.
            weight = tfidf_lookup.get(word.lower(), DEFAULT_WEIGHT)
            # Scale the row in place; the doc is discarded after this loop.
            doc.tensor[cnt] = wrd_vec * weight
        vect.append(np.mean(doc.tensor, axis=0))
    except Exception:
        # Best effort: a document that cannot be processed still gets a row,
        # keeping output rows aligned with the input order. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        vect.append(np.zeros(EMBEDDING_DIM))

vect = np.vstack(vect)
# Author's timing note: takes 39 sec without a GPU, 6 with one.
np.save('question_vects_tfidf.npy', vect)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment