import gensim

# Train a skip-gram (sg=1) Word2Vec model on the tokenized sentences in `data`,
# built in the tokenization snippet below (gensim 3.x API, where `size` is the vector dimensionality)
model_skipgram = gensim.models.Word2Vec(data, min_count=1, size=100, window=5, sg=1)

# The 300 terms closest to 'comorbidity' in the embedding space
similar_words_skipgram = {search_term: model_skipgram.wv.most_similar([search_term], topn=300)
                          for search_term in ['comorbidity']}
similar_words_skipgram
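A quick way to sanity-check the embedding is to look at the top few neighbors and their similarity scores; a minimal sketch, assuming the model trained above:

# Print the five nearest neighbors of 'comorbidity' (term, cosine similarity)
for term, score in similar_words_skipgram['comorbidity'][:5]:
    print("{}: {:.3f}".format(term, score))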
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

# Read the combined corpus and replace newlines with spaces so sentences
# aren't split across lines
sample = open('all_docs.txt', 'r', encoding='utf-8')
s = sample.read()
f = s.replace("\n", " ")

# Build a list of sentences, each a list of lowercased tokens
# (the format gensim's Word2Vec expects)
data = []
for i in tqdm(sent_tokenize(f)):
    temp = []
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)
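A quick check that `data` has the list-of-token-lists shape Word2Vec expects; illustrative:

print(len(data), "sentences tokenized")
print(data[0][:10])  # first ten lowercased tokens of the first sentence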
import re

with open('all_abstracts_lemmas.txt', 'r', encoding="utf-8") as f:
    lemma_text = f.read()

# Capture the n words on either side of 'comorbid'/'comorbidity'
n = 4
word = r'\W*([\w]+)'
text_search_como = re.findall(r'{}\W*{}{}'.format(word * n, '(?:comorbid|comorbidity)', word * n),
                              lemma_text)

# Flatten the tuples of captured context words, keeping only words longer than three characters
flatten_text_search = [element for sublist in text_search_como for element in sublist
                       if len(element) > 3]
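To see which context words turn up most often around the search term, the flattened list can be passed to a counter; a minimal sketch using the standard library:

from collections import Counter

# Ten most frequent words found within four words of 'comorbid'/'comorbidity'
print(Counter(flatten_text_search).most_common(10))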
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Lemmatize the token file and write the result out for the regex search above
with open('all_abstracts_tokens.txt', encoding="utf-8") as f, \
        open('all_abstracts_lemmas.txt', 'w', encoding="utf-8") as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    lemma = WordNetLemmatizer()
    lemmed = [lemma.lemmatize(word) for word in tokens]
    new_lem_text = ' '.join(lemmed)
    out_f.write(new_lem_text)
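Note that WordNetLemmatizer treats every word as a noun unless told otherwise, which shapes the output above; illustrative:

lemma = WordNetLemmatizer()
print(lemma.lemmatize('studies'))            # 'study'
print(lemma.lemmatize('running'))            # 'running' (noun sense, unchanged)
print(lemma.lemmatize('running', pos='v'))   # 'run'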
import json
from tqdm import tqdm

# Extract the abstract text from each article's JSON file
# (file_list is built in the snippet below)
docs = []
for file in tqdm(file_list):
    j = json.load(open(file, "rb"))
    abstract = ""
    try:
        if j['abstract']:
            for entry in j['abstract']:
                abstract += entry['text'] + '\n\n'
    except KeyError:
        pass
    docs.append(abstract)  # not in the scrape; collecting each abstract is assumed
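The gists don't show how the collected abstracts reach the all_docs.txt file read by the tokenization snippet above; a plausible bridging step, assumed rather than shown:

# Write all abstracts into one file; each already ends with a blank line (assumed step)
with open('all_docs.txt', 'w', encoding='utf-8') as out:
    out.write(''.join(docs))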
import os

# Collect and sort every .json file under file_dir (defined elsewhere)
counter = 0
file_list = []
for dirname, _, filenames in os.walk(file_dir):
    for filename in filenames:
        if filename.endswith(".json"):
            file_list.append(os.path.join(dirname, filename))
file_list.sort()
total_files = len(file_list); total_files
mlai-demo / tfidf.py
TF-IDF, pairwise similarity matrix, and pandas dataframe
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF-weight the documents; since TfidfVectorizer L2-normalizes rows by
# default, tfidf * tfidf.T is the matrix of pairwise cosine similarities
# (texts, my_stop_words, and titles are defined elsewhere in the notebook)
documents = [open(f).read() for f in texts]
tfidf = TfidfVectorizer(stop_words=my_stop_words).fit_transform(documents)
pairwise_similarity = tfidf * tfidf.T
pairwise_similarity_matrix = pairwise_similarity.todense()
psm_df = pd.DataFrame(pairwise_similarity_matrix, index=titles, columns=titles).round(3)
psm_df
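With the pairwise similarities in a DataFrame, the closest match to any document can be read off by dropping the diagonal entry; a minimal sketch, assuming the psm_df built above:

# Most similar *other* document to the first title (self-similarity is 1.0)
query = psm_df.index[0]
sims = psm_df.loc[query].drop(query)
print(sims.idxmax(), sims.max())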
import numpy as np

# Print each document's title with the indices of its four highest-weight NMF topics
for i in range(len(text_topic_nmf)):
    top_topics = np.argsort(text_topic_nmf[i, :])[::-1][0:4]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    print("{}: {}".format(titles[i], top_topics_str))
mlai-demo / text_topics.py
create text topics using NMF
import numpy as np

num_top_words = 10
topic_words_nmf = []
# deco_nmf is an NMF model (e.g. sklearn.decomposition.NMF); its construction is not shown in the gist
text_topic_nmf = deco_nmf.fit_transform(tm_sparse)
for topic in deco_nmf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words_nmf.append([vocab[i] for i in word_idx])
for t in range(len(topic_words_nmf)):
    # loop body cut off in the scrape; printing each topic's top words is assumed
    print("Topic {}: {}".format(t, ' '.join(topic_words_nmf[t])))
mlai-demo / vectorize_text.py
vectorize text, and create sparse matrix and numpy array
import numpy as np
import sklearn.feature_extraction.text as text

# Count-vectorize the corpus files into a document-term matrix
# (texts, my_stop_words, and text_number are defined elsewhere in the notebook)
vectorizer = text.CountVectorizer(input='filename', stop_words=my_stop_words, min_df=text_number)
tm_sparse = vectorizer.fit_transform(texts)
tm_array = tm_sparse.toarray()  # dense copy; the original refit the vectorizer needlessly
vocab = np.array(vectorizer.get_feature_names())
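A quick shape check confirms the document-term matrix and vocabulary line up before they are passed to NMF above; illustrative:

print(tm_sparse.shape)   # (n_documents, n_vocabulary_terms)
print(vocab[:10])        # first ten vocabulary terms, alphabetically sorted
assert tm_sparse.shape[1] == len(vocab)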