import gensim

# Train a skip-gram Word2Vec model (sg=1) on the tokenised sentences.
# Note: vector_size was called `size` in gensim < 4.0.
model_skipgram = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

# most_similar already returns a list of (word, similarity) pairs.
similar_words_skipgram = {search_term: model_skipgram.wv.most_similar(search_term, topn=300)
                          for search_term in ['comorbidity']}
similar_words_skipgram
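For a quick look at the result, the top neighbours can be printed directly; a minimal inspection sketch, assuming the dictionary built above:

# Show the ten nearest neighbours of 'comorbidity' with their cosine similarities.
for word, score in similar_words_skipgram['comorbidity'][:10]:
    print(f"{word}\t{score:.3f}")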
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

# Read the combined abstracts and flatten newlines into spaces.
with open('all_docs.txt', 'r', encoding='utf-8') as sample:
    s = sample.read()
f = s.replace("\n", " ")

# Build a list of sentences, each a list of lower-cased tokens,
# in the format gensim's Word2Vec expects.
data = []
for sentence in tqdm(sent_tokenize(f)):
    data.append([token.lower() for token in word_tokenize(sentence)])
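sent_tokenize and word_tokenize rely on NLTK's Punkt models; if they are not installed yet, a one-time download is needed first:

import nltk
nltk.download('punkt')  # one-time download of the Punkt tokenizer models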
import re

with open('all_abstracts_lemmas.txt', 'r', encoding="utf-8") as f:
    lemma_text = f.read()

# Capture the n words on either side of 'comorbid'/'comorbidity'.
n = 4
word = r'\W*([\w]+)'
text_search_como = re.findall(r'{}\W*{}{}'.format(word * n, '(?:comorbid|comorbidity)', word * n),
                              lemma_text)

# Flatten the tuples of context words, keeping only words longer than 3 characters.
flatten_text_search = [element for sublist in text_search_como for element in sublist
                       if len(element) > 3]
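A natural way to inspect these context words is to rank them by frequency; a minimal sketch, assuming the flattened list above:

from collections import Counter

# Illustrative step: the 20 most common words appearing near 'comorbid*'.
Counter(flatten_text_search).most_common(20)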
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Lemmatise every token and write the result back out as one space-joined string.
with open('all_abstracts_tokens.txt', encoding="utf-8") as f, \
     open('all_abstracts_lemmas.txt', 'w', encoding="utf-8") as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    lemma = WordNetLemmatizer()
    lemmed = [lemma.lemmatize(word) for word in tokens]
    out_f.write(' '.join(lemmed))
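WordNetLemmatizer needs the WordNet corpus, and without a POS argument it treats every token as a noun, so verbs are left mostly untouched; a small sketch of both points:

import nltk
nltk.download('wordnet')  # one-time download of the WordNet corpus

lemma = WordNetLemmatizer()
lemma.lemmatize('running')           # 'running' (default POS is noun)
lemma.lemmatize('running', pos='v')  # 'run'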
import json
from tqdm import tqdm

# Collect the abstract text from every JSON file
# ('abstract' is a list of paragraph entries, each with a 'text' field).
docs = []
for file in tqdm(file_list):
    with open(file, "rb") as fp:
        j = json.load(fp)
    abstract = ""
    for entry in j.get('abstract', []):
        abstract += entry['text'] + '\n\n'
    docs.append(abstract)
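The Word2Vec cell reads from 'all_docs.txt', so the collected abstracts presumably get written out at this point; a minimal sketch, assuming that file is simply the concatenated abstracts:

# Hypothetical save step: write all abstracts into the file the tokeniser reads.
with open('all_docs.txt', 'w', encoding='utf-8') as out:
    out.write('\n'.join(docs))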
import os

# Walk the data directory and collect every .json file path, sorted for reproducibility.
file_list = []
for dirname, _, filenames in os.walk(file_dir):
    for filename in filenames:
        if filename.endswith(".json"):
            file_list.append(os.path.join(dirname, filename))
file_list.sort()
total_files = len(file_list); total_files
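For reference, the same listing can be done in one line with glob (same file_dir, recursive search):

import glob

# Equivalent recursive listing of all .json files under file_dir.
file_list = sorted(glob.glob(os.path.join(file_dir, '**', '*.json'), recursive=True))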
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [Path(f).read_text(encoding='utf-8') for f in texts]

# TfidfVectorizer L2-normalises each row, so tfidf * tfidf.T is the
# matrix of pairwise cosine similarities between documents.
tfidf = TfidfVectorizer(stop_words=my_stop_words).fit_transform(documents)
pairwise_similarity = tfidf * tfidf.T
pairwise_similarity_matrix = pairwise_similarity.todense()
psm_df = pd.DataFrame(pairwise_similarity_matrix, index=titles, columns=titles).round(3)
psm_df
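To pull out the most similar pair of distinct documents, the diagonal (each document's similarity with itself) has to be masked first; a minimal sketch:

import numpy as np

# Illustrative step: zero the self-similarity diagonal, then find the largest entry.
sim = np.asarray(pairwise_similarity_matrix)
np.fill_diagonal(sim, 0)
i, j = np.unravel_index(sim.argmax(), sim.shape)
print(titles[i], '<->', titles[j], round(sim[i, j], 3))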
# For each document, print the four topics with the highest NMF weights.
for i in range(len(text_topic_nmf)):
    top_topics = np.argsort(text_topic_nmf[i, :])[::-1][:4]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    print("{}: {}".format(titles[i], top_topics_str))
# Extract the ten strongest words for every NMF topic.
num_top_words = 10
topic_words_nmf = []

# Factorise the document-term matrix: rows of components_ are topics over the vocabulary.
text_topic_nmf = deco_nmf.fit_transform(tm_sparse)
for topic in deco_nmf.components_:
    word_idx = np.argsort(topic)[::-1][:num_top_words]
    topic_words_nmf.append([vocab[i] for i in word_idx])

# Print the top words of each topic.
for t in range(len(topic_words_nmf)):
    print("Topic {}: {}".format(t, ' '.join(topic_words_nmf[t])))
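deco_nmf itself is not defined in these cells; a minimal sketch of how it could be constructed (the number of topics is an assumption):

from sklearn.decomposition import NMF

# Hypothetical model setup; n_components (the topic count) is a free choice.
deco_nmf = NMF(n_components=10, random_state=0)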
import numpy as np
import sklearn.feature_extraction.text as text

# input='filename' means `texts` is a list of file paths, read on the fly;
# min_df drops words appearing in fewer than `text_number` documents.
vectorizer = text.CountVectorizer(input='filename', stop_words=my_stop_words, min_df=text_number)
tm_sparse = vectorizer.fit_transform(texts)
tm_array = tm_sparse.toarray()  # no need to fit a second time

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = vectorizer.get_feature_names_out()
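my_stop_words and text_number are also defined elsewhere; a plausible sketch, with the extra domain terms purely illustrative:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Hypothetical: extend the built-in English stop list with corpus-specific noise words.
my_stop_words = list(ENGLISH_STOP_WORDS.union({'preprint', 'copyright', 'et', 'al'}))
text_number = 5  # illustrative min_df threshold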