-
-
Save xeoncross/7ae8bb98d9a8168e79f9bafd5f2ebd39 to your computer and use it in GitHub Desktop.
Topic Modeling with Spacy and Gensim
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<component name="ProjectDictionaryState">
  <dictionary name="ravidrichards">
    <words>
      <w>readlines</w>
      <w>spacy</w>
    </words>
  </dictionary>
</component>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.5.2 (~/anaconda/bin/python)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.2 (~/anaconda/bin/python)" project-jdk-type="Python SDK" />
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/KJV_Spacy.iml" filepath="$PROJECT_DIR$/.idea/KJV_Spacy.iml" />
    </modules>
  </component>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
source: 01_gensim_prepro.py | |
purpose: Created dictionary and corpus from raw text after cleaning | |
author: David Richards | |
PLAN | |
* Improve lemmatization (spacy?) | |
* Add sense2vec | |
* Autofind book & chapter breaks (treat each as own "topic volume") | |
""" | |
# IMPORTS | |
import os | |
import string | |
import gensim | |
import nltk | |
from nltk.stem.wordnet import WordNetLemmatizer | |
# GLOBALS | |
SCOPE_NAME = 'gen1' | |
MODELS_DIR = os.path.expanduser('~/Documents/Coding/KJV_Modeler/Models/') | |
def main():
    """Build and persist a gensim dictionary and bag-of-words corpus from the KJV.

    Reads verses 3..35 of the NLTK Gutenberg KJV corpus (Genesis 1),
    lowercases and lemmatizes the tokens, strips punctuation,
    single-character words, digits, stopwords, and hapax legomena, then
    saves the dictionary and an MmCorpus under MODELS_DIR/SCOPE_NAME.
    """
    # GET TEXT
    # KJV, parsed by verse, by word.
    verses = nltk.corpus.gutenberg.sents("bible-kjv.txt")
    # verses[3:] skips the KJV title; slice 3:36 covers Genesis 1
    # (Gen = 3:1471 in the full corpus).
    texts = [[word.lower() for word in verse] for verse in verses[3:36]]

    # CREATE & CLEAN DICTIONARY
    # Remove punctuation, single-character words, numbers, stopwords,
    # and words used only once.
    lmtzr = WordNetLemmatizer()
    punct = set(string.punctuation)
    texts = [
        [
            lmtzr.lemmatize(word)
            for word in text
            if word not in punct and len(word) > 1 and not word.isdigit()
        ]
        for text in texts
    ]
    dictionary = gensim.corpora.Dictionary(texts)
    # BUG FIX: token2id.get returns None for stopwords that never made it
    # into the dictionary; drop those so filter_tokens only sees real ids.
    stopword_ids = [
        tid for tid in map(dictionary.token2id.get, stopwords()) if tid is not None
    ]
    dictionary.filter_tokens(stopword_ids)
    # Drop hapax legomena (document frequency of exactly one).
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # SAVE DICTIONARY & CORPUS
    dictionary.save(MODELS_DIR + SCOPE_NAME + '.dict')
    dictionary.save_as_text(MODELS_DIR + SCOPE_NAME + '_dict.txt')
    corpus = [dictionary.doc2bow(text) for text in texts]
    gensim.corpora.MmCorpus.serialize(MODELS_DIR + SCOPE_NAME + '.mm', corpus)
def stopwords():
    """Return the English NLTK stopword set plus corpus-specific extras."""
    # Archaic/KJV-specific terms ("wa" is the lemmatized form of "was").
    extras = {'cannot', 'could', 'unto', 'wa'}
    return extras | set(nltk.corpus.stopwords.words('english'))
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
source: 01_tokenize.py | |
from: | |
author: david richards | |
date: 2017-01-25 | |
""" | |
# IMPORTS | |
import logging | |
import os | |
from collections import Counter | |
import gensim | |
import spacy | |
import textacy | |
from gensim import corpora, models, similarities | |
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) | |
# GLOBALS | |
nlp = spacy.load('en') | |
FILE_NAME = 'gen1.txt' | |
FILE_DIR = os.path.expanduser('~/Documents/Coding/KJV_Spacy/SourceText/') | |
MODEL_DIR = os.path.expanduser('~/Documents/Coding/KJV_Spacy/Models/') | |
def main():
    """Tokenize the source text with spaCy and persist a gensim dictionary/corpus.

    Reads FILE_DIR/FILE_NAME, segments it into sentences with the
    module-level spaCy pipeline, builds a gensim Dictionary and
    bag-of-words corpus, saves both under MODEL_DIR, and fits a TF-IDF
    model over the corpus.
    """
    # Process `text` with the spaCy nlp parser.
    with open(FILE_DIR + FILE_NAME, 'r') as file:
        text = file.read()
    # TODO: what are the file options? readlines, decode(utf-8)?
    doc = nlp(text)

    # One token list per spaCy-detected sentence.
    texts = [[str(word) for word in sent] for sent in doc.sents]
    dictionary = gensim.corpora.Dictionary(texts)
    dictionary.save(MODEL_DIR + 'gen1.dict')
    dictionary.save_as_text(MODEL_DIR + 'gen1_dict.txt')
    corpus = [dictionary.doc2bow(text) for text in texts]
    # NOTE(review): the TF-IDF model is fit but not yet saved or consumed
    # downstream — presumably a later stage will use it.
    tfidf = models.TfidfModel(corpus)
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
�cgensim.corpora.dictionary | |
Dictionary | |
q )�q}q(X num_posqM�X num_nnzqM�X __scipysq]qX id2tokenq}qX num_docsq K<X dfsq | |
}q(K K!KKKKKKKKKKKKKKKKK KK | |
KKKKKK | |
KKKKKKKKKKKKKKKKK | |
KKKKKKKKKKKKKKKKKKKKK K | |
K!KK"KK#KK$KK%KK&KK'KK(KK)KK*KK+KK,KK-KK.KK/KK0K K1KK2KK3KK4KK5K | |
K6KK7KK8KK9KK:KK;KK<KK=KK>KK?KK@KKAKKBKKCKKDKKEKKFKKGKKHKKIKKJKKKKKLKKMKKNKKOKKPKKQKKRKKSKKTKKUKKVKKWKKXKKYKKZKK[KK\KK]KK^KK_KK`KKaKKbKKcKKdKKeKKfKKgKKhKKiKKjKKkKKlKKmKKnKKoKKpKKqKKrKKsKKtKKuKKvKKwKKxKKyKKzKK{KK|KK}KK~KKKK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KuX | |