Skip to content

Instantly share code, notes, and snippets.

@denjn5
Created February 4, 2017 10:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save denjn5/404a99cd494942fe97b36e773d9b147a to your computer and use it in GitHub Desktop.
Save denjn5/404a99cd494942fe97b36e773d9b147a to your computer and use it in GitHub Desktop.
Topic Modeling with Spacy and Gensim
<component name="ProjectDictionaryState">
<dictionary name="ravidrichards">
<words>
<w>readlines</w>
<w>spacy</w>
</words>
</dictionary>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.5.2 (~/anaconda/bin/python)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.2 (~/anaconda/bin/python)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/KJV_Spacy.iml" filepath="$PROJECT_DIR$/.idea/KJV_Spacy.iml" />
</modules>
</component>
</project>
"""
source: 01_gensim_prepro.py
purpose: Create a dictionary and corpus from raw text after cleaning
author: David Richards
PLAN
* Improve lemmatization (spacy?)
* Add sense2vec
* Autofind book & chapter breaks (treat each as own "topic volume")
"""
# IMPORTS
import os
import string
import gensim
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
# GLOBALS
SCOPE_NAME = 'gen1'  # prefix for saved artifacts (dictionary, corpus) — presumably Genesis 1
MODELS_DIR = os.path.expanduser('~/Documents/Coding/KJV_Modeler/Models/')  # output directory for saved models
def main():
    """Build and save a gensim dictionary and bag-of-words corpus from the KJV.

    Reads the KJV from the NLTK Gutenberg corpus, lowercases each verse,
    lemmatizes, strips punctuation / single-character tokens / numbers /
    stopwords / words used only once, then saves the dictionary (binary and
    text) and the serialized MM corpus under MODELS_DIR with SCOPE_NAME.
    """
    # GET TEXT
    verses = nltk.corpus.gutenberg.sents("bible-kjv.txt")  # KJV, parsed by verse, by word
    # verses[3:] skips the KJV title; slice 3:36 covers the verses of interest (Gen = 3:1471)
    texts = [[word.lower() for word in verse] for verse in verses[3:36]]

    # CREATE & CLEAN DICTIONARY
    # remove punctuation, single-character words, numbers, stopwords, and words used only once
    lmtzr = WordNetLemmatizer()
    punct = set(string.punctuation)
    texts = [[lmtzr.lemmatize(word) for word in text
              if word not in punct and len(word) > 1 and not word.isdigit()]
             for text in texts]
    dictionary = gensim.corpora.Dictionary(texts)
    # token2id.get returns None for stopwords that never made it into the
    # dictionary — drop those before filtering.
    stopword_ids = [tid for tid in map(dictionary.token2id.get, stopwords())
                    if tid is not None]
    dictionary.filter_tokens(stopword_ids)
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # SAVE DICTIONARY & CORPUS
    dictionary.save(MODELS_DIR + SCOPE_NAME + '.dict')
    dictionary.save_as_text(MODELS_DIR + SCOPE_NAME + '_dict.txt')
    # pprint(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    gensim.corpora.MmCorpus.serialize(MODELS_DIR + SCOPE_NAME + '.mm', corpus)
def stopwords():
    """Return NLTK's English stopword set extended with corpus-specific extras."""
    extras = {'cannot', 'could', 'unto', 'wa'}  # 'wa' is the lemmatized form of 'was'
    return extras.union(nltk.corpus.stopwords.words('english'))
# Entry point: build and persist the dictionary/corpus when run as a script.
if __name__ == "__main__":
    main()
"""
source: 01_tokenize.py
from:
author: david richards
date: 2017-01-25
"""
# IMPORTS
import logging
import os
from collections import Counter
import gensim
import spacy
import textacy
from gensim import corpora, models, similarities
# Log gensim/spacy progress at INFO level with timestamps.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# GLOBALS
nlp = spacy.load('en')  # spaCy English pipeline (tokenizer/tagger/parser/NER)
FILE_NAME = 'gen1.txt'  # source text — presumably Genesis 1
FILE_DIR = os.path.expanduser('~/Documents/Coding/KJV_Spacy/SourceText/')  # input directory
MODEL_DIR = os.path.expanduser('~/Documents/Coding/KJV_Spacy/Models/')  # output directory
def main():
    """Tokenize FILE_NAME with spaCy and persist a gensim dictionary/corpus.

    Reads the source text, sentence-splits it with spaCy, builds a gensim
    Dictionary over the per-sentence token lists, saves it in binary and
    text form, then builds a bag-of-words corpus and a TF-IDF model.
    """
    # Process `text` with the spaCy nlp parser
    # TODO: what are the file options? readlines, decode(utf-8)?
    with open(FILE_DIR + FILE_NAME, 'r') as file:
        text = file.read()
    doc = nlp(text)

    # One token list per sentence.
    texts = [[str(word) for word in sent] for sent in doc.sents]
    dictionary = gensim.corpora.Dictionary(texts)
    dictionary.save(MODEL_DIR + 'gen1.dict')
    dictionary.save_as_text(MODEL_DIR + 'gen1_dict.txt')

    corpus = [dictionary.doc2bow(text) for text in texts]
    # NOTE(review): tfidf is built but not yet used downstream — kept for
    # later modeling steps; confirm intent.
    tfidf = models.TfidfModel(corpus)
# keywords = Counter()
# for chunk in doc.noun_chunks:
# if nlp.vocab[chunk.lemma_].prob < - 8: # probability value -8 is arbitrarily selected threshold
# keywords[chunk.lemma_] += 1
#
# print(keywords.most_common(20))
# Iterate over base NPs, e.g. "all their good ideas"
# for np in doc.noun_chunks:
# # Only keep adjectives and nouns, e.g. "good ideas"
# while len(np) > 1 and np[0].dep_ not in ('amod', 'compound'):
# np = np[1:]
# if len(np) > 1:
# # Merge the tokens, e.g. good_ideas
# np.merge(np.root.tag_, np.text, np.root.ent_type_)
# # Iterate over named entities
# for ent in doc.ents:
# if len(ent) > 1:
# # Merge them into single tokens
# ent.merge(ent.root.tag_, ent.text, ent.label_)
# tokens = [token.lemma_ for token in doc if token.isalpha() and token.is_stop == False]
# Entry point: tokenize and persist the dictionary/corpus when run as a script.
if __name__ == "__main__":
    main()
�cgensim.corpora.dictionary
Dictionary
q)�q}q(Xnum_posqM�Xnum_nnzqM�X__scipysq]qXid2tokenq}qXnum_docsq K<Xdfsq
}q (KK!KKKKKKKKKKKKKKKKK KK
KK KK KK
KKKKKKKKKKKKKKKKK
KKKKKKKKKKKKKKKKKKKKK K
K!KK"KK#KK$K K%KK&KK'KK(KK)KK*KK+KK,KK-KK.KK/KK0K K1KK2KK3KK4KK5K
K6KK7KK8KK9KK:KK;KK<KK=KK>KK?KK@KKAKKBKKCKKDKKEKKFKKGKKHKKIKKJKKKKKLKKMKKNKKOKKPKKQKKRKKSKKTKKUKKVKKWKKXKKYKKZKK[KK\KK]KK^KK_KK`KKaKKbKKcKKdKKeKKfKKgKKhKKiKKjKKkKKlKKmKKnKKoKKpKKqKKrKKsKKtKKuKKvKKwKKxKKyKKzKK{KK|KK}KK~KKKK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KK�KuX
__ignoredsq ]q
X__recursive_saveloadsq]qX__numpysq]qXtoken2idq}q(X22qK�XeveningqK.XusqK�XsignsqKgXthatqK$XimageqK�X gatheringqKNXthirdqKbXtwoqKtX25qK�XtheqKX18qKzXbearingq K�X28q!K�Xblessedq"K�X
abundantlyq#KX.q$KXhimq%K�X11q&KPX17q'KwXNightq(K-Xwhoseq)KVXalsoq*KpXlivingq+K�Xwithoutq,KXHeavenq-KAXbringq.K[X1q/KXBeq0K�Xmorningq1K1Xcreatedq2KX5q3K)Xmovedq4KXyouq5K�X
q6KXfirstq7K2Xgoodq8K%Xfruitfulq9K�Xbeholdq:K�X14q;KcXfromq<K(Xyieldingq=K^X7q>K:Xmaleq?K�Xfruitq@K\XgreatqAKrXwhalesqBK�XlesserqCKsX firmamentqDK4XtheirqEK�X4qFK"XthereqGKXstarsqHKvXmadeqIK;XopenqJK�XsawqKK'XcreatureqLK�XbeastqMK�XdivideqNK9XgreenqOK�XdominionqPK�XinqQK5XwhichqRK>X26qSK�XuntoqTKJXnightqUKkXmidstqVK8XcalledqWK,XisqXKWX16qYKoXtoqZKiXflyq[K�Xmakeq\K�Xearthq]K
Xmovingq^K�Xandq_K Xaboveq`K<XwingedqaK�X24qbK�X9qcKCX2qdKXBeholdqeK�XformqfKXyearsqgKfXoverqhKyXthemqiKjXthingqjK�XcreepingqkK�XitselfqlKYXSeasqmKOXhaveqnK�XletqoK6XfifthqpK�X20qqK}XlightsqrKhXbeqsK!XlightqtKXofquKXDayqvK+XAndqwKXseasqxK�XlikenessqyK�XcattleqzK�X:q{KX29q|K�Xwasq}KXafterq~KSXmanqK�Xforthq�KXXsecondq�KBXwhereinq�K�X3q�KXsixthq�K�Xdaysq�KlXbroughtq�K`Xvoidq�K Xveryq�K�XInq�KXallq�K�Xitq�K&X8q�K@Xlifeq�K�Xseaq�K�Xwatersq�KXsetq�KxXeveryq�K�X19q�K{Xownq�K�Xourq�K�Xhathq�K�X replenishq�K�Xfillq�K�Xdarknessq�K
Xunderq�K?Xplaceq�KKX10q�KLXappearq�KDXSpiritq�KXkindq�KQXSoq�K�X30q�K�Xgivenq�K�Xoneq�KFX6q�K3Xfishq�K�XIq�K�Xtogetherq�KGXmayq�K�Xheq�K*Xfowlq�K~X23q�K�Xlandq�KHXmeatq�K�Xhisq�KTXmultiplyq�K�Xsubdueq�K�X13q�KaXLetq�KXuponq�KXEarthq�KMX27q�K�Xtreeq�KRXgrassq�KUX beginningq�KXdayq�K0Xgreaterq�KuXgiveq�KmXGodq�K X12q�K_Xdryq�KIXwereq�K/Xsaidq�K Xfourthq�K|Xdeepq�KXaq�K7Xairq�K�Xsayingq�K�X,q�KXcreepethq�K�Xgatheredq�KEX;q�KX21q�K�Xherbq�KZXfaceq�KXhadq�K�Xforq�KnXseasonsq�KdX15q�KeXsoq�K=Xshallq�K�Xfemaleq�K�Xdividedq�K#Xheavenq�KX31q�K�Xmovethq�K�Xruleq�KqXseedq�K]uub.
6
28
16 , 23
0 . 33
2 1 1
76 10 1
80 11 1
95 12 1
97 13 1
99 14 1
101 15 1
111 16 1
119 17 1
122 18 1
123 19 1
5 2 1
125 20 1
135 21 1
142 22 1
150 23 1
152 24 1
157 25 1
159 26 1
172 27 1
178 28 1
181 29 1
27 3 1
189 30 1
192 31 1
34 4 1
41 5 1
51 6 1
58 7 1
64 8 1
67 9 1
28 : 16
18 ; 6
15 And 31
146 Be 2
188 Behold 1
43 Day 1
77 Earth 1
9 God 25
65 Heaven 1
186 I 2
3 In 1
30 Let 8
45 Night 1
79 Seas 1
174 So 1
25 Spirit 1
55 a 2
60 above 2
127 abundantly 2
83 after 6
164 air 3
162 all 2
112 also 1
11 and 31
68 appear 1
33 be 5
184 bearing 1
155 beast 3
4 beginning 1
195 behold 1
145 blessed 2
91 bring 3
96 brought 2
44 called 3
156 cattle 3
1 created 3
130 creature 3
158 creepeth 3
154 creeping 2
13 darkness 4
48 day 9
108 days 1
23 deep 1
57 divide 3
35 divided 2
160 dominion 2
73 dry 2
10 earth 14
46 evening 6
141 every 7
20 face 3
177 female 1
151 fifth 1
143 fill 1
52 firmament 6
50 first 1
167 fish 2
128 fly 1
110 for 3
17 form 1
88 forth 5
124 fourth 1
126 fowl 6
40 from 5
92 fruit 3
148 fruitful 2
69 gathered 1
78 gathering 1
109 give 2
185 given 2
37 good 7
85 grass 2
114 great 2
117 greater 1
191 green 1
193 had 1
131 hath 1
163 have 4
42 he 5
7 heaven 5
90 herb 4
175 him 1
84 his 6
161 image 2
53 in 10
87 is 3
38 it 16
89 itself 2
81 kind 5
72 land 2
115 lesser 1
54 let 5
134 life 2
29 light 6
104 lights 2
170 likeness 1
137 living 3
59 made 4
168 make 1
173 male 1
165 man 2
133 may 1
182 meat 2
56 midst 1
49 morning 6
26 moved 1
136 moveth 2
132 moving 1
149 multiply 2
107 night 3
19 of 14
70 one 1
129 open 1
169 our 1
121 over 3
176 own 1
75 place 1
180 replenish 1
113 rule 2
32 said 10
39 saw 7
147 saying 1
166 sea 2
144 seas 1
100 seasons 1
66 second 1
93 seed 3
120 set 1
183 shall 1
103 signs 1
196 sixth 1
61 so 6
118 stars 1
179 subdue 1
36 that 11
8 the 31
138 their 2
106 them 6
31 there 4
153 thing 6
98 third 1
105 to 5
71 together 2
82 tree 3
116 two 1
63 under 2
74 unto 2
21 upon 10
171 us 1
194 very 1
12 void 1
14 was 15
24 waters 8
47 were 7
140 whales 1
190 wherein 1
62 which 3
86 whose 2
139 winged 1
22 without 1
102 years 1
94 yielding 3
187 you 1

Bible Chapter Themes

Goal: Identify the common, simple themes in a chapter, such that those themes can be compared (Venn diagram style) with other chapters.

  • Extra points awarded for working in key phrases

Process

  1. Find keywords
  2. Rank by tfidf.
    • How do I work in ranking based on relationships to other key words?
  3. Find key phrases
  4. How to mix them in?
    • Use a weighted tfidf
    • Add to key single-words (to clarify meaning)

Output:

  1. Wordcloud for any section of scripture
  2. Find overlap between chapters
    • Find textually similar chapters, no matter where they are.
  3. Across scripture, graph words / themes
    • Imagine a moving graph where words pop in and out at the right level based on how often they appear.

Ideas:

  • Add relationship ranking
  • Count 1st occurrence of phrase in the document. Terms that tend to appear at the start or at the end of a document are more likely to be keyphrases.
  • The length of a phrase is also an effective factor in defining keyphrases; a candidate might be a single word or a combination of two or more words, and this can be controlled at development time with the KEA API. The node degree of a candidate phrase is the number of phrases in the candidate set that are semantically related to it, computed with the help of the thesaurus. Phrases with a high degree are more likely to be keyphrases.

Sites:

1 In the beginning God created the heaven and the earth.
2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.
3 And God said, Let there be light: and there was light.
4 And God saw the light, that it was good: and God divided the light from the darkness.
5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.
6 And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
7 And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so.
8 And God called the firmament Heaven. And the evening and the morning were the second day.
9 And God said, Let the waters under the heaven be gathered together unto one place, and let the dry land appear: and it was so.
10 And God called the dry land Earth; and the gathering together of the waters called he Seas: and God saw that it was good.
11 And God said, Let the earth bring forth grass, the herb yielding seed, and the fruit tree yielding fruit after his kind, whose seed is in itself, upon the earth: and it was so.
12 And the earth brought forth grass, and herb yielding seed after his kind, and the tree yielding fruit, whose seed was in itself, after his kind: and God saw that it was good.
13 And the evening and the morning were the third day.
14 And God said, Let there be lights in the firmament of the heaven to divide the day from the night; and let them be for signs, and for seasons, and for days, and years:
15 And let them be for lights in the firmament of the heaven to give light upon the earth: and it was so.
16 And God made two great lights; the greater light to rule the day, and the lesser light to rule the night: he made the stars also.
17 And God set them in the firmament of the heaven to give light upon the earth,
18 And to rule over the day and over the night, and to divide the light from the darkness: and God saw that it was good.
19 And the evening and the morning were the fourth day.
20 And God said, Let the waters bring forth abundantly the moving creature that hath life, and fowl that may fly above the earth in the open firmament of heaven.
21 And God created great whales, and every living creature that moveth, which the waters brought forth abundantly, after their kind, and every winged fowl after his kind: and God saw that it was good.
22 And God blessed them, saying, Be fruitful, and multiply, and fill the waters in the seas, and let fowl multiply in the earth.
23 And the evening and the morning were the fifth day.
24 And God said, Let the earth bring forth the living creature after his kind, cattle, and creeping thing, and beast of the earth after his kind: and it was so.
25 And God made the beast of the earth after his kind, and cattle after their kind, and every thing that creepeth upon the earth after his kind: and God saw that it was good.
26 And God said, Let us make man in our image, after our likeness: and let them have dominion over the fish of the sea, and over the fowl of the air, and over the cattle, and over all the earth, and over every creeping thing that creepeth upon the earth.
27 So God created man in his own image, in the image of God created he him; male and female created he them.
28 And God blessed them, and God said unto them, Be fruitful, and multiply, and replenish the earth, and subdue it: and have dominion over the fish of the sea, and over the fowl of the air, and over every living thing that moveth upon the earth.
29 And God said, Behold, I have given you every herb bearing seed, which is upon the face of all the earth, and every tree, in the which is the fruit of a tree yielding seed; to you it shall be for meat.
30 And to every beast of the earth, and to every fowl of the air, and to every thing that creepeth upon the earth, wherein there is life, I have given every green herb for meat: and it was so.
31 And God saw every thing that he had made, and, behold, it was very good. And the evening and the morning were the sixth day.
Then was Jesus led up of the Spirit into the wilderness to be tempted of the devil.
2 And when he had fasted forty days and forty nights, he was afterward an hungred.
3 And when the tempter came to him, he said, If thou be the Son of God, command that these stones be made bread.
4 But he answered and said, It is written, Man shall not live by bread alone, but by every word that proceedeth out of the mouth of God.
5 Then the devil taketh him up into the holy city, and setteth him on a pinnacle of the temple,
6 And saith unto him, If thou be the Son of God, cast thyself down: for it is written, He shall give his angels charge concerning thee: and in their hands they shall bear thee up, lest at any time thou dash thy foot against a stone.
7 Jesus said unto him, It is written again, Thou shalt not tempt the Lord thy God.
8 Again, the devil taketh him up into an exceeding high mountain, and sheweth him all the kingdoms of the world, and the glory of them;
9 And saith unto him, All these things will I give thee, if thou wilt fall down and worship me.
10 Then saith Jesus unto him, Get thee hence, Satan: for it is written, Thou shalt worship the Lord thy God, and him only shalt thou serve.
11 Then the devil leaveth him, and, behold, angels came and ministered unto him.
12 Now when Jesus had heard that John was cast into prison, he departed into Galilee;
13 And leaving Nazareth, he came and dwelt in Capernaum, which is upon the sea coast, in the borders of Zabulon and Nephthalim:
14 That it might be fulfilled which was spoken by Esaias the prophet, saying,
15 The land of Zabulon, and the land of Nephthalim, by the way of the sea, beyond Jordan, Galilee of the Gentiles;
16 The people which sat in darkness saw great light; and to them which sat in the region and shadow of death light is sprung up.
17 From that time Jesus began to preach, and to say, Repent: for the kingdom of heaven is at hand.
18 And Jesus, walking by the sea of Galilee, saw two brethren, Simon called Peter, and Andrew his brother, casting a net into the sea: for they were fishers.
19 And he saith unto them, Follow me, and I will make you fishers of men.
20 And they straightway left their nets, and followed him.
21 And going on from thence, he saw other two brethren, James the son of Zebedee, and John his brother, in a ship with Zebedee their father, mending their nets; and he called them.
22 And they immediately left the ship and their father, and followed him.
23 And Jesus went about all Galilee, teaching in their synagogues, and preaching the gospel of the kingdom, and healing all manner of sickness and all manner of disease among the people.
24 And his fame went throughout all Syria: and they brought unto him all sick people that were taken with divers diseases and torments, and those which were possessed with devils, and those which were lunatick, and those that had the palsy; and he healed them.
25 And there followed him great multitudes of people from Galilee, and from Decapolis, and from Jerusalem, and from Judaea, and from beyond Jordan.
"""
source: x_spacy_analyze.py
from:
author: david richards
date: 2017-01-25
"""
import os
import spacy
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import hist
from cycler import cycler
from collections import Counter, defaultdict
NUM_BINS = 10  # histogram bin count for the character time-series plot
nlp = spacy.load('en')  # spaCy English pipeline (tokenizer/tagger/parser/NER)
FILE_NAME = 'mat4.txt'  # source text — presumably Matthew 4
FILE_DIR = os.path.expanduser('~/Documents/Coding/KJV_Spacy/SourceText/')  # input directory
def main():
    """Exploratory spaCy analysis of FILE_NAME: sentences, characters, keywords.

    Prints sentence counts and samples, the most-mentioned PERSON entities,
    a character-mention time-series plot, adjectives describing 'jesus',
    the characters most often governed by the verb 'say', and the most
    common rare noun-chunk keywords.
    """
    # Process `text` with the spaCy NLP parser.
    text = read_file(FILE_DIR + FILE_NAME)
    doc = nlp(text)

    # How many sentences are in the text?
    sentences = [s for s in doc.sents]
    print(len(sentences))
    # Print sentences from index 10 to index 15, to make sure that we have
    # parsed the correct text.
    print(sentences[10:15])

    # Most frequently mentioned characters (PERSON entities).
    print(find_character_occurences(doc)[:20])
    character_occurences = get_character_offsets(doc)
    # plot_character_timeseries(character_occurences, ['darcy', 'bingley'], normalization_constant=len(doc))
    plot_character_timeseries(character_occurences, ['jesus', 'james'])
    print(get_character_adjectives(doc, 'jesus'))

    # Find characters that are 'talking', 'saying', 'doing' the most: count
    # PERSON entities whose syntactic head verb lemmatizes to VERB_LEMMA.
    character_verb_counter = Counter()
    VERB_LEMMA = 'say'
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and ent.root.head.lemma_ == VERB_LEMMA:
            character_verb_counter[ent.text] += 1
    print(character_verb_counter.most_common(10))

    # Extract keywords using noun chunks. spaCy will pick some noun chunks
    # that are not informative at all (e.g. we, what, who); keep only chunks
    # whose lemma is rarer than an arbitrary log-probability threshold.
    keywords = Counter()
    for chunk in doc.noun_chunks:
        if nlp.vocab[chunk.lemma_].prob < -8:  # -8 is an arbitrarily selected threshold
            keywords[chunk.lemma_] += 1
    print(keywords.most_common(20))
def read_file(file_name):
    """Return the full contents of `file_name` decoded as UTF-8.

    :param file_name: path to a text file
    :return: file contents as a single string
    """
    # Encoding is explicit: the platform default varies, and the commented
    # `.decode('utf-8')` in the original shows the sources are UTF-8.
    with open(file_name, 'r', encoding='utf-8') as file:
        return file.read()
# Extract all the personal names from Pride & Prejudice and count their occurrences.
# Expected output is a list in the following form: [('elizabeth', 622), ('darcy', 312), ('jane', 286), ('bennet', 266) ...].
def find_character_occurences(doc):
    """
    Tally every PERSON entity in `doc` by lemma and return the actors with
    corresponding occurrence counts.

    (Note: 'occurences' in the name is a historical misspelling kept for
    compatibility with callers.)

    :param doc: Spacy NLP parsed document
    :return: list of (lemma, count) tuples sorted by frequency, e.g.
        [('elizabeth', 622), ('darcy', 312), ('jane', 286), ('bennet', 266)]
    """
    tally = Counter(ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON')
    return tally.most_common()
# Plot characters' mentions as a time series relative to the position of the actor's occurrence in a book.
def get_character_offsets(doc):
    """
    Collect, per character, the token offsets of all its mentions in `doc`.

    :param doc: Spacy NLP parsed document
    :return: dict mapping character lemma -> list of mention start offsets,
        in document order, e.g.
        {'elizabeth': [123, 543, 4534], 'darcy': [205, 2111]}
    """
    offsets = {}
    for ent in doc.ents:
        if ent.label_ != 'PERSON':
            continue
        offsets.setdefault(ent.lemma_, []).append(ent.start)
    return offsets
def normalize(occurencies, normalization_constant):
    """
    Scale each occurrence offset by `normalization_constant`.

    Fixes the bug flagged in the original: it divided by len(doc), but `doc`
    is not defined in this scope, and the `normalization_constant` parameter
    was ignored.

    :param occurencies: list of numeric offsets
    :param normalization_constant: divisor, typically the document length
    :return: list of floats in [0, 1] when offsets are within the document
    """
    return [o / float(normalization_constant) for o in occurencies]
def plot_character_timeseries(character_offsets, character_labels, normalization_constant=None):
    """
    Plot characters' personal names specified in `character_labels` list as time series.
    :param character_offsets: dict object in form {'elizabeth': [123, 543, 4534], 'darcy': [205, 2111]}
    :param character_labels: list of strings that should match some of the keys in `character_offsets`
    :param normalization_constant: int
    """
    # NOTE(review): normalization_constant is accepted but never used in this
    # body — confirm intent (the commented caller passes len(doc)).
    # One offset list per requested label; raises KeyError for unknown labels.
    x = [character_offsets[character_label] for character_label in character_labels]
    with plt.style.context('fivethirtyeight'):
        plt.figure()
        # Histogram the offsets into NUM_BINS bins; n holds per-label bin counts.
        n, bins, patches = plt.hist(x, NUM_BINS, label=character_labels)
        # The histogram itself is discarded; only the bin counts are re-plotted
        # below as line series.
        plt.clf()
        ax = plt.subplot(111)
        for i, a in enumerate(n):
            # Plot each label's bin counts against a normalized 0..1 x-axis.
            # (The comprehension variable x shadows the outer offsets list.)
            ax.plot([float(x) / (NUM_BINS - 1) for x in range(len(a))], a, label=character_labels[i])
        matplotlib.rcParams['axes.prop_cycle'] = cycler(color=['r', 'k', 'c', 'b', 'y', 'm', 'g', '#54a1FF'])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Find words (adjectives) that describe Mr. Darcy.
def get_character_adjectives(doc, character_lemma):
    """
    Find all the adjectives related to `character_lemma` in `doc`.

    Two sources are combined, in order: adjectives anywhere inside each
    matching entity's subtree, then adjectival complements ('acomp') of the
    head verb when the entity is a sentence subject ('nsubj').

    :param doc: Spacy NLP parsed document
    :param character_lemma: string object
    :return: list of adjective lemmas related to `character_lemma`
    """
    mentions = [ent for ent in doc.ents if ent.lemma_ == character_lemma]

    # Adjectives appearing inside the entity's own subtree.
    found = [tok.lemma_ for ent in mentions for tok in ent.subtree
             if tok.pos_ == 'ADJ']  # alternative filter: tok.dep_ == 'amod'

    # Predicate adjectives: "<character> is <adj>" — the entity is the
    # subject and the adjective hangs off its head verb as an 'acomp'.
    for ent in mentions:
        if ent.root.dep_ == 'nsubj':
            found.extend(child.lemma_ for child in ent.root.head.children
                         if child.dep_ == 'acomp')
    return found
# Entry point: run the exploratory analysis when executed as a script.
if __name__ == "__main__":
    main()
"""
source: x_spacy_basics.py
from: https://github.com/cytora/pycon-nlp-in-10-lines/blob/master/00_spacy_intro.ipynb
"""
import spacy  # See "Installing spaCy"

nlp = spacy.load('en')  # You are here.
doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...\nFor God so loved the world that He gave his '
          u'only begotten son, that whosever believes in him shall not perish but have everlasting life.')
# print("\n*** 1) Get 1ST TOKEN of document.***")
# token = doc[0]
# print(token)
#
# print("\n*** 2) Print SENTENCES (1 per line)***")
# for sent in doc.sents:
#     print(sent)
# print("\n*** 3) For each token, print corresponding PART OF SPEECH tag***")
# # print(list((w.text, w.pos_) for w in doc))  # See "Doc, Span and Token"
# for token in doc:
#     print('{} - {}'.format(token, token.pos_))
print("\n*** 3a) PRINT NOUNS based on POS***")
# Pair each noun/verb lemma with the text of the following token.
# BUG FIX: the original used `token[0].nbor()` — a Token is not subscriptable,
# and nbor() returns a Token, not a str. Use token.nbor().text, and skip the
# final token, where nbor() would raise IndexError.
kp = [token.lemma_ + ' ' + token.nbor().text
      for token in doc
      if (token.pos_ == "NOUN" or token.pos_ == "VERB") and token.i + 1 < len(doc)]
print(kp)
token_count = len(doc)  # idiomatic replacement for doc.__len__()
# def tokens_to_root(tkn):
# """
# Walk up the syntactic tree, collecting tokens to the root of the given `token`.
# :param tkn: Spacy token
# :return: list of Spacy tokens
# """
# tokens_to_r = []
# while tkn.head is not tkn:
# tokens_to_r.append(tkn)
# tkn = tkn.head
# tokens_to_r.append(tkn)
#
# return tokens_to_r
#
# print("\n*** 4) Print each token's PATH TO THE ROOT***")
# for token in doc:
# print('{} --> {}'.format(token, tokens_to_root(token)))
#
# print("\n*** 5) Print DEPENDENCY LABELS of the tokens***")
# for token in doc:
# print('-> '.join(['{}-{}'.format(dependent_token, dependent_token.dep_) for dependent_token in tokens_to_root(token)]))
#
#
# print("\n*** 6) Print all NAMED ENTITIES with named entity types***")
# doc_2 = nlp(u"I went to Paris where I met my old friend Jack from uni.")
# for ent in doc_2.ents:
# print('{} - {}'.format(ent, ent.label_))
#
#
# print("\n*** 7) Print NOUN CHUNKS for doc_2")
# print([chunk for chunk in doc_2.noun_chunks])
#
#
# print("\n*** 8) For every token, print LOG-PROBABILITY OF THE WORD, estimated from counts from a large corpus***")
# for token in doc_2:
# print(token, ',', round(token.prob, 2))
#
#
# print("\n*** 9) CALCULATE SIMILARITY between 'apples' and 'oranges' and 'boots' and 'hippos'")
# doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.***")
# apples = doc[0]
# oranges = doc[2]
# boots = doc[6]
# hippos = doc[8]
# print(round(apples.similarity(oranges), 2))
# print(round(boots.similarity(hippos), 2))
#
# print("\n*** 10) Print similarity between sentence and word 'fruit'***")
# apples_sent, boots_sent = doc.sents
# fruit = doc.vocab[u'fruit']
# print(apples_sent.similarity(fruit))
# print(boots_sent.similarity(fruit))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment