Last active
October 7, 2019 20:08
-
-
Save Jiali-Qi/02d5702d7d43bd489f7ecc4249cc781d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 7 10:13:48 2019
@author: qijia
"""
import re
from collections import Counter
from itertools import combinations

import requests
import spacy
# Use the Python requests package to download a book.
# The timeout stops the script from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error so that an error page is never
# analysed as if it were the novel.
response = requests.get('http://www.gutenberg.org/ebooks/730.txt.utf-8', timeout=60)
response.raise_for_status()
# The URL requests the UTF-8 edition; decode it as such even if the server
# omits the charset from its Content-Type header.
response.encoding = 'utf-8'
oliver_twist_full_text = response.text
print(oliver_twist_full_text)
# Load the large English model ('en_core_web_lg').
nlp = spacy.load('en_core_web_lg')
# spaCy raises an error for texts longer than nlp.max_length characters, so
# lift the limit when the downloaded book needs it (never lower it).
nlp.max_length = max(nlp.max_length, len(oliver_twist_full_text))
# Q1 How many tokens are in the document?
doc = nlp(oliver_twist_full_text)
print(len(doc))
# Q2 How many verbs are in the document?
# Count the tokens whose coarse-grained part-of-speech tag is VERB.
verbs = sum(1 for token in doc if token.pos_ == "VERB")
print(verbs)
# Q3 What is the most frequent named entity?
# Tally the surface text of every entity; most_common(1) returns a one-element
# list holding the (text, count) pair with the highest count.
entity = [ent.text for ent in doc.ents]
elist_counter = Counter(entity)
print(elist_counter.most_common(1))
# Q4 How many sentences are in the document?
# doc.sents is consumed once here and kept as a list so it can be counted
# and indexed afterwards.
sentences = list(doc.sents)
print(len(sentences))
# Q5 Of all the sentences in the text that are at least 10 words in length, which two are most similar (but not identical)?
# Find all sentences in the text that are at least 10 words in length.
# split() with no argument splits on any whitespace run, so words separated by
# the line breaks of the hard-wrapped source text are counted correctly.
f_sent = [sent for sent in sentences if len(sent.text.split()) >= 10]

# Track the best-scoring pair seen so far. combinations() visits every
# unordered pair exactly once, so no pair is scored twice.
# NOTE: this is still quadratic in the number of long sentences and can take a
# long time on a whole novel.
best_similarity = float("-inf")
best_pair = None
for first, second in combinations(f_sent, 2):
    # The question asks for similar but *not identical* sentences.
    if first.text == second.text:
        continue
    sim = first.similarity(second)
    if sim > best_similarity:
        best_similarity = sim
        best_pair = (first, second)

if best_pair is None:
    print("No two distinct sentences of at least 10 words were found")
else:
    print(best_pair[0].text + "####" + best_pair[1].text + "#### are most similar")
# Q6 What is the vector representation of the first word in the 15th sentence in the document?
# Lists are zero-indexed, so the 15th sentence is at index 14.
fifteenth_sentence = sentences[14]
print(fifteenth_sentence)
# A sentence span may begin with whitespace or punctuation tokens (line breaks,
# opening quotes); skip those to reach the first actual word.
first_word = next(
    (token for token in fifteenth_sentence if not (token.is_space or token.is_punct)),
    None,
)
if first_word is None:
    print("The 15th sentence contains no word tokens")
else:
    print(first_word)
    print(first_word.vector)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment