Jiali-Qi/My 5th homework

## My 5th homework
# -*- coding: utf-8 -*-
"""
Created on Mon Oct  7 10:13:48 2019

@author: qijia
"""

import spacy
import re
import requests
from collections import Counter

# Use the Python requests package to download a book.
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being analysed as the book.
response = requests.get('http://www.gutenberg.org/ebooks/730.txt.utf-8', timeout=60)
response.raise_for_status()
oliver_twist_full_text = response.text
print(oliver_twist_full_text)

# Load the en_core_web_lg English model and parse the whole book once;
# every question below reads from this single parsed document.
nlp = spacy.load('en_core_web_lg')
doc = nlp(oliver_twist_full_text)

# Q1 How many tokens are in the document?
print(len(doc))

# Q2 How many verbs are in the document?
verbs = sum(1 for token in doc if token.pos_ == "VERB")
print(verbs)

# Q3 What is the most frequent named entity?
entity = [ent.text for ent in doc.ents]
elist_counter = Counter(entity)
print(elist_counter.most_common(1))

# Q4 How many sentences are in the document?
sentences = list(doc.sents)
print(len(sentences))

# Q5 Of all the sentences in the text that are at least 10 words in length, which two are most similar (but not identical)?
# Keep only the sentences that are at least 10 words long. split() with no
# argument splits on any run of whitespace, so line breaks inside a sentence
# separate words and repeated spaces do not produce empty "words".
f_sent = [sent for sent in sentences if len(sent.text.split()) >= 10]

# Compare each unordered pair once and remember the best-scoring pair whose
# texts differ, so an exact repeat of a sentence cannot win.
max_sim = None
most_similar_pair = None
for i, first in enumerate(f_sent):
    for second in f_sent[i + 1:]:
        if first.text == second.text:
            continue
        sim = first.similarity(second)
        if max_sim is None or sim > max_sim:
            max_sim = sim
            most_similar_pair = (first, second)

# Nothing is printed when fewer than two distinct long sentences exist.
if most_similar_pair is not None:
    first, second = most_similar_pair
    print(first.text + "####" + second.text + "#### are most similar")

# Q6 What is the vector representation of the first word in the 15th sentence in the document?
# The list is 0-indexed, so the 15th sentence is sentences[14]; indexing the
# sentence with [0] selects its first token.
fifteenth_sentence = sentences[14]
print(fifteenth_sentence)
print(fifteenth_sentence[0].vector)
	# -*- coding: utf-8 -*-
	"""
	Created on Mon Oct 7 10:13:48 2019

	@author: qijia
	"""

	import spacy
	import re
	import requests
	from collections import Counter

	# Use the Python requests package to download a book.
	# The timeout keeps the script from hanging on a stalled connection, and
	# raise_for_status() stops an HTTP error page from being analysed as the book.
	response = requests.get('http://www.gutenberg.org/ebooks/730.txt.utf-8', timeout=60)
	response.raise_for_status()
	oliver_twist_full_text = response.text
	print(oliver_twist_full_text)

	# Load the en_core_web_lg English model and parse the whole book once;
	# every question below reads from this single parsed document.
	nlp = spacy.load('en_core_web_lg')
	doc = nlp(oliver_twist_full_text)

	# Q1 How many tokens are in the document?
	print(len(doc))

	# Q2 How many verbs are in the document?
	verbs = sum(1 for token in doc if token.pos_ == "VERB")
	print(verbs)

	# Q3 What is the most frequent named entity?
	entity = [ent.text for ent in doc.ents]
	elist_counter = Counter(entity)
	print(elist_counter.most_common(1))

	# Q4 How many sentences are in the document?
	sentences = list(doc.sents)
	print(len(sentences))

	# Q5 Of all the sentences in the text that are at least 10 words in length, which two are most similar (but not identical)?
	# Keep only the sentences that are at least 10 words long. split() with no
	# argument splits on any run of whitespace, so line breaks inside a sentence
	# separate words and repeated spaces do not produce empty "words".
	f_sent = [sent for sent in sentences if len(sent.text.split()) >= 10]

	# Compare each unordered pair once and remember the best-scoring pair whose
	# texts differ, so an exact repeat of a sentence cannot win.
	max_sim = None
	most_similar_pair = None
	for i, first in enumerate(f_sent):
	    for second in f_sent[i + 1:]:
	        if first.text == second.text:
	            continue
	        sim = first.similarity(second)
	        if max_sim is None or sim > max_sim:
	            max_sim = sim
	            most_similar_pair = (first, second)

	# Nothing is printed when fewer than two distinct long sentences exist.
	if most_similar_pair is not None:
	    first, second = most_similar_pair
	    print(first.text + "####" + second.text + "#### are most similar")

	# Q6 What is the vector representation of the first word in the 15th sentence in the document?
	# The list is 0-indexed, so the 15th sentence is sentences[14]; indexing the
	# sentence with [0] selects its first token.
	fifteenth_sentence = sentences[14]
	print(fifteenth_sentence)
	print(fifteenth_sentence[0].vector)