Skip to content

Instantly share code, notes, and snippets.

@Jiali-Qi
Last active October 7, 2019 20:08
Show Gist options
  • Save Jiali-Qi/02d5702d7d43bd489f7ecc4249cc781d to your computer and use it in GitHub Desktop.
Save Jiali-Qi/02d5702d7d43bd489f7ecc4249cc781d to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 7 10:13:48 2019
@author: qijia
"""
import spacy
import re
import requests
from collections import Counter
# Use the Python requests package to download a book (the plain-text file
# at the Project Gutenberg URL below).
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run on an HTTP error so that an error
# page is never analysed as if it were the book.
response = requests.get('http://www.gutenberg.org/ebooks/730.txt.utf-8', timeout=60)
response.raise_for_status()
oliver_twist_full_text = response.text
print(oliver_twist_full_text)
# Load the large English model ('en_core_web_lg').
nlp = spacy.load('en_core_web_lg')
# Q1 How many tokens are in the document?
# Run the full text through the pipeline once; the length of the resulting
# document is reported as the token count.
doc = nlp(oliver_twist_full_text)
token_count = len(doc)
print(f"{token_count}")
# Q2 How many verbs are in the document?
# Count the tokens whose part-of-speech tag (token.pos_) is "VERB".
verbs = sum(1 for token in doc if token.pos_ == "VERB")
print(format(verbs))
# Q3 What is the most frequent named entity?
# Tally the text of every entity span in the document, then report the top
# entry as a one-element list of (text, count).
entity_counts = Counter(ent.text for ent in doc.ents)
top_entity = entity_counts.most_common(1)
print(str(top_entity))
# Q4 How many sentences are in the document?
# Materialise the sentence spans into a list so they can be counted and
# indexed.
sentences = list(doc.sents)
sentence_count = len(sentences)
print(f"{sentence_count}")
# Q5 Of all the sentences in the text that are at least 10 words in length, which two are most similar (but not identical)?
# Keep only the sentences with at least 10 words. Splitting on any run of
# whitespace (rather than on single spaces) stops line breaks and doubled
# spaces from distorting the word count.
f_sent = [sent for sent in sentences if len(sent.text.split()) > 9]
# Whitespace-normalised text of each kept sentence, used to skip pairs that
# are the same sentence apart from spacing or line wrapping.
f_text = [" ".join(sent.text.split()) for sent in f_sent]
# Compare every unordered pair exactly once (j > i) and remember the pair
# with the highest similarity score seen so far.
max_sim = None
most_similar = None
for i, first in enumerate(f_sent):
    for j in range(i + 1, len(f_sent)):
        if f_text[i] == f_text[j]:
            continue  # identical sentences are excluded by the question
        sim = first.similarity(f_sent[j])
        if max_sim is None or sim > max_sim:
            max_sim = sim
            most_similar = (first, f_sent[j])
if most_similar is None:
    # Fewer than two distinct qualifying sentences: nothing to compare.
    print("No two different sentences of at least 10 words were found")
else:
    print(most_similar[0].text +"####"+most_similar[1].text +"#### are most similar")
# Q6 What is the vector representation of the first word in the 15th sentence in the document?
# Lists are zero-indexed, so the 15th sentence is at index 14.
# (The name cannot start with a digit, hence "fifteenth_sentence".)
fifteenth_sentence = sentences[14]
print(fifteenth_sentence)
# Take the first token that is an actual word, skipping leading whitespace
# and punctuation tokens; fall back to the very first token if there is none.
first_word = next(
    (token for token in fifteenth_sentence if not (token.is_space or token.is_punct)),
    fifteenth_sentence[0],
)
# Print the vector of that single word, not the whole-sentence vector.
print(first_word.vector)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment