spacy_example.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Examples adapted from https://spacy.io/usage/spacy-101
import spacy
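# The pipelines used below must be installed first, e.g.:
#   python -m spacy download en_core_web_sm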
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
def fun_annotations():
    # Linguistic annotations: text, coarse POS tag, syntactic dependency
    for token in doc:
        print(token.text, token.pos_, token.dep_)
    print(doc)
def fun_tokenizes():
    # Tokenization: each Doc is split into Token objects
    for token in doc:
        print(token.text)
def fun_part_of_speech():
    # Fine-grained and coarse POS tags plus other token attributes
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)
def fun_named_entities():
    # Named entities with character offsets and labels
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
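# With en_core_web_sm this typically prints (per the spaCy 101 docs; labels
# may vary slightly across model versions):
#   Apple 0 5 ORG
#   U.K. 27 31 GPE
#   $1 billion 44 54 MONEY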
def fun_word_vector():
    # Word vectors: real words have vectors, gibberish is out-of-vocabulary
    tokens = nlp("dog cat banana afskfsd")
    for token in tokens:
        print(token.text, token.has_vector, token.vector_norm, token.is_oov)
def fun_word_vector_2():
    # Semantic similarity; make sure to use the larger package, which
    # ships with real word vectors
    nlp_md = spacy.load("en_core_web_md")
    doc1 = nlp_md("I like salty fries and hamburgers.")
    doc2 = nlp_md("Fast food tastes very good.")
    # Similarity of two documents
    print(doc1, "<->", doc2, doc1.similarity(doc2))
    # Similarity of tokens and spans
    french_fries = doc1[2:4]
    burgers = doc1[5]
    print(french_fries, "<->", burgers, french_fries.similarity(burgers))
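# Note: en_core_web_md must be installed separately
# ("python -m spacy download en_core_web_md"). The small "sm" package has no
# static word vectors, so similarity scores computed with it are unreliable.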
def fun_vocab_hashes_lexemes():
    # All strings are encoded to 64-bit hashes in the shared StringStore
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("I love coffee")
    print(doc.vocab.strings["coffee"])  # 3197928453018144401
    print(doc.vocab.strings[3197928453018144401])  # 'coffee'
def fun_vocab_hashes_lexemes_2():
    # Lexemes are context-independent entries in the vocabulary
    for word in doc:
        lexeme = doc.vocab[word.text]
        print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
              lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
def fun_vocab_hashes_lexemes_3_wrong():
    # Demonstrates why an empty Vocab cannot resolve a hash back to text
    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    doc = nlp("I love coffee")  # processing the text stores "coffee" in the StringStore
    print(doc.vocab.strings["coffee"])  # 3197928453018144401
    print(doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
    empty_doc = Doc(Vocab())  # new Doc with an empty Vocab
    # empty_doc.vocab.strings[3197928453018144401] will raise an error :(
    empty_doc.vocab.strings.add("coffee")  # add "coffee" and generate the hash
    print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
    new_doc = Doc(doc.vocab)  # create a new Doc with the first doc's vocab
    print(new_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
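# Hashing is one-way: any StringStore can compute text -> hash, but resolving
# hash -> text only works once the string has been stored, e.g. by processing
# text with that vocab or by calling strings.add().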
# Uncomment one of the calls below to run a demo:
#fun_annotations()
#fun_tokenizes()
#fun_part_of_speech()
#fun_named_entities()
#fun_word_vector()
#fun_word_vector_2()
#fun_vocab_hashes_lexemes()
#fun_vocab_hashes_lexemes_2()
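# A minimal sketch of an entry point; swap in whichever demo you want to run:
if __name__ == "__main__":
    fun_annotations()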