Last active
April 23, 2021 05:53
-
-
Save nivir/64173b23a71a1a32d7245a9830926d0c to your computer and use it in GitHub Desktop.
spacy_example.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# https://spacy.io/usage/spacy-101
import spacy

# Load the small English pipeline once; `nlp` and `doc` are module-level
# globals shared by the demo functions below.
nlp = spacy.load("en_core_web_sm")
# NOTE: unicode() was removed in Python 3 (str literals are already Unicode),
# and spaCy v2+ only supports Python 3, so the wrapper is dropped.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
def fun_annotations():
    """Print text, coarse POS tag, and dependency label for every token,
    then the full document text."""
    for token in doc:
        print(token.text, token.pos_, token.dep_)
    # Fixed: `print doc` is Python-2 statement syntax — a SyntaxError on
    # Python 3 and inconsistent with the print() calls used elsewhere here.
    print(doc)
def fun_tokenizes():
    """Print each token's surface text, one token per line."""
    for tok in doc:
        text = tok.text
        print(text)
def fun_part_of_speech():
    """Print the full linguistic annotation set for every token:
    text, lemma, coarse/fine POS, dependency, shape, alpha and stop flags."""
    for tok in doc:
        attrs = (
            tok.text,
            tok.lemma_,
            tok.pos_,
            tok.tag_,
            tok.dep_,
            tok.shape_,
            tok.is_alpha,
            tok.is_stop,
        )
        print(*attrs)
def fun_named_entities():
    """Print each named entity with its character offsets and label."""
    for entity in doc.ents:
        span_info = (entity.text, entity.start_char, entity.end_char)
        print(*span_info, entity.label_)
def fun_word_vector():
    """Show vector availability, vector norm, and out-of-vocabulary status
    for a few example tokens (including one nonsense word)."""
    # Fixed: unicode() was removed in Python 3; a plain str literal is
    # already Unicode.
    tokens = nlp("dog cat banana afskfsd")
    for token in tokens:
        print(token.text, token.has_vector, token.vector_norm, token.is_oov)
def fun_word_vector_2():
    """Compare document-to-document and span-to-token similarity using a
    model that ships with word vectors."""
    # Fixed: removed a dead `pass` statement and the Python-2 unicode()
    # wrappers (removed in Python 3).
    nlp = spacy.load("en_core_web_md")  # make sure to use larger package!
    doc1 = nlp("I like salty fries and hamburgers.")
    doc2 = nlp("Fast food tastes very good.")
    # Similarity of two documents
    print(doc1, "<->", doc2, doc1.similarity(doc2))
    # Similarity of tokens and spans
    french_fries = doc1[2:4]
    burgers = doc1[5]
    print(french_fries, "<->", burgers, french_fries.similarity(burgers))
def fun_voab_hashes_lexemes():
    """Demonstrate the string-to-hash and hash-to-string round trip in the
    shared StringStore."""
    nlp = spacy.load("en_core_web_sm")
    # Fixed: unicode() was removed in Python 3; str is already Unicode.
    doc = nlp("I love coffee")
    print(doc.vocab.strings["coffee"])  # 3197928453018144401
    print(doc.vocab.strings[3197928453018144401])  # 'coffee'
def fun_voab_hashes_lexemes_2():
    """Print lexeme-level (context-independent) attributes for every token
    in the module-level `doc`."""
    for token in doc:
        lex = doc.vocab[token.text]
        fields = (
            lex.text,
            lex.orth,
            lex.shape_,
            lex.prefix_,
            lex.suffix_,
            lex.is_alpha,
            lex.is_digit,
            lex.is_title,
            lex.lang_,
        )
        print(*fields)
def fun_voab_hashes_lexemes_3_wrong():
    """Show that string hashes only resolve when the owning Vocab knows the
    string: a fresh empty Vocab cannot reverse a hash until the string is
    added, while a Doc sharing the first doc's Vocab resolves it directly."""
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    print(doc.vocab.strings["coffee"])  # 3197928453018144401
    print(doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
    empty_doc = Doc(Vocab())  # New Doc with empty Vocab
    # empty_doc.vocab.strings[3197928453018144401] will raise an error :(
    empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
    print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
    new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
    print(new_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍
#fun_annotations() | |
#fun_tokenizes() | |
#fun_part_of_speech() | |
#fun_named_entities() | |
#fun_word_vector() | |
#fun_word_vector_2() | |
#fun_voab_hashes_lexemes() | |
#fun_voab_hashes_lexemes_2() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment