Skip to content

Instantly share code, notes, and snippets.

@vinhkhuc
Last active November 27, 2021 18:16
Show Gist options
  • Save vinhkhuc/e1db899b7d48795d306e8d922166e076 to your computer and use it in GitHub Desktop.
spaCy courses (https://course.spacy.io/)
# Source: https://course.spacy.io/
# =========================== Chapter 1 =========================== #
# Import the English language class
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
# Create the nlp object from the blank English language class
nlp = English()
# A Doc is created by calling the nlp object on a string of text
doc = nlp("Hello world!")
# Print each token's text, one per line
for tok in doc:
    print(tok.text)
# Slicing a Doc yields a Span; show its text
span = doc[1:4]
print(span.text)
doc = nlp("It costs $5.")
# Inspect per-token lexical attributes of the Doc
print('Index: ', [tok.i for tok in doc])
print('Text: ', [tok.text for tok in doc])
print('is_alpha:', [tok.is_alpha for tok in doc])
print('is_punct:', [tok.is_punct for tok in doc])
print('like_num:', [tok.like_num for tok in doc])
# Text
# Load the small English pipeline with a trained statistical model
nlp = spacy.load('en_core_web_sm')
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Show every predicted entity together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)
# Predicting named entities in context
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_)
# "iPhone X" is missed by the model, so grab it manually as a Span
iphone_x = doc[1:3]
print("Missing entity:", iphone_x.text)
# Section 11
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")
# The Matcher is initialized with the pipeline's shared vocabulary
matcher = Matcher(nlp.vocab)
# Two-token pattern: literal "iPhone" followed by literal "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
# v2 API: the second argument is an optional on-match callback
matcher.add("IPHONE_X_PATTERN", None, pattern)
# Run the matcher and print the matched span texts
found = matcher(doc)
print("Matches:", [doc[s:e].text for _mid, s, e in found])
# ============= Section 12 - Part 1 ============= #
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
# Full iOS version: case-insensitive "ios" followed by any digit token
pattern = [{"LOWER": "ios"}, {"IS_DIGIT": True}]
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Report the text of every matched span
for _mid, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 2 ============= #
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)
# Any inflection of "download" (lemma match) followed by a proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Report the text of every matched span
for _mid, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 3 ============= #
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
# Adjective plus one noun, with an optional second noun ("OP": "?")
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Report the text of every matched span
for _mid, start, end in matches:
    print("Match found:", doc[start:end].text)
# =========================== Chapter 2 =========================== #
import json
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Span
nlp = spacy.load('en_core_web_sm')
doc = nlp("coffee")
# The StringStore maps strings to hashes and back in both directions
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]
###### Section 2 - Part 1 #######
doc = nlp("I have a cat")
# Hash for the word "cat" ...
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)
# ... and the round-trip back to the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)
###### Section 2 - Part 2 #######
doc = nlp("David Bowie is a PERSON")
# Entity labels live in the same string store
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)
person_string = nlp.vocab.strings[person_hash]
print(person_string)
###### Section 7 #########
doc = nlp("Berlin is a nice city")
# Find proper nouns that are immediately followed by a verb.
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Bounds guard: the original indexed doc[token.i + 1]
        # unconditionally, which raises an IndexError whenever a proper
        # noun is the final token of the Doc.
        if token.i + 1 < len(doc) and doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)
###### Section 8 ##########
# # Load a larger model with vectors
# print("Loading medium model ...")
# nlp = spacy.load('en_core_web_md')
#
# # Compare two documents
# doc1 = nlp("I like fast food")
# doc2 = nlp("I like pizza")
# print(doc1.similarity(doc2))
####### Section 13 #########
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)
# Pattern 1: "amazon" (any case) followed by a title-cased proper noun
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
# Pattern 2: "ad-free" (tokenized as ad / - / free) followed by a noun
pattern2 = [{"LOWER": "ad"}, {"LOWER": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]
# Fresh Matcher with both patterns registered (v2 callback slot unused)
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)
for mid, start, end in matcher(doc):
    # Resolve the pattern name from the match hash and show the span
    print(doc.vocab.strings[mid], doc[start:end].text)
######### Section 14 ##########
# Load the list of country names (a JSON array of strings).
# json.load(f) is the idiomatic form of json.loads(f.read()).
with open("exercises/countries.json") as f:
    COUNTRIES = json.load(f)
nlp = English()
doc = nlp("Czech Republic may help Slovakia protect its airspace")
# PhraseMatcher matches whole Doc patterns instead of token-rule lists
matcher = PhraseMatcher(nlp.vocab)
# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)
# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
########### Section 15 ###########
# json.load(f) is the idiomatic form of json.loads(f.read()).
with open("exercises/countries.json") as f:
    COUNTRIES = json.load(f)
with open("exercises/country_text.txt") as f:
    TEXT = f.read()
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)
# Create a doc and find matches in it
doc = nlp(TEXT)
# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Label each matched span as a geopolitical entity
    span = Span(doc, start, end, label="GPE")
    # Append to the existing entities rather than overwriting them
    doc.ents = list(doc.ents) + [span]
    # The span root's head is the syntactic parent of the match
    span_root_head = span.root.head
    print(span_root_head.text, "-->", span.text)
# Print the GPE entities found in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])
# =========================== Chapter 3 =========================== #
import json
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span, Token
######## Section 6 #########
# Define the custom component
def length_component(doc):
    """Pipeline component: report the Doc's token length, then pass it on."""
    print("This document is {} tokens long.".format(len(doc)))
    return doc
# Load the small English model
nlp = spacy.load("en_core_web_sm")
# Insert the custom component at the very start of the pipeline
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)
# Process a text so the component runs once
doc = nlp("Hello there")
######### Section 7 #########
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# One pattern Doc per animal name, built in a single pipe pass
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)
def animal_component(doc):
    """Pipeline component: label every animal-name match as an ANIMAL entity."""
    hits = matcher(doc)
    # Overwrite doc.ents with one ANIMAL Span per match
    doc.ents = [Span(doc, s, e, label="ANIMAL") for _mid, s, e in hits]
    return doc
# Run the component right after the built-in entity recognizer
nlp.add_pipe(animal_component, after='ner')
print(nlp.pipe_names)
# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
########### Section 9 ######################
def get_reversed(token):
    """Getter for a Token extension: the token's text spelled backwards."""
    return "".join(reversed(token.text))
# Register the Token extension attribute 'is_country' with the default value False
Token.set_extension('is_country', default=False)
# 'reversed' is computed on access via the getter function
Token.set_extension('reversed', getter=get_reversed)
doc = nlp("I live in Spain.")
# Flag the token "Spain" as a country
doc[3]._.is_country = True
# Show both extension attributes for every token
print([(t, t._.is_country) for t in doc])
print([(t, t._.reversed) for t in doc])
########### Section 14 ######################
# json.load(f) is the idiomatic form of json.loads(f.read()).
with open("exercises/tweets.json") as f:
    TEXTS = json.load(f)
# Process the texts one at a time and print the adjectives
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == "ADJ"])
# Batch processing: nlp.pipe streams the texts through the pipeline
docs = list(nlp.pipe(TEXTS))
for doc in docs:
    print([token.text for token in doc if token.pos_ == "ADJ"])
# =========================== Chapter 4 =========================== #
import json
import random
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
#
# ###### Create training data ########
# with open("exercises/iphone.json") as f:
# TEXTS = json.loads(f.read())
#
# nlp = English()
# matcher = Matcher(nlp.vocab)
# pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
# patterns = [pattern1, pattern2]
# matcher.add("GADGET", None, *patterns)
#
# TRAINING_DATA = []
#
# # Create a Doc object for each text in TEXTS
# for doc in nlp.pipe(TEXTS):
# # Match on the doc and create a list of matched spans
# spans = [doc[start:end] for match_id, start, end in matcher(doc)]
#
# # Get (start character, end character, label) tuples of matches
# entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
#
# # Format the matches as a (doc.text, entities) tuple
# training_example = (doc.text, {"entities": entities})
#
# # Append the example to the training data
# TRAINING_DATA.append(training_example)
#
# print(*TRAINING_DATA, sep="\n")
########## Section 6 - Setting up pipeline ##############
# (text, annotations) pairs; entities are [start_char, end_char, label]
TRAINING_DATA = [
    ["How to preorder the iPhone X", { "entities": [[20, 28, "GADGET"]] }],
    ["iPhone X is coming", { "entities": [[0, 8, "GADGET"]] }],
    ["Should I pay $1,000 for the iPhone X?", { "entities": [[28, 36, "GADGET"]] }],
    ["The iPhone 8 reviews are here", { "entities": [[4, 12, "GADGET"]] }],
    ["Your iPhone goes up to 11 today", { "entities": [[5, 11, "GADGET"]] }],
    ["I need a new phone! Any tips?", { "entities": [] }]
]
# Create a blank English pipeline
nlp = spacy.blank("en")
# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Add the label 'GADGET' to the entity recognizer
ner.add_label("GADGET")
# Initialize the model weights
nlp.begin_training()
# Train for 10 iterations
for itn in range(10):
    # Shuffle so batches differ between iterations
    random.shuffle(TRAINING_DATA)
    losses = {}
    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        # Unzip the batch into parallel text / annotation sequences
        # in one pass (the original built two separate list
        # comprehensions over the same batch).
        texts, annotations = zip(*batch)
        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)
test_texts = [
    "Apple is slowing down the iPhone 8 and iPhone X - how to stop it",
    "I finally understand what the iPhone X ‘notch’ is for",
    "Everything you need to know about the Samsung Galaxy S9",
    "Looking to compare iPad models? Here’s how the 2018 lineup stacks up",
    "The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple",
    "what is the cheapest ipad, especially ipad pro???",
    "Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics"
]
# Run the trained model on unseen texts and show predicted entities
for doc in nlp.pipe(test_texts):
    print(doc.text, [(ent.label_, ent.text, ent.start_char, ent.end_char) for ent in doc.ents])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment