Skip to content

Instantly share code, notes, and snippets.

@vinhkhuc
Last active November 27, 2021 18:16
Show Gist options
  • Save vinhkhuc/e1db899b7d48795d306e8d922166e076 to your computer and use it in GitHub Desktop.
spaCy courses (https://course.spacy.io/)
# Source: https://course.spacy.io/
# =========================== Chapter 1 =========================== #
# Import the English language class
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
# Create the nlp object from the blank English language class
nlp = English()
# A Doc is created by calling the nlp object on a string of text
doc = nlp("Hello world!")
# Print each token's text, one per line
for tok in doc:
    print(tok.text)
# Slicing a Doc yields a Span; show its text
span = doc[1:4]
print(span.text)
doc = nlp("It costs $5.")
# Inspect per-token lexical attributes of the Doc
print('Index: ', [tok.i for tok in doc])
print('Text: ', [tok.text for tok in doc])
print('is_alpha:', [tok.is_alpha for tok in doc])
print('is_punct:', [tok.is_punct for tok in doc])
print('like_num:', [tok.like_num for tok in doc])
# Text
# Load the small English pipeline with a trained statistical model
nlp = spacy.load('en_core_web_sm')
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Show every predicted entity together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)
# Predicting named entities in context
text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_)
# "iPhone X" is missed by the model, so grab it manually as a Span
iphone_x = doc[1:3]
print("Missing entity:", iphone_x.text)
# Section 11
doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")
# The Matcher is initialized with the pipeline's shared vocabulary
matcher = Matcher(nlp.vocab)
# Two-token pattern: literal "iPhone" followed by literal "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
# v2 API: the second argument is an optional on-match callback
matcher.add("IPHONE_X_PATTERN", None, pattern)
# Run the matcher and print the matched span texts
found = matcher(doc)
print("Matches:", [doc[s:e].text for _mid, s, e in found])
# ============= Section 12 - Part 1 ============= #
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
# Full iOS version: case-insensitive "ios" followed by any digit token
pattern = [{"LOWER": "ios"}, {"IS_DIGIT": True}]
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Report the text of every matched span
for _mid, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 2 ============= #
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)
# Any inflection of "download" (lemma match) followed by a proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Report the text of every matched span
for _mid, start, end in matches:
    print("Match found:", doc[start:end].text)
# ============= Section 12 - Part 3 ============= #
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
# Adjective plus one noun, with an optional second noun ("OP": "?")
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Report the text of every matched span
for _mid, start, end in matches:
    print("Match found:", doc[start:end].text)
# =========================== Chapter 2 =========================== #
import json
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Span
nlp = spacy.load('en_core_web_sm')
doc = nlp("coffee")
# The StringStore maps strings to hashes and back in both directions
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]
###### Section 2 - Part 1 #######
doc = nlp("I have a cat")
# Hash for the word "cat" ...
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)
# ... and the round-trip back to the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)
###### Section 2 - Part 2 #######
doc = nlp("David Bowie is a PERSON")
# Entity labels live in the same string store
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)
person_string = nlp.vocab.strings[person_hash]
print(person_string)
###### Section 7 #########
doc = nlp("Berlin is a nice city")
# Find proper nouns that are immediately followed by a verb.
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Bounds guard: the original indexed doc[token.i + 1]
        # unconditionally, which raises an IndexError whenever a proper
        # noun is the final token of the Doc.
        if token.i + 1 < len(doc) and doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)
###### Section 8 ##########
# # Load a larger model with vectors
# print("Loading medium model ...")
# nlp = spacy.load('en_core_web_md')
#
# # Compare two documents
# doc1 = nlp("I like fast food")
# doc2 = nlp("I like pizza")
# print(doc1.similarity(doc2))
####### Section 13 #########
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)
# Pattern 1: "amazon" (any case) followed by a title-cased proper noun
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
# Pattern 2: "ad-free" (tokenized as ad / - / free) followed by a noun
pattern2 = [{"LOWER": "ad"}, {"LOWER": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]
# Fresh Matcher with both patterns registered (v2 callback slot unused)
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)
for mid, start, end in matcher(doc):
    # Resolve the pattern name from the match hash and show the span
    print(doc.vocab.strings[mid], doc[start:end].text)
######### Section 14 ##########
# Load the list of country names (a JSON array of strings).
# json.load(f) is the idiomatic form of json.loads(f.read()).
with open("exercises/countries.json") as f:
    COUNTRIES = json.load(f)
nlp = English()
doc = nlp("Czech Republic may help Slovakia protect its airspace")
# PhraseMatcher matches whole Doc patterns instead of token-rule lists
matcher = PhraseMatcher(nlp.vocab)
# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)
# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
########### Section 15 ###########
# json.load(f) is the idiomatic form of json.loads(f.read()).
with open("exercises/countries.json") as f:
    COUNTRIES = json.load(f)
with open("exercises/country_text.txt") as f:
    TEXT = f.read()
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)
# Create a doc and find matches in it
doc = nlp(TEXT)
# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Label each matched span as a geopolitical entity
    span = Span(doc, start, end, label="GPE")
    # Append to the existing entities rather than overwriting them
    doc.ents = list(doc.ents) + [span]
    # The span root's head is the syntactic parent of the match
    span_root_head = span.root.head
    print(span_root_head.text, "-->", span.text)
# Print the GPE entities found in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])
# =========================== Chapter 3 =========================== #
import json
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span, Token
######## Section 6 #########
# Define the custom component
def length_component(doc):
    """Pipeline component: report the Doc's token length, then pass it on."""
    print("This document is {} tokens long.".format(len(doc)))
    return doc
# Load the small English model
nlp = spacy.load("en_core_web_sm")
# Insert the custom component at the very start of the pipeline
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)
# Process a text so the component runs once
doc = nlp("Hello there")
######### Section 7 #########
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# One pattern Doc per animal name, built in a single pipe pass
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)
def animal_component(doc):
    """Pipeline component: label every animal-name match as an ANIMAL entity."""
    hits = matcher(doc)
    # Overwrite doc.ents with one ANIMAL Span per match
    doc.ents = [Span(doc, s, e, label="ANIMAL") for _mid, s, e in hits]
    return doc
# Run the component right after the built-in entity recognizer
nlp.add_pipe(animal_component, after='ner')
print(nlp.pipe_names)
# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
########### Section 9 ######################
def get_reversed(token):
    """Getter for a Token extension: the token's text spelled backwards."""
    return "".join(reversed(token.text))
# Register the Token extension attribute 'is_country' with the default value False
Token.set_extension('is_country', default=False)
# 'reversed' is computed on access via the getter function
Token.set_extension('reversed', getter=get_reversed)
doc = nlp("I live in Spain.")
# Flag the token "Spain" as a country
doc[3]._.is_country = True
# Show both extension attributes for every token
print([(t, t._.is_country) for t in doc])
print([(t, t._.reversed) for t in doc])
########### Section 14 ######################
# json.load(f) is the idiomatic form of json.loads(f.read()).
with open("exercises/tweets.json") as f:
    TEXTS = json.load(f)
# Process the texts one at a time and print the adjectives
for text in TEXTS:
    doc = nlp(text)
    print([token.text for token in doc if token.pos_ == "ADJ"])
# Batch processing: nlp.pipe streams the texts through the pipeline
docs = list(nlp.pipe(TEXTS))
for doc in docs:
    print([token.text for token in doc if token.pos_ == "ADJ"])
# =========================== Chapter 4 =========================== #
import json
import random
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
#
# ###### Create training data ########
# with open("exercises/iphone.json") as f:
# TEXTS = json.loads(f.read())
#
# nlp = English()
# matcher = Matcher(nlp.vocab)
# pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
# patterns = [pattern1, pattern2]
# matcher.add("GADGET", None, *patterns)
#
# TRAINING_DATA = []
#
# # Create a Doc object for each text in TEXTS
# for doc in nlp.pipe(TEXTS):
# # Match on the doc and create a list of matched spans
# spans = [doc[start:end] for match_id, start, end in matcher(doc)]
#
# # Get (start character, end character, label) tuples of matches
# entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
#
# # Format the matches as a (doc.text, entities) tuple
# training_example = (doc.text, {"entities": entities})
#
# # Append the example to the training data
# TRAINING_DATA.append(training_example)
#
# print(*TRAINING_DATA, sep="\n")
########## Section 6 - Setting up pipeline ##############
# (text, annotations) pairs; entities are [start_char, end_char, label]
TRAINING_DATA = [
    ["How to preorder the iPhone X", { "entities": [[20, 28, "GADGET"]] }],
    ["iPhone X is coming", { "entities": [[0, 8, "GADGET"]] }],
    ["Should I pay $1,000 for the iPhone X?", { "entities": [[28, 36, "GADGET"]] }],
    ["The iPhone 8 reviews are here", { "entities": [[4, 12, "GADGET"]] }],
    ["Your iPhone goes up to 11 today", { "entities": [[5, 11, "GADGET"]] }],
    ["I need a new phone! Any tips?", { "entities": [] }]
]
# Create a blank English pipeline
nlp = spacy.blank("en")
# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Add the label 'GADGET' to the entity recognizer
ner.add_label("GADGET")
# Initialize the model weights
nlp.begin_training()
# Train for 10 iterations
for itn in range(10):
    # Shuffle so batches differ between iterations
    random.shuffle(TRAINING_DATA)
    losses = {}
    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        # Unzip the batch into parallel text / annotation sequences
        # in one pass (the original built two separate list
        # comprehensions over the same batch).
        texts, annotations = zip(*batch)
        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)
test_texts = [
    "Apple is slowing down the iPhone 8 and iPhone X - how to stop it",
    "I finally understand what the iPhone X ‘notch’ is for",
    "Everything you need to know about the Samsung Galaxy S9",
    "Looking to compare iPad models? Here’s how the 2018 lineup stacks up",
    "The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple",
    "what is the cheapest ipad, especially ipad pro???",
    "Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics"
]
# Run the trained model on unseen texts and show predicted entities
for doc in nlp.pipe(test_texts):
    print(doc.text, [(ent.label_, ent.text, ent.start_char, ent.end_char) for ent in doc.ents])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment