Skip to content

Instantly share code, notes, and snippets.

@shantanuo
Forked from NewscatcherAPI/all_summary.py
Created December 30, 2021 09:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shantanuo/fae2bb6f1239a9e52838597891031255 to your computer and use it in GitHub Desktop.
Save shantanuo/fae2bb6f1239a9e52838597891031255 to your computer and use it in GitHub Desktop.
spacy_vs_nltk_newscatcher_blog
# Gather the 'summary' text of every fetched article; the first summary is
# used as the sample sentence for the tokenization examples.
summary = [item['summary'] for item in articles]
sentence = summary[0]
{'_id': 'baff70092abcf695d73af5186c4df82f',
'_score': 26.395569,
'author': None,
'authors': [],
'clean_url': 'cnbctv18.com',
'country': 'US',
'excerpt': "Some analysts believe an Ethereum ETF can be more successful than the Bitcoin ETF. For example, Grayscale's Ethereum Trust (ETHE) is witnessing more institutional investors flocking to Grayscale's…",
'is_opinion': False,
'language': 'en',
'link': 'https://www.cnbctv18.com/cryptocurrency/is-ethereum-etf-on-the-way-grayscale-ceo-deciphers-11178362.htm',
'media': 'https://images.cnbctv18.com/wp-content/uploads/2021/09/ether-1019x573.jpg',
'published_date': '2021-10-21 13:14:33',
'published_date_precision': 'timezone unknown',
'rank': 16951,
'rights': 'cnbctv18.com',
'summary': "Following the debut of Bitcoin futures ETF in the United States, the crypto market is abuzz with talks of an impending Ether ETF.Speaking on a show on CNBC, Michael Sonnenshein, CEO of Grayscale -- an asset management company with $52 billion in assets under management -- says it is possible. He said it 'stands to reason' the Securities and Exchange Committee (SEC) will proactively consider bringing Ethereum ETF and other similar products in the US market.Canada already has Bitcoin, Ethereum ETFsWhile US regulators have allowed Bitcoin futures ETF to be traded on the exchanges, Canada has allowed both Bitcoin and Ethereum ETFs.",
'title': 'Is Ethereum ETF on the way? Grayscale CEO deciphers',
'topic': 'news',
'twitter_account': None
}
import time

# Interactive driver: read a text and the user's choice of library, then run
# the matching pipeline(s). `spacy_pipeline` and `nltk_pipeline` are expected
# to be defined elsewhere (they are not part of this snippet).
text = input("Enter the text to be tokenized: \n")
choice = input("\nEnter Your choice of library: \n->spaCy (s) \n->NLTK (n) \n->Both (b)\n")

if choice == 's':
    spacy_pipeline(text)
elif choice == 'n':
    nltk_pipeline(text)
elif choice == 'b':
    # Run both pipelines back to back and report the elapsed time of each.
    # `time` is the module here, so the clock must be reached through it;
    # perf_counter() is the clock intended for measuring intervals.
    start = time.perf_counter()
    print("\t\t\tspaCy\n")
    spacy_pipeline(text)
    print(f"Time taken by spaCy: {time.perf_counter() - start}s")

    start = time.perf_counter()
    print("\n\t\t\tnltk\n")
    # The second run must exercise NLTK, not spaCy again, for the comparison
    # to mean anything.
    nltk_pipeline(text)
    print(f"Time taken by NLTK: {time.perf_counter() - start}s")
else:
    print("Invalid choice!")
from newscatcherapi import NewsCatcherApiClient

# Placeholder: replace with a real NewsCatcher API key before running.
API_KEY = "YOUR API KEY GOES HERE"

# Client authenticated with the key above.
newscatcherapi = NewsCatcherApiClient(x_api_key=API_KEY)

# Search for English-language crypto articles with a page size of 100.
data = newscatcherapi.get_search(
    q="Bitcoin OR Ethereum OR crypto",
    lang='en',
    page_size=100,
)

# The matching articles sit under the 'articles' key of the response;
# print the first one to inspect its fields.
articles = data['articles']
print(articles[0])
import nltk

# Download the NLTK data packages requested by name below (presumably the
# ones backing word_tokenize, WordNetLemmatizer and the stopwords corpus
# used by the other snippets).
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
pip install spacy
pip install nltk
pip install newscatcherapi
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
# Split the sample sentence into word tokens and stem each one with the
# Porter stemmer.
stemmer = PorterStemmer()
tokens = word_tokenize(sentence)
stemed_tokens = [stemmer.stem(token) for token in tokens]

# Feed the stems (not the raw tokens) through the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
nltk_lemma_list = [lemmatizer.lemmatize(stem) for stem in stemed_tokens]

print("Stemming + Lemmatization:")
print(nltk_lemma_list)
"""
Stemming + Lemmatization:
['follow', 'the', 'debut', 'of', 'bitcoin', 'futur',
'etf', 'in', 'the', 'unit', 'state', ',', 'the', 'crypto',
'market', 'is', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
'ether', 'etf.speak', 'on', 'a', 'show', 'on', 'cnbc', ',', 'michael',
'sonnenshein', ',', 'ceo', 'of', 'grayscal', '--', 'an', 'asset', 'manag',
'compani', 'with', '$', '52', 'billion', 'in', 'asset', 'under', 'manag',
'--', 'say', 'it', 'is', 'possibl', '.', 'He', 'said', 'it', "'stand", 'to',
'reason', "'", 'the', 'secur', 'and', 'exchang', 'committe', '(', 'sec', ')',
'will', 'proactiv', 'consid', 'bring', 'ethereum', 'etf', 'and', 'other', 'similar',
'product', 'in', 'the', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin', ',',
'ethereum', 'etfswhil', 'US', 'regul', 'have', 'allow', 'bitcoin', 'futur', 'etf', 'to',
'be', 'trade', 'on', 'the', 'exchang', ',', 'canada', 'ha', 'allow', 'both', 'bitcoin',
'and', 'ethereum', 'etf', '.']
"""
# Drop every lemma that appears in NLTK's English stopword list.
nltk_stop_words = set(stopwords.words("english"))
normalized_tokens = [
    token for token in nltk_lemma_list if token not in nltk_stop_words
]

# Then strip the standalone punctuation tokens.
normalized_tokens = remove_punctuations(normalized_tokens)

print(" ")
print("\nText after removing stopwords & punctuations:\n")
print(normalized_tokens)
"""
Text after removing stopwords & punctuations:
['follow', 'debut', 'bitcoin', 'futur', 'etf', 'unit', 'state',
'crypto', 'market', 'abuzz', 'talk', 'impend', 'ether', 'etf.speak',
'show', 'cnbc', 'michael', 'sonnenshein', 'ceo', 'grayscal', 'asset',
'manag', 'compani', '$', '52', 'billion', 'asset', 'manag', 'say',
'possibl', 'He', 'said', "'stand", 'reason', "'", 'secur', 'exchang',
'committe', 'sec', 'proactiv', 'consid', 'bring', 'ethereum', 'etf',
'similar', 'product', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin',
'ethereum', 'etfswhil', 'US', 'regul', 'allow', 'bitcoin', 'futur', 'etf',
'trade', 'exchang', 'canada', 'ha', 'allow', 'bitcoin', 'ethereum', 'etf']
"""
# Neither spaCy nor NLTK have any methods for filtering punctuations
def remove_punctuations(normalized_tokens):
    """Remove standalone punctuation tokens from a token list, in place.

    Args:
        normalized_tokens: List of tokens. The list is modified in place.

    Returns:
        The same list object, with every token equal to one of the
        punctuation marks listed below removed.
    """
    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
    # Filter through slice assignment instead of calling remove() inside a
    # loop over the same list: removing while iterating shifts the remaining
    # items left, so the token right after each removed one is never
    # examined and adjacent punctuation marks survive.
    normalized_tokens[:] = [
        word for word in normalized_tokens if word not in punctuations
    ]
    return normalized_tokens
# Neither spaCy nor NLTK have any methods for filtering punctuations
def remove_punctuations(normalized_tokens):
    """Remove standalone punctuation tokens from a token list, in place.

    Args:
        normalized_tokens: List of tokens. The list is modified in place.

    Returns:
        The same list object, with every token equal to one of the
        punctuation marks listed below removed.
    """
    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
    # Filter through slice assignment instead of calling remove() inside a
    # loop over the same list: removing while iterating shifts the remaining
    # items left, so the token right after each removed one is never
    # examined and adjacent punctuation marks survive.
    normalized_tokens[:] = [
        word for word in normalized_tokens if word not in punctuations
    ]
    return normalized_tokens
# Collect the lemma string of every token in the processed spaCy document.
lemma_list = [tok.lemma_ for tok in doc]

print("Lemmatized tokens:\n")
print(lemma_list)
"""Lemmatized tokens:
['follow', 'the', 'debut', 'of', 'Bitcoin', 'future',
'etf', 'in', 'the', 'United', 'States', ',', 'the', 'crypto',
'market', 'be', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
'Ether', 'etf.speake', 'on', 'a', 'show', 'on', 'CNBC', ',', 'Michael',
'Sonnenshein', ',', 'ceo', 'of', 'Grayscale', '--', 'an', 'asset',
'management', 'company', 'with', '$', '52', 'billion', 'in', 'asset',
'under', 'management', '--', 'say', '-PRON-', 'be', 'possible', '.',
'-PRON-', 'say', '-PRON-', "'", 'stand', 'to', 'reason', "'", 'the',
'Securities', 'and', 'Exchange', 'Committee', '(', 'SEC', ')', 'will',
'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'and', 'other',
'similar', 'product', 'in', 'the', 'US', 'market', '.', 'Canada', 'already',
'have', 'Bitcoin', ',', 'Ethereum', 'ETFsWhile', 'US', 'regulator', 'have',
'allow', 'Bitcoin', 'future', 'etf', 'to', 'be', 'trade', 'on', 'the', 'exchange',
',', 'Canada', 'have', 'allow', 'both', 'Bitcoin', 'and', 'Ethereum', 'etf', '.']
"""
import spacy
# Load the small English pipeline with the 'parser', 'ner', 'tok2vec' and
# 'attribute_ruler' components disabled; the stated intent is to keep just
# the POS tagger and the lemmatizer.
# NOTE(review): in some spaCy versions the tagger and lemmatizer may depend
# on 'tok2vec' / 'attribute_ruler' -- confirm this disable list against the
# installed spaCy version.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner',
'tok2vec', 'attribute_ruler'])
# Run the pipeline over the sample text (`sentence` is assigned elsewhere in
# this file).
doc = nlp(sentence)
# Keep only the lemmas whose entry in the pipeline vocabulary is not flagged
# as a stopword.
normalized_tokens = [
    lemma for lemma in lemma_list if not nlp.vocab[lemma].is_stop
]

# Then strip the standalone punctuation tokens.
normalized_tokens = remove_punctuations(normalized_tokens)

print("\nText after removing stopwords & punctuations:\n")
print(normalized_tokens)
"""Text after removing stopwords & punctuations:
['follow', 'debut', 'Bitcoin', 'future', 'etf', 'United',
'States', 'crypto', 'market', 'abuzz', 'talk', 'impend',
'Ether', 'etf.speake', 'CNBC', 'Michael', 'Sonnenshein', 'ceo',
'Grayscale', 'asset', 'management', 'company', '$', '52', 'billion',
'asset', 'management', '-PRON-', 'possible', '-PRON-', '-PRON-', "'",
'stand', 'reason', "'", 'Securities', 'Exchange', 'Committee', 'SEC',
'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'similar', 'product',
'market', 'Canada', 'Bitcoin', 'Ethereum', 'ETFsWhile', 'regulator', 'allow',
'Bitcoin', 'future', 'etf', 'trade', 'exchange', 'Canada', 'allow', 'Bitcoin',
'Ethereum', 'etf']
"""
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# NOTE: despite its name, `vocab` holds the object returned by English()
# (the English language object), not a bare vocabulary.
vocab = English()
# Take the tokenizer attached to that English object (default settings for
# English); no other pipeline component is run.
tokenizer = vocab.tokenizer
# Tokenize the sample text (`sentence` is assigned elsewhere in this file)
# and print the resulting tokens as a list.
tokens = tokenizer(sentence)
print(list(tokens))
"""[Following, the, debut, of, Bitcoin, futures, ETF, in, the, United,
States, ,, the, crypto, market, is, abuzz, with, talks, of, an, impending,
Ether, ETF.Speaking, on, a, show, on, CNBC, ,, Michael, Sonnenshein, ,, CEO,
of, Grayscale, --, an, asset, management, company, with, $, 52, billion, in,
assets, under, management, --, says, it, is, possible, ., He, said, it, ',
stands, to, reason, ', the, Securities, and, Exchange, Committee, (, SEC, ),
will, proactively, consider, bringing, Ethereum, ETF, and, other, similar,
products, in, the, US, market, ., Canada, already, has, Bitcoin, ,, Ethereum,
ETFsWhile, US, regulators, have, allowed, Bitcoin, futures, ETF, to, be, traded,
on, the, exchanges, ,, Canada, has, allowed, both, Bitcoin, and, Ethereum, ETFs, .]"""
@shantanuo
Copy link
Author

normalized_tokens = remove_punctuations(normalized_tokens)
The line mentioned above will return "null", since normalized_tokens list is being updated in-place, it should be...
remove_punctuations(normalized_tokens)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment