shantanuo/all_summary.py

## all_summary.py
# Pull the 'summary' field out of every fetched article.
summary = [entry['summary'] for entry in articles]
# The first summary is the sample text the other snippets work on.
sentence = summary[0]

## article_structure.py
{'_id': 'baff70092abcf695d73af5186c4df82f',
 '_score': 26.395569,
 'author': None,
 'authors': [],
 'clean_url': 'cnbctv18.com',
 'country': 'US',
 'excerpt': "Some analysts believe an Ethereum ETF can be more successful than the Bitcoin ETF. For example, Grayscale's Ethereum Trust (ETHE) is witnessing more institutional investors flocking to Grayscale's…",
 'is_opinion': False,
 'language': 'en',
 'link': 'https://www.cnbctv18.com/cryptocurrency/is-ethereum-etf-on-the-way-grayscale-ceo-deciphers-11178362.htm',
 'media': 'https://images.cnbctv18.com/wp-content/uploads/2021/09/ether-1019x573.jpg',
 'published_date': '2021-10-21 13:14:33',
 'published_date_precision': 'timezone unknown',
 'rank': 16951,
 'rights': 'cnbctv18.com',
 'summary': "Following the debut of Bitcoin futures ETF in the United States, the crypto market is abuzz with talks of an impending Ether ETF.Speaking on a show on CNBC, Michael Sonnenshein, CEO of Grayscale -- an asset management company with $52 billion in assets under management -- says it is possible. He said it 'stands to reason' the Securities and Exchange Committee (SEC) will proactively consider bringing Ethereum ETF and other similar products in the US market.Canada already has Bitcoin, Ethereum ETFsWhile US regulators have allowed Bitcoin futures ETF to be traded on the exchanges, Canada has allowed both Bitcoin and Ethereum ETFs.",
 'title': 'Is Ethereum ETF on the way? Grayscale CEO deciphers',
 'topic': 'news',
 'twitter_account': None
}

## demo.py
import time

# Interactive demo: read a text and run it through the chosen pipeline(s).
# `spacy_pipeline` and `nltk_pipeline` are expected to be defined elsewhere.
text = input("Enter the text to be tokenized: \n")
choice = input("\nEnter Your choice of library: \n->spaCy (s) \n->NLTK (n) \n->Both (b)\n")
if choice == 's':
    spacy_pipeline(text)
elif choice == 'n':
    nltk_pipeline(text)
elif choice == 'b':
    # Time each pipeline separately. `time` is the module here, so the clock
    # must be read through one of its functions; calling `time()` itself
    # raises TypeError.
    start = time.perf_counter()
    print("\t\t\tspaCy\n")
    spacy_pipeline(text)
    print(f"Time taken by spaCy: {time.perf_counter()-start}s")
    start = time.perf_counter()
    print("\n\t\t\tnltk\n")
    # The second run must be the NLTK pipeline, reported under its own name.
    nltk_pipeline(text)
    print(f"Time taken by NLTK: {time.perf_counter()-start}s")
else:
    print("Invalid choice!")

## fetch_articles.py
from newscatcherapi import NewsCatcherApiClient

# Placeholder value; put a real key here before running.
API_KEY = "YOUR API KEY GOES HERE"

# Client object created with the key above.
newscatcherapi = NewsCatcherApiClient(x_api_key=API_KEY)

# Run one search with the query string, lang='en' and page_size=100.
data = newscatcherapi.get_search(
    q="Bitcoin OR Ethereum OR crypto",
    lang='en',
    page_size=100,
)
# Keep the 'articles' entry of the response and show its first element.
articles = data['articles']
print(articles[0])

## import_nltk.py
import nltk

# Download the three NLTK resources by name.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

## install_libraries
# Shell commands (not Python): install the third-party packages the snippets import.
pip install spacy
pip install nltk
pip install newscatcherapi

## nltk_pipeline.py
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
# Split the sample text into word tokens.
tokens = word_tokenize(sentence)

# Stemming: pass every token through the Porter stemmer.
stemmer = PorterStemmer()
stemed_tokens = [stemmer.stem(token) for token in tokens]

# Lemmatization: run the WordNet lemmatizer over the already-stemmed tokens.
lemmatizer = WordNetLemmatizer()
nltk_lemma_list = [lemmatizer.lemmatize(stem) for stem in stemed_tokens]

print("Stemming + Lemmatization:")
print(nltk_lemma_list)

"""
Stemming + Lemmatization:
['follow', 'the', 'debut', 'of', 'bitcoin', 'futur',
'etf', 'in', 'the', 'unit', 'state', ',', 'the', 'crypto',
'market', 'is', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
'ether', 'etf.speak', 'on', 'a', 'show', 'on', 'cnbc', ',', 'michael',
'sonnenshein', ',', 'ceo', 'of', 'grayscal', '--', 'an', 'asset', 'manag',
'compani', 'with', '$', '52', 'billion', 'in', 'asset', 'under', 'manag',
'--', 'say', 'it', 'is', 'possibl', '.', 'He', 'said', 'it', "'stand", 'to',
'reason', "'", 'the', 'secur', 'and', 'exchang', 'committe', '(', 'sec', ')',
'will', 'proactiv', 'consid', 'bring', 'ethereum', 'etf', 'and', 'other', 'similar',
'product', 'in', 'the', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin', ',',
'ethereum', 'etfswhil', 'US', 'regul', 'have', 'allow', 'bitcoin', 'futur', 'etf', 'to',
'be', 'trade', 'on', 'the', 'exchang', ',', 'canada', 'ha', 'allow', 'both', 'bitcoin',
'and', 'ethereum', 'etf', '.']
"""

## nltk_stopwords.py
# Removing the stopwords: keep only tokens that are not in NLTK's
# English stop-word list.
nltk_stop_words = set(stopwords.words("english"))
normalized_tokens = [
    token for token in nltk_lemma_list if token not in nltk_stop_words
]

# Removing the punctuations from what is left.
normalized_tokens = remove_punctuations(normalized_tokens)
print(" ")
print("\nText after removing stopwords & punctuations:\n")
print(normalized_tokens)


"""
Text after removing stopwords & punctuations:

['follow', 'debut', 'bitcoin', 'futur', 'etf', 'unit', 'state',
'crypto', 'market', 'abuzz', 'talk', 'impend', 'ether', 'etf.speak',
'show', 'cnbc', 'michael', 'sonnenshein', 'ceo', 'grayscal', 'asset',
'manag', 'compani', '$', '52', 'billion', 'asset', 'manag', 'say',
'possibl', 'He', 'said', "'stand", 'reason', "'", 'secur', 'exchang',
'committe', 'sec', 'proactiv', 'consid', 'bring', 'ethereum', 'etf',
'similar', 'product', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin',
'ethereum', 'etfswhil', 'US', 'regul', 'allow', 'bitcoin', 'futur', 'etf',
'trade', 'exchang', 'canada', 'ha', 'allow', 'bitcoin', 'ethereum', 'etf']
"""

## punc_remove_function.py
# Helper used by both the NLTK and the spaCy snippets: strips punctuation
# tokens from a list of string tokens.
def remove_punctuations(normalized_tokens):
    """Remove punctuation tokens from *normalized_tokens* in place.

    A token is dropped only when it is exactly equal to one of the
    punctuation strings below; tokens that merely contain punctuation
    (e.g. "etf.speak") are kept.

    Args:
        normalized_tokens: list of tokens; it is modified in place.

    Returns:
        The same list object, with every punctuation token removed.
    """
    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
    # Rebuild the contents in one pass and assign them back through a slice.
    # Calling list.remove() while iterating the same list shifts the
    # remaining items left, so the token right after each removed one was
    # never examined and adjacent punctuation tokens survived.
    normalized_tokens[:] = [
        token for token in normalized_tokens if token not in punctuations
    ]
    return normalized_tokens

## remove_punc.py
# Helper used by both the NLTK and the spaCy snippets: strips punctuation
# tokens from a list of string tokens.
def remove_punctuations(normalized_tokens):
    """Remove punctuation tokens from *normalized_tokens* in place.

    A token is dropped only when it is exactly equal to one of the
    punctuation strings below; tokens that merely contain punctuation
    (e.g. "etf.speak") are kept.

    Args:
        normalized_tokens: list of tokens; it is modified in place.

    Returns:
        The same list object, with every punctuation token removed.
    """
    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
    # Filter in a single pass instead of calling list.remove() inside the
    # loop: removing during iteration skips the element that follows each
    # removal, which left consecutive punctuation tokens behind.
    kept = [token for token in normalized_tokens if token not in punctuations]
    # Slice assignment keeps the caller's list object, as before.
    normalized_tokens[:] = kept
    return normalized_tokens

## spacy_get_lemma.py
# Collect the lemma string (`lemma_`) of every token in the processed doc.
lemma_list = [tok.lemma_ for tok in doc]
print("Lemmatized tokens:\n")
print(lemma_list)


"""Lemmatized tokens:
['follow', 'the', 'debut', 'of', 'Bitcoin', 'future',
'etf', 'in', 'the', 'United', 'States', ',', 'the', 'crypto',
'market', 'be', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
'Ether', 'etf.speake', 'on', 'a', 'show', 'on', 'CNBC', ',', 'Michael',
'Sonnenshein', ',', 'ceo', 'of', 'Grayscale', '--', 'an', 'asset',
'management', 'company', 'with', '$', '52', 'billion', 'in', 'asset',
'under', 'management', '--', 'say', '-PRON-', 'be', 'possible', '.',
'-PRON-', 'say', '-PRON-', "'", 'stand', 'to', 'reason', "'", 'the',
'Securities', 'and', 'Exchange', 'Committee', '(', 'SEC', ')', 'will',
'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'and', 'other',
'similar', 'product', 'in', 'the', 'US', 'market', '.', 'Canada', 'already',
'have', 'Bitcoin', ',', 'Ethereum', 'ETFsWhile', 'US', 'regulator', 'have',
'allow', 'Bitcoin', 'future', 'etf', 'to', 'be', 'trade', 'on', 'the', 'exchange',
',', 'Canada', 'have', 'allow', 'both', 'Bitcoin', 'and', 'Ethereum', 'etf', '.']
"""

## spacy_nlp_pipeline_ex.py
import spacy
# Load the small English model with the listed components switched off; the
# intent is to keep just the POS tagger and the lemmatizer.
# NOTE(review): confirm that the installed spaCy version still tags and
# lemmatizes correctly with 'tok2vec' and 'attribute_ruler' disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner', 'tok2vec', 'attribute_ruler'],
)
# Run the pipeline on the sample text.
doc = nlp(sentence)

## spacy_punc.py
# Removing the stopwords: look each lemma up in the pipeline's vocabulary
# and keep it only when the entry is not flagged as a stop word.
normalized_tokens = [
    lemma for lemma in lemma_list if not nlp.vocab[lemma].is_stop
]
# Removing the punctuations from what is left.
normalized_tokens = remove_punctuations(normalized_tokens)
print("\nText after removing stopwords & punctuations:\n")
print(normalized_tokens)

"""Text after removing stopwords & punctuations:

['follow', 'debut', 'Bitcoin', 'future', 'etf', 'United',
'States', 'crypto', 'market', 'abuzz', 'talk', 'impend',
'Ether', 'etf.speake', 'CNBC', 'Michael', 'Sonnenshein', 'ceo',
'Grayscale', 'asset', 'management', 'company', '$', '52', 'billion',
'asset', 'management', '-PRON-', 'possible', '-PRON-', '-PRON-', "'",
'stand', 'reason', "'", 'Securities', 'Exchange', 'Committee', 'SEC',
'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'similar', 'product',
'market', 'Canada', 'Bitcoin', 'Ethereum', 'ETFsWhile', 'regulator', 'allow',
'Bitcoin', 'future', 'etf', 'trade', 'exchange', 'Canada', 'allow', 'Bitcoin',
'Ethereum', 'etf']
"""

## spacy_tokenizer_example.py
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# `vocab` is an `English` instance (despite its name); its `tokenizer`
# attribute is the tokenizer with the default settings for English.
vocab = English()
tokenizer = vocab.tokenizer
# Tokenize the sample text and print the tokens as a list.
tokens = tokenizer(sentence)
print([token for token in tokens])

"""[Following, the, debut, of, Bitcoin, futures, ETF, in, the, United,
States, ,, the, crypto, market, is, abuzz, with, talks, of, an, impending,
Ether, ETF.Speaking, on, a, show, on, CNBC, ,, Michael, Sonnenshein, ,, CEO,
of, Grayscale, --, an, asset, management, company, with, $, 52, billion, in,
assets, under, management, --, says, it, is, possible, ., He, said, it, ',
stands, to, reason, ', the, Securities, and, Exchange, Committee, (, SEC, ),
will, proactively, consider, bringing, Ethereum, ETF, and, other, similar,
products, in, the, US, market, ., Canada, already, has, Bitcoin, ,, Ethereum,
ETFsWhile, US, regulators, have, allowed, Bitcoin, futures, ETF, to, be, traded,
on, the, exchanges, ,, Canada, has, allowed, both, Bitcoin, and, Ethereum, ETFs, .]"""
	# Pull the 'summary' field out of every fetched article.
	summary = [entry['summary'] for entry in articles]
	# The first summary is the sample text the other snippets work on.
	sentence = summary[0]
	{'_id': 'baff70092abcf695d73af5186c4df82f',
	'_score': 26.395569,
	'author': None,
	'authors': [],
	'clean_url': 'cnbctv18.com',
	'country': 'US',
	'excerpt': "Some analysts believe an Ethereum ETF can be more successful than the Bitcoin ETF. For example, Grayscale's Ethereum Trust (ETHE) is witnessing more institutional investors flocking to Grayscale's…",
	'is_opinion': False,
	'language': 'en',
	'link': 'https://www.cnbctv18.com/cryptocurrency/is-ethereum-etf-on-the-way-grayscale-ceo-deciphers-11178362.htm',
	'media': 'https://images.cnbctv18.com/wp-content/uploads/2021/09/ether-1019x573.jpg',
	'published_date': '2021-10-21 13:14:33',
	'published_date_precision': 'timezone unknown',
	'rank': 16951,
	'rights': 'cnbctv18.com',
	'summary': "Following the debut of Bitcoin futures ETF in the United States, the crypto market is abuzz with talks of an impending Ether ETF.Speaking on a show on CNBC, Michael Sonnenshein, CEO of Grayscale -- an asset management company with $52 billion in assets under management -- says it is possible. He said it 'stands to reason' the Securities and Exchange Committee (SEC) will proactively consider bringing Ethereum ETF and other similar products in the US market.Canada already has Bitcoin, Ethereum ETFsWhile US regulators have allowed Bitcoin futures ETF to be traded on the exchanges, Canada has allowed both Bitcoin and Ethereum ETFs.",
	'title': 'Is Ethereum ETF on the way? Grayscale CEO deciphers',
	'topic': 'news',
	'twitter_account': None
	}
	import time

	# Interactive demo: read a text and run it through the chosen pipeline(s).
	# `spacy_pipeline` and `nltk_pipeline` are expected to be defined elsewhere.
	text = input("Enter the text to be tokenized: \n")
	choice = input("\nEnter Your choice of library: \n->spaCy (s) \n->NLTK (n) \n->Both (b)\n")
	if choice == 's':
		spacy_pipeline(text)
	elif choice == 'n':
		nltk_pipeline(text)
	elif choice == 'b':
		# Time each pipeline separately. `time` is the module here, so the
		# clock must be read through one of its functions; calling `time()`
		# itself raises TypeError.
		start = time.perf_counter()
		print("\t\t\tspaCy\n")
		spacy_pipeline(text)
		print(f"Time taken by spaCy: {time.perf_counter()-start}s")
		start = time.perf_counter()
		print("\n\t\t\tnltk\n")
		# The second run must be the NLTK pipeline, reported under its own name.
		nltk_pipeline(text)
		print(f"Time taken by NLTK: {time.perf_counter()-start}s")
	else:
		print("Invalid choice!")
	from newscatcherapi import NewsCatcherApiClient

	# Placeholder value; put a real key here before running.
	API_KEY = "YOUR API KEY GOES HERE"

	# Client object created with the key above.
	newscatcherapi = NewsCatcherApiClient(x_api_key=API_KEY)

	# Run one search with the query string, lang='en' and page_size=100.
	data = newscatcherapi.get_search(
		q="Bitcoin OR Ethereum OR crypto",
		lang='en',
		page_size=100,
	)
	# Keep the 'articles' entry of the response and show its first element.
	articles = data['articles']
	print(articles[0])
	import nltk

	# Download the three NLTK resources by name.
	for resource in ('punkt', 'wordnet', 'stopwords'):
		nltk.download(resource)
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from nltk.stem.porter import *
	# Split the sample text into word tokens.
	tokens = word_tokenize(sentence)

	# Stemming: pass every token through the Porter stemmer. (The original
	# loop body had lost its indentation and could not be parsed.)
	stemmer = PorterStemmer()
	stemed_tokens = [stemmer.stem(token) for token in tokens]

	# Lemmatization: run the WordNet lemmatizer over the already-stemmed tokens.
	lemmatizer = WordNetLemmatizer()
	nltk_lemma_list = [lemmatizer.lemmatize(stem) for stem in stemed_tokens]

	print("Stemming + Lemmatization:")
	print(nltk_lemma_list)

	"""
	Stemming + Lemmatization:
	['follow', 'the', 'debut', 'of', 'bitcoin', 'futur',
	'etf', 'in', 'the', 'unit', 'state', ',', 'the', 'crypto',
	'market', 'is', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
	'ether', 'etf.speak', 'on', 'a', 'show', 'on', 'cnbc', ',', 'michael',
	'sonnenshein', ',', 'ceo', 'of', 'grayscal', '--', 'an', 'asset', 'manag',
	'compani', 'with', '$', '52', 'billion', 'in', 'asset', 'under', 'manag',
	'--', 'say', 'it', 'is', 'possibl', '.', 'He', 'said', 'it', "'stand", 'to',
	'reason', "'", 'the', 'secur', 'and', 'exchang', 'committe', '(', 'sec', ')',
	'will', 'proactiv', 'consid', 'bring', 'ethereum', 'etf', 'and', 'other', 'similar',
	'product', 'in', 'the', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin', ',',
	'ethereum', 'etfswhil', 'US', 'regul', 'have', 'allow', 'bitcoin', 'futur', 'etf', 'to',
	'be', 'trade', 'on', 'the', 'exchang', ',', 'canada', 'ha', 'allow', 'both', 'bitcoin',
	'and', 'ethereum', 'etf', '.']
	"""
	# Removing the stopwords: keep only tokens that are not in NLTK's
	# English stop-word list. (The original for/if bodies had lost their
	# indentation and could not be parsed.)
	nltk_stop_words = set(stopwords.words("english"))
	normalized_tokens = [
		token for token in nltk_lemma_list if token not in nltk_stop_words
	]

	# Removing the punctuations from what is left.
	normalized_tokens = remove_punctuations(normalized_tokens)
	print(" ")
	print("\nText after removing stopwords & punctuations:\n")
	print(normalized_tokens)


	"""
	Text after removing stopwords & punctuations:

	['follow', 'debut', 'bitcoin', 'futur', 'etf', 'unit', 'state',
	'crypto', 'market', 'abuzz', 'talk', 'impend', 'ether', 'etf.speak',
	'show', 'cnbc', 'michael', 'sonnenshein', 'ceo', 'grayscal', 'asset',
	'manag', 'compani', '$', '52', 'billion', 'asset', 'manag', 'say',
	'possibl', 'He', 'said', "'stand", 'reason', "'", 'secur', 'exchang',
	'committe', 'sec', 'proactiv', 'consid', 'bring', 'ethereum', 'etf',
	'similar', 'product', 'US', 'market.canada', 'alreadi', 'ha', 'bitcoin',
	'ethereum', 'etfswhil', 'US', 'regul', 'allow', 'bitcoin', 'futur', 'etf',
	'trade', 'exchang', 'canada', 'ha', 'allow', 'bitcoin', 'ethereum', 'etf']
	"""
	# Helper for the normalisation snippets: strips punctuation tokens from a
	# list of string tokens.
	def remove_punctuations(normalized_tokens):
		"""Remove punctuation tokens from *normalized_tokens* in place.

		A token is dropped only when it is exactly equal to one of the
		punctuation strings below; tokens that merely contain punctuation
		(e.g. "etf.speak") are kept.

		Args:
			normalized_tokens: list of tokens; it is modified in place.

		Returns:
			The same list object, with every punctuation token removed.
		"""
		# '|' replaces the original '\|' entry, a backslash-escaped pipe
		# that would never match a bare '|' token.
		punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--')
		# Filter in one pass and assign back through a slice: calling
		# list.remove() while iterating the same list skips the element
		# after each removal, so adjacent punctuation tokens survived.
		normalized_tokens[:] = [
			token for token in normalized_tokens if token not in punctuations
		]
		return normalized_tokens
	# Collect the lemma string (`lemma_`) of every token in the processed doc.
	# (The original loop body had lost its indentation and could not be parsed.)
	lemma_list = [tok.lemma_ for tok in doc]
	print("Lemmatized tokens:\n")
	print(lemma_list)



	"""Lemmatized tokens:
	['follow', 'the', 'debut', 'of', 'Bitcoin', 'future',
	'etf', 'in', 'the', 'United', 'States', ',', 'the', 'crypto',
	'market', 'be', 'abuzz', 'with', 'talk', 'of', 'an', 'impend',
	'Ether', 'etf.speake', 'on', 'a', 'show', 'on', 'CNBC', ',', 'Michael',
	'Sonnenshein', ',', 'ceo', 'of', 'Grayscale', '--', 'an', 'asset',
	'management', 'company', 'with', '$', '52', 'billion', 'in', 'asset',
	'under', 'management', '--', 'say', '-PRON-', 'be', 'possible', '.',
	'-PRON-', 'say', '-PRON-', "'", 'stand', 'to', 'reason', "'", 'the',
	'Securities', 'and', 'Exchange', 'Committee', '(', 'SEC', ')', 'will',
	'proactively', 'consider', 'bring', 'Ethereum', 'etf', 'and', 'other',
	'similar', 'product', 'in', 'the', 'US', 'market', '.', 'Canada', 'already',
	'have', 'Bitcoin', ',', 'Ethereum', 'ETFsWhile', 'US', 'regulator', 'have',
	'allow', 'Bitcoin', 'future', 'etf', 'to', 'be', 'trade', 'on', 'the', 'exchange',
	',', 'Canada', 'have', 'allow', 'both', 'Bitcoin', 'and', 'Ethereum', 'etf', '.']
	"""
	import spacy
	# Load the small English model with the listed components switched off; the
	# intent is to keep just the POS tagger and the lemmatizer.
	# NOTE(review): confirm that the installed spaCy version still tags and
	# lemmatizes correctly with 'tok2vec' and 'attribute_ruler' disabled.
	nlp = spacy.load(
		'en_core_web_sm',
		disable=['parser', 'ner', 'tok2vec', 'attribute_ruler'],
	)
	# Run the pipeline on the sample text.
	doc = nlp(sentence)