Aditya1001001/consolidated_pipeline.py

## consolidated_pipeline.py
def dividend_info(article):
  """Print the dividend details that can be extracted from one article.

  A headline containing the token "date" is handled as an ex-dividend
  announcement: the date is read from the headline, the per-share amount
  and the pay date from the summary. Any other headline is searched for a
  "US$<amount>" figure instead. Nothing is printed when the date (first
  case) or the amount (second case) cannot be found.

  Args:
    article: mapping with a 'title' entry; a 'summary' entry is also read
      for ex-dividend headlines.
  """
  headline = nlp(article['title'])
  if any(token.text.lower() == 'date' for token in headline):
    date = get_date(headline)
    if date:
      ticker = get_ticker(headline)
      # Parse the summary once and hand the same doc to both extractors
      # instead of running the whole pipeline twice.
      summary = nlp(article['summary'])
      amount = get_amount_summary(summary)
      pay_date = get_pay_date(summary)
      print("HEADLINE: " + article['title'])
      print(f"\nTICKER: {ticker}\nDATE: {date}\nAMOUNT: {amount} per share to be paid on {pay_date}\n")
  else:
    dividend = get_amount_headline(headline)
    if dividend:
      ticker = get_ticker(headline)
      print("NEWS HEADLINE: " + article['title'])
      print(f"\nTICKER: {ticker}\nAMOUNT: {dividend}\n")

## demo_code.py
# Run the extraction over every loaded article, pausing 0.2 s between
# articles.
for article in news_articles:
  dividend_info(article)
  time.sleep(0.2)

## get_ex_date.py
def get_date(doc):
  """Return the first date-like span found in ``doc``.

  The pattern is a proper noun, a number-like token, a comma and another
  number-like token, e.g. "November 12, 2021".

  Args:
    doc: spaCy Doc to search (the pipeline passes the parsed headline).

  Returns:
    The matching span of ``doc``, or False when nothing matches.
  """
  date_matcher = Matcher(nlp.vocab)
  pattern = [{"POS": "PROPN"}, {"LIKE_NUM": True},
            {"text": ","}, {"LIKE_NUM": True}]
  # NOTE(review): this is the spaCy v2 call form (key, on_match, *patterns);
  # spaCy v3 expects add(key, [pattern]) -- confirm the installed version
  # before upgrading.
  date_matcher.add("EX_DATE", None, pattern)
  # Run the matcher once and reuse the result rather than matching twice.
  matches = date_matcher(doc)
  if not matches:
    return False
  _, start, end = matches[0]
  return doc[start:end]

## get_headline_amt.py
def get_amount_headline(doc):
  """Return the first "US$<number>" span found in ``doc``.

  Args:
    doc: spaCy Doc to search (the pipeline passes the parsed headline).

  Returns:
    The span made of the "US$" token and the number-like token after it,
    or False when the headline has no such pair.
  """
  dividend_matcher = Matcher(nlp.vocab)
  pattern = [{"ORTH": "US$"}, {"LIKE_NUM": True}]
  dividend_matcher.add("USD", None, pattern)
  # Run the matcher once and reuse the result rather than matching twice.
  matches = dividend_matcher(doc)
  if not matches:
    return False
  _, start, end = matches[0]
  return doc[start:end]

# Example: extract the dividend amount from a headline that quotes it
# as "US$<number>".
doc = nlp("There's A Lot To Like About ConnectOne Bancorp's (NASDAQ:CNOB) Upcoming US$0.13 Dividend")
print(get_amount_headline(doc))

# Output recorded for the example above:
# US$0.13

## get_org_1.py
def get_org(doc):
  """Return the organisation name that precedes an opening parenthesis.

  The pattern is one or more proper nouns, optionally a conjunction and
  further proper nouns, an optional possessive marker, and then "(".
  When several candidates match, the one covering the most tokens wins
  (the earliest such candidate on a tie).

  Args:
    doc: spaCy Doc to search.

  Returns:
    The matched span without its final token, or the string
    "<doc text> -> NO MATCH FOUND" when nothing matches.
  """
  org_matcher = Matcher(nlp.vocab)
  pattern = [{'POS': 'PROPN', 'OP': '+'},
            {'POS': 'CCONJ', 'OP': '?'},
            {'POS': 'PROPN', 'OP': '*'},
            {'ORTH': '\'', 'OP': '?'},
            {'ORTH': '\'s', 'OP': '?'},
            {'ORTH': '(', 'OP': '+'}]
  org_matcher.add("ORG", None, pattern)

## get_org_2.py
  candidates = org_matcher(doc)
  if not candidates:
    return f"{doc.text} -> NO MATCH FOUND"
  # max() keeps the first of equally long candidates, which also covers
  # the single-candidate case.
  _, start, end = max(candidates, key=lambda m: m[2] - m[1])

## get_org_3.py
  # Leave out the last matched token (the pattern ends on "(").
  return doc[start:end - 1]

## get_pay_date.py
def get_pay_date(doc):
  """Return the date that follows "paid on" in ``doc``.

  The pattern is "paid", "on", a proper noun, a number-like token, a
  comma, another number-like token and a closing ".".

  Args:
    doc: spaCy Doc to search (the pipeline passes the parsed summary).

  Returns:
    The span between "paid on" and the final ".", or False when the
    phrase is not present (the same convention get_date uses).
  """
  pay_date_matcher = Matcher(nlp.vocab)
  pattern = [{"ORTH": "paid"}, {"ORTH": "on"},
             {"POS": "PROPN"}, {"LIKE_NUM": True},
             {"ORTH": ","}, {"LIKE_NUM": True},
             {"ORTH": "."}]
  pay_date_matcher.add("AMOUNT", None, pattern)
  matches = pay_date_matcher(doc)
  if not matches:
    # Indexing an empty result used to raise IndexError and abort the
    # whole run on the first summary without a pay date.
    return False
  _, start, end = matches[0]
  # Skip the two leading tokens ("paid on") and the trailing ".".
  return doc[start + 2:end - 1]

## get_summary_amount.py
def get_amount_summary(doc):
  """Return the "$<number>" part of a "$<number> per share" phrase.

  Args:
    doc: spaCy Doc to search (the pipeline passes the parsed summary).

  Returns:
    The span holding the "$" token and the number-like token, or False
    when the phrase is not present (the same convention get_date uses).
  """
  per_share_matcher = Matcher(nlp.vocab)
  pattern = [{"ORTH": "$"}, {"LIKE_NUM": True},
             {"LOWER": "per"}, {"LOWER": "share"}]
  per_share_matcher.add("AMOUNT", None, pattern)
  matches = per_share_matcher(doc)
  if not matches:
    # Indexing an empty result used to raise IndexError and abort the
    # whole run on the first summary without a per-share amount.
    return False
  _, start, end = matches[0]
  # Drop the two trailing tokens ("per share").
  return doc[start:end - 2]

## get_ticker.py
def get_ticker(doc):
  """Return the first parenthesised ticker expression found in ``doc``.

  The pattern is "(", an alphabetic token, any number of ":" tokens, any
  number of alphabetic tokens, and ")".

  Args:
    doc: spaCy Doc to search.

  Returns:
    The span of the first match, from "(" through ")" inclusive, or the
    string "<doc text> -> NO MATCH FOUND" when nothing matches.
  """
  ticker_matcher = Matcher(nlp.vocab)
  pattern = [{'ORTH': '('}, {'IS_ALPHA': True},
             {'ORTH': ':', 'OP': '*'},
             {'IS_ALPHA': True, 'OP': '*'},
            {'ORTH': ')'}]
  ticker_matcher.add("ORG", None, pattern)
  found = ticker_matcher(doc)
  if not found:
    return f"{doc.text} -> NO MATCH FOUND"
  _, start, end = found[0]
  return doc[start:end]

## hello_world_matcher.py
# Register the pattern under the key "HELLO_WORLD"; no on-match callback
# is supplied (second argument is None).
matcher.add("HELLO_WORLD", None, pattern)

# Build a Doc from the text to be searched and run the matcher over it.
doc = nlp("hello world!\nHello World.")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple of token indices.
for match_id, start, end in matches:
    span = doc[start:end]  # The matched span
    print(match_id, start, end, span.text)

# Output
# 2008415248711360438 0 3 hello world!
# 2008415248711360438 4 7 Hello World.

## hello_world_pattern.py
# One dict per token: "hello" and "world" compared on their lower-cased
# form, followed by one or more punctuation tokens.
pattern = [{'LOWER': 'hello'},
           {'LOWER': 'world'},
           {'IS_PUNCT': True, 'OP': '+'}]

## import_modules.py
import json
import time
import spacy
from spacy.matcher import Matcher

## init_matcher.py
import spacy
from spacy.matcher import Matcher

# Load the 'en_core_web_sm' pipeline; `nlp` turns text into Doc objects.
nlp=spacy.load('en_core_web_sm')
# The matcher is created from the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

## lemma_matcher_example.py
# Match on the lemma so that different inflections of "run" are found.
run_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": "run"}]
run_matcher.add("RUN", None, pattern)

doc = nlp("Only when it dawned on him that he had nowhere left to run to, he finally stopped running.")
matches = run_matcher(doc)
# Print the token range and text of every match.
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)

# Output
# 12 13 run
# 18 19 running

## load_data.py
# Read the articles from data.json. The encoding is given explicitly so
# the file is decoded as UTF-8 regardless of the platform's default locale
# encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    news_articles = json.load(f)

# Show the first article as a quick check of the loaded data.
print(news_articles[0])

## multiple_values_example.py
# "IN" lets one token slot accept any value from a list: here a verb whose
# lemma is acquire/buy/purchase, followed by one or more nouns or
# adjectives.
buy_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": {"IN": ["acquire", "buy", 'purchase']}},
           {"POS": {"IN": ["NOUN", "ADJ"]}, 'OP': '+'}]
buy_matcher.add("BUY", None, pattern)

doc = nlp("While his friends were buying things they didn't need, Charlie was busy acquiring productive assets.")
matches = buy_matcher(doc)
# Print the token range and text of every match.
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)

# Output (the '+' operator reports both the shorter and the longer match
# starting at token 14)
# 4 6 buying things
# 14 16 acquiring productive
# 14 17 acquiring productive assets

## pos_tag_example.py
# Print each token with its coarse part-of-speech tag; the text is padded
# to 15 characters and the tag is left-aligned in 10.
doc = nlp("BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021.")
for token in doc:
  print(f"{token.text:15}, {token.pos_:<10}")

## print_titles.py
# List the title of every loaded article, one per line.
print("\n".join(art['title'] for art in news_articles))

## regex_matcher_example.py
# Apply a regular expression to the lower-cased token text so that both
# spellings are found.
regex_matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": {"REGEX": "colou?r"}}]
# NOTE(review): the key "BUY" looks carried over from another example; it
# is only a label and is not printed below.
regex_matcher.add("BUY", None, pattern)

doc = nlp("Color is the spelling used in the United States. Colour is used in other English-speaking countries.")
matches = regex_matcher(doc)
# Print the token range and text of every match.
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)

# Output
# 0 1 Color
# 10 11 Colour

## test_org_ticker.py
# Try the organisation and ticker extractors on a sample headline.
doc = nlp("BlackRock Energy and Resources Trust (BGR) Ex-Dividend Date Scheduled for November 12, 2021")
print(get_org(doc))
print(get_ticker(doc))

# Output
# BlackRock Energy and Resources Trust
# NYSE
# NOTE(review): "NYSE" does not occur in the headline above, and get_ticker
# returns a span of the headline, so the second line was presumably
# recorded from a different input -- re-run and confirm.

## test_summary_methods.py
# Try the summary extractors on a sample article summary.
doc = nlp("BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021. A cash dividend payment of $0.1 per share is scheduled to be paid on November 30, 2021. Shareholders who purchased BIGZ prior to the ex-dividend date are eligible for the cash dividend payment. This marks the 6th quarter that BIGZ has paid the same dividend. At the current stock price of $17.99, the dividend yield is 6.67%.")

print(get_amount_summary(doc))
print(get_pay_date(doc))

# Output: the per-share amount, then the pay date
# $0.1
# November 30, 2021
	def dividend_info(article):
	  """Print the dividend details that can be extracted from one article.

	  A headline containing the token "date" is handled as an ex-dividend
	  announcement: the date is read from the headline, the per-share amount
	  and the pay date from the summary. Any other headline is searched for
	  a "US$<amount>" figure instead. Nothing is printed when the date
	  (first case) or the amount (second case) cannot be found.

	  Args:
	    article: mapping with a 'title' entry; a 'summary' entry is also
	      read for ex-dividend headlines.
	  """
	  headline = nlp(article['title'])
	  if any(token.text.lower() == 'date' for token in headline):
	    date = get_date(headline)
	    if date:
	      ticker = get_ticker(headline)
	      # Parse the summary once and hand the same doc to both extractors
	      # instead of running the whole pipeline twice.
	      summary = nlp(article['summary'])
	      amount = get_amount_summary(summary)
	      pay_date = get_pay_date(summary)
	      print("HEADLINE: " + article['title'])
	      print(f"\nTICKER: {ticker}\nDATE: {date}\nAMOUNT: {amount} per share to be paid on {pay_date}\n")
	  else:
	    dividend = get_amount_headline(headline)
	    if dividend:
	      ticker = get_ticker(headline)
	      print("NEWS HEADLINE: " + article['title'])
	      print(f"\nTICKER: {ticker}\nAMOUNT: {dividend}\n")
	# Run the extraction over every loaded article, pausing 0.2 s between
	# articles. The loop body is indented again so the loop is valid.
	for article in news_articles:
	  dividend_info(article)
	  time.sleep(0.2)
	def get_date(doc):
	  """Return the first date-like span found in ``doc``.

	  The pattern is a proper noun, a number-like token, a comma and another
	  number-like token, e.g. "November 12, 2021".

	  Args:
	    doc: spaCy Doc to search (the pipeline passes the parsed headline).

	  Returns:
	    The matching span of ``doc``, or False when nothing matches.
	  """
	  date_matcher = Matcher(nlp.vocab)
	  pattern = [{"POS": "PROPN"}, {"LIKE_NUM": True},
	             {"text": ","}, {"LIKE_NUM": True}]
	  date_matcher.add("EX_DATE", None, pattern)
	  # Run the matcher once and reuse the result rather than matching twice.
	  matches = date_matcher(doc)
	  if not matches:
	    return False
	  _, start, end = matches[0]
	  return doc[start:end]
	def get_amount_headline(doc):
	  """Return the first "US$<number>" span found in ``doc``.

	  Args:
	    doc: spaCy Doc to search (the pipeline passes the parsed headline).

	  Returns:
	    The span made of the "US$" token and the number-like token after
	    it, or False when the headline has no such pair.
	  """
	  dividend_matcher = Matcher(nlp.vocab)
	  pattern = [{"ORTH": "US$"}, {"LIKE_NUM": True}]
	  dividend_matcher.add("USD", None, pattern)
	  # Run the matcher once and reuse the result rather than matching twice.
	  matches = dividend_matcher(doc)
	  if not matches:
	    return False
	  _, start, end = matches[0]
	  return doc[start:end]

	# Example: extract the dividend amount from a headline that quotes it
	# as "US$<number>".
	doc = nlp("There's A Lot To Like About ConnectOne Bancorp's (NASDAQ:CNOB) Upcoming US$0.13 Dividend")
	print(get_amount_headline(doc))

	# Output recorded for the example above:
	# US$0.13
	def get_org(doc):
	  """Return the organisation name that precedes an opening parenthesis.

	  The pattern is one or more proper nouns, optionally a conjunction and
	  further proper nouns, an optional possessive marker, and then "(".
	  When several candidates match, the one covering the most tokens wins
	  (the earliest such candidate on a tie).

	  Args:
	    doc: spaCy Doc to search.

	  Returns:
	    The matched span without its final token, or the string
	    "<doc text> -> NO MATCH FOUND" when nothing matches.
	  """
	  org_matcher = Matcher(nlp.vocab)
	  pattern = [{'POS': 'PROPN', 'OP': '+'},
	             {'POS': 'CCONJ', 'OP': '?'},
	             {'POS': 'PROPN', 'OP': '*'},
	             {'ORTH': '\'', 'OP': '?'},
	             {'ORTH': '\'s', 'OP': '?'},
	             {'ORTH': '(', 'OP': '+'}]
	  org_matcher.add("ORG", None, pattern)
	  matches = org_matcher(doc)
	  if not matches:
	    return f"{doc.text} -> NO MATCH FOUND"
	  # max() keeps the first of equally long candidates, which also covers
	  # the single-candidate case.
	  _, start, end = max(matches, key=lambda m: m[2] - m[1])
	  # This copy stopped after choosing the match and never returned it.
	  # Return the span without its last token (the pattern ends on "(").
	  return doc[start:end - 1]
	def get_pay_date(doc):
	  """Return the date that follows "paid on" in ``doc``.

	  The pattern is "paid", "on", a proper noun, a number-like token, a
	  comma, another number-like token and a closing ".".

	  Args:
	    doc: spaCy Doc to search (the pipeline passes the parsed summary).

	  Returns:
	    The span between "paid on" and the final ".", or False when the
	    phrase is not present (the same convention get_date uses).
	  """
	  pay_date_matcher = Matcher(nlp.vocab)
	  pattern = [{"ORTH": "paid"}, {"ORTH": "on"},
	             {"POS": "PROPN"}, {"LIKE_NUM": True},
	             {"ORTH": ","}, {"LIKE_NUM": True},
	             {"ORTH": "."}]
	  pay_date_matcher.add("AMOUNT", None, pattern)
	  matches = pay_date_matcher(doc)
	  if not matches:
	    # Indexing an empty result used to raise IndexError and abort the
	    # whole run on the first summary without a pay date.
	    return False
	  _, start, end = matches[0]
	  # Skip the two leading tokens ("paid on") and the trailing ".".
	  return doc[start + 2:end - 1]
	def get_amount_summary(doc):
	  """Return the "$<number>" part of a "$<number> per share" phrase.

	  Args:
	    doc: spaCy Doc to search (the pipeline passes the parsed summary).

	  Returns:
	    The span holding the "$" token and the number-like token, or False
	    when the phrase is not present (the same convention get_date uses).
	  """
	  per_share_matcher = Matcher(nlp.vocab)
	  pattern = [{"ORTH": "$"}, {"LIKE_NUM": True},
	             {"LOWER": "per"}, {"LOWER": "share"}]
	  per_share_matcher.add("AMOUNT", None, pattern)
	  matches = per_share_matcher(doc)
	  if not matches:
	    # Indexing an empty result used to raise IndexError and abort the
	    # whole run on the first summary without a per-share amount.
	    return False
	  _, start, end = matches[0]
	  # Drop the two trailing tokens ("per share").
	  return doc[start:end - 2]
	def get_ticker(doc):
	  """Return the first parenthesised ticker expression found in ``doc``.

	  The pattern is "(", an alphabetic token, any number of ":" tokens, any
	  number of alphabetic tokens, and ")".

	  Args:
	    doc: spaCy Doc to search.

	  Returns:
	    The span of the first match, from "(" through ")" inclusive, or the
	    string "<doc text> -> NO MATCH FOUND" when nothing matches.
	  """
	  org_matcher = Matcher(nlp.vocab)
	  pattern = [{'ORTH': '('}, {'IS_ALPHA': True},
	             {'ORTH': ':', 'OP': '*'},
	             {'IS_ALPHA': True, 'OP': '*'},
	             {'ORTH': ')'}]
	  org_matcher.add("ORG", None, pattern)
	  match = org_matcher(doc)
	  if not match:
	    return f"{doc.text} -> NO MATCH FOUND"
	  _, start, end = match[0]
	  return doc[start:end]
	# Register the pattern under the key "HELLO_WORLD"; no on-match callback
	# is supplied (second argument is None).
	matcher.add("HELLO_WORLD", None, pattern)

	# Build a Doc from the text to be searched and run the matcher over it.
	doc = nlp("hello world!\nHello World.")
	matches = matcher(doc)

	# Each match is a (match_id, start, end) triple of token indices. The
	# loop body is indented again so the loop is valid.
	for match_id, start, end in matches:
	    span = doc[start:end]  # The matched span
	    print(match_id, start, end, span.text)

	# Output
	# 2008415248711360438 0 3 hello world!
	# 2008415248711360438 4 7 Hello World.