Skip to content

Instantly share code, notes, and snippets.

@Aditya1001001
Last active December 16, 2021 16:32
Show Gist options
  • Save Aditya1001001/f8a6fdd1fc6b8a10e763303090931a7e to your computer and use it in GitHub Desktop.
Save Aditya1001001/f8a6fdd1fc6b8a10e763303090931a7e to your computer and use it in GitHub Desktop.
Mining Financial Stock News Using SpaCy Matcher
def dividend_info(article):
    """Print the dividend details that can be mined from one news article.

    Headlines containing the word "date" are searched for a date with
    ``get_date``; when one is found, the per-share amount and pay date are
    pulled from the article summary. Other headlines are searched for a
    "US$" amount with ``get_amount_headline``.

    Args:
        article: Mapping that provides a 'title' string; 'summary' is read
            only when a date was found in the headline.

    Returns:
        None. Findings are written to stdout; nothing is printed when the
        headline yields neither a date nor an amount.
    """
    headline = nlp(article['title'])
    # NOTE(review): the source had lost its indentation; the `else` below is
    # assumed to belong to this outer check rather than to `if date:` --
    # confirm against the original.
    if any(token.text.lower() == 'date' for token in headline):
        date = get_date(headline)
        if date:
            ticker = get_ticker(headline)
            # Run the pipeline over the summary once and share the result
            # between both extractors instead of parsing it twice.
            summary = nlp(article['summary'])
            amount = get_amount_summary(summary)
            pay_date = get_pay_date(summary)
            print("HEADLINE: " + article['title'])
            print(f"\nTICKER: {ticker}\nDATE: {date}"
                  f"\nAMOUNT: {amount} per share to be paid on {pay_date}\n")
    else:
        dividend = get_amount_headline(headline)
        if dividend:
            ticker = get_ticker(headline)
            print("NEWS HEADLINE: " + article['title'])
            print(f"\nTICKER: {ticker}\nAMOUNT: {dividend}\n")
# Feed every loaded article through the extractor, pausing 0.2 s after each
# one.
for news_item in news_articles:
    dividend_info(news_item)
    time.sleep(0.2)
def get_date(doc):
    """Return the first date-shaped span found in *doc*.

    The pattern is: proper noun, number-like token, comma, number-like
    token -- e.g. "November 12, 2021".

    Args:
        doc: Document produced by the module-level ``nlp`` pipeline.

    Returns:
        The first matching span, or False when the pattern does not occur.
    """
    date_matcher = Matcher(nlp.vocab)
    pattern = [{"POS": "PROPN"}, {"LIKE_NUM": True},
               {"ORTH": ","}, {"LIKE_NUM": True}]
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old ``add(key, None, pattern)`` call is rejected.
    date_matcher.add("EX_DATE", [pattern])
    # Run the matcher once and reuse the result.
    matches = date_matcher(doc)
    if not matches:
        return False
    _, start, end = matches[0]
    return doc[start:end]
def get_amount_headline(doc):
    """Return the first "US$<number>" span found in *doc*.

    Args:
        doc: Document produced by the module-level ``nlp`` pipeline.

    Returns:
        The span covering the "US$" token and the number-like token after
        it, or False when the pattern does not occur.
    """
    dividend_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "US$"}, {"LIKE_NUM": True}]
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old ``add(key, None, pattern)`` call is rejected.
    dividend_matcher.add("USD", [pattern])
    # Run the matcher once and reuse the result.
    matches = dividend_matcher(doc)
    if not matches:
        return False
    _, start, end = matches[0]
    return doc[start:end]
# Try the headline amount extractor on a sample headline.
doc = nlp(
    "There's A Lot To Like About ConnectOne Bancorp's (NASDAQ:CNOB) "
    "Upcoming US$0.13 Dividend"
)
headline_amount = get_amount_headline(doc)
print(headline_amount)
# OUTPUT
# US$0.13
def get_org(doc):
    """Return the run of proper nouns that precedes an opening parenthesis.

    The pattern accepts one or more proper nouns, an optional coordinating
    conjunction, further proper nouns, an optional possessive marker, and
    then at least one "(" token.

    Args:
        doc: Document produced by the module-level ``nlp`` pipeline.

    Returns:
        The longest matching span with its final token (the "(") removed,
        or the string "<doc text> -> NO MATCH FOUND" when nothing matches.
    """
    org_matcher = Matcher(nlp.vocab)
    pattern = [{'POS': 'PROPN', 'OP': '+'},
               {'POS': 'CCONJ', 'OP': '?'},
               {'POS': 'PROPN', 'OP': '*'},
               {'ORTH': '\'', 'OP': '?'},
               {'ORTH': '\'s', 'OP': '?'},
               {'ORTH': '(', 'OP': '+'}]
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old ``add(key, None, pattern)`` call is rejected.
    org_matcher.add("ORG", [pattern])
    matches = org_matcher(doc)
    if not matches:
        return f"{doc.text} -> NO MATCH FOUND"
    # The operators can yield several overlapping matches; keep the longest
    # one (``max`` returns the first on ties, like the original loop did).
    _, start, end = max(matches, key=lambda m: m[2] - m[1])
    # Drop the last token so the parenthesis is not part of the name.
    return doc[start:end - 1]
def get_pay_date(doc):
    """Return the date that follows "paid on" in *doc*.

    The pattern is: "paid", "on", proper noun, number-like token, comma,
    number-like token, full stop -- e.g. "paid on November 30, 2021.".

    Args:
        doc: Document produced by the module-level ``nlp`` pipeline.

    Returns:
        The span covering only the date part of the first match, or False
        when the pattern does not occur (same convention as ``get_date``).
    """
    pay_date_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "paid"}, {"ORTH": "on"},
               {"POS": "PROPN"}, {"LIKE_NUM": True},
               {"ORTH": ","}, {"LIKE_NUM": True},
               {"ORTH": "."}]
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old ``add(key, None, pattern)`` call is rejected.
    pay_date_matcher.add("AMOUNT", [pattern])
    matches = pay_date_matcher(doc)
    if not matches:
        # Previously this path raised IndexError from ``[0]`` on an empty
        # match list.
        return False
    _, start, end = matches[0]
    # Skip the two leading tokens ("paid", "on") and the trailing ".".
    return doc[start + 2:end - 1]
def get_amount_summary(doc):
    """Return the "$<number>" that precedes "per share" in *doc*.

    Args:
        doc: Document produced by the module-level ``nlp`` pipeline.

    Returns:
        The span covering the "$" token and the number-like token of the
        first match, or False when the pattern does not occur (same
        convention as ``get_amount_headline``).
    """
    per_share_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "$"}, {"LIKE_NUM": True},
               {"LOWER": "per"}, {"LOWER": "share"}]
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old ``add(key, None, pattern)`` call is rejected.
    per_share_matcher.add("AMOUNT", [pattern])
    matches = per_share_matcher(doc)
    if not matches:
        # Previously this path raised IndexError from ``[0]`` on an empty
        # match list.
        return False
    _, start, end = matches[0]
    # Drop the two trailing tokens ("per", "share").
    return doc[start:end - 2]
def get_ticker(doc):
    """Return the first parenthesised alphabetic group found in *doc*.

    The pattern is: "(", an alphabetic token, any number of ":" tokens,
    any number of further alphabetic tokens, ")".

    Args:
        doc: Document produced by the module-level ``nlp`` pipeline.

    Returns:
        The span of the first match, parentheses included, or the string
        "<doc text> -> NO MATCH FOUND" when nothing matches.
    """
    ticker_matcher = Matcher(nlp.vocab)
    pattern = [{'ORTH': '('}, {'IS_ALPHA': True},
               {'ORTH': ':', 'OP': '*'},
               {'IS_ALPHA': True, 'OP': '*'},
               {'ORTH': ')'}]
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old ``add(key, None, pattern)`` call is rejected.
    ticker_matcher.add("ORG", [pattern])
    matches = ticker_matcher(doc)
    if not matches:
        return f"{doc.text} -> NO MATCH FOUND"
    _, start, end = matches[0]
    return doc[start:end]
# Add the pattern to the matcher. The list-of-patterns form is accepted by
# spaCy >= 2.2.2 and required by v3, where ``add(key, None, pattern)`` is
# rejected.
matcher.add("HELLO_WORLD", [pattern])
# Create a doc of the string to be 'queried'
doc = nlp("hello world!\nHello World.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]  # The matched span
    print(match_id, start, end, span.text)
# Output
# 2008415248711360438 0 3 hello world!
# 2008415248711360438 4 7 Hello World.
# Token pattern: lowercase form "hello", lowercase form "world", then one or
# more punctuation tokens.
pattern = [
    {"LOWER": "hello"},
    {"LOWER": "world"},
    {"IS_PUNCT": True, "OP": "+"},
]
import json
import time
import spacy
from spacy.matcher import Matcher
import spacy
from spacy.matcher import Matcher
# Load the 'en_core_web_sm' pipeline once; the matchers in this file are all
# built on its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Match on the lemma, so different inflections of "run" are all found.
run_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": "run"}]
# List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
# where the old ``add(key, None, pattern)`` call is rejected.
run_matcher.add("RUN", [pattern])
doc = nlp("Only when it dawned on him that he had nowhere left to run to, he finally stopped running.")
matches = run_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 12 13 run
# 18 19 running
# Load the news articles. Decode the JSON file explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    news_articles = json.load(f)
# Show the first article to inspect its structure.
print(news_articles[0])
# A buy-type verb (matched by lemma) followed by one or more nouns or
# adjectives.
buy_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": {"IN": ["acquire", "buy", 'purchase']}},
           {"POS": {"IN": ["NOUN", "ADJ"]}, 'OP': '+'}]
# List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
# where the old ``add(key, None, pattern)`` call is rejected.
buy_matcher.add("BUY", [pattern])
doc = nlp("While his friends were buying things they didn't need, Charlie was busy acquiring productive assets.")
matches = buy_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 4 6 buying things
# 14 16 acquiring productive
# 14 17 acquiring productive assets
# Print each token next to its part-of-speech tag to see what the pipeline
# assigns to a sample sentence.
doc = nlp("BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021.")
for token in doc:
    # Text padded to 15 columns, tag left-aligned in 10.
    row = f"{token.text:15}, {token.pos_:<10}"
    print(row)
# List every article headline, one per line.
titles = [art['title'] for art in news_articles]
print("\n".join(titles))
# Match both spellings, "color" and "colour", via a regex on the lowercase
# token text.
regex_matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": {"REGEX": "colou?r"}}]
# List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
# where the old ``add(key, None, pattern)`` call is rejected.
regex_matcher.add("BUY", [pattern])
doc = nlp("Color is the spelling used in the United States. Colour is used in other English-speaking countries.")
matches = regex_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 0 1 Color
# 10 11 Colour
# Try the organisation and ticker extractors on a sample headline.
doc = nlp("BlackRock Energy and Resources Trust (BGR) Ex-Dividend Date Scheduled for November 12, 2021")
print(get_org(doc))
print(get_ticker(doc))
# Output
# BlackRock Energy and Resources Trust
# (BGR)
# (get_ticker returns the span including both parentheses; "NYSE" does not
# occur in this headline, so it cannot be the result.)
# Try the summary extractors on a sample article summary.
doc = nlp("BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021. A cash dividend payment of $0.1 per share is scheduled to be paid on November 30, 2021. Shareholders who purchased BIGZ prior to the ex-dividend date are eligible for the cash dividend payment. This marks the 6th quarter that BIGZ has paid the same dividend. At the current stock price of $17.99, the dividend yield is 6.67%.")
per_share_amount = get_amount_summary(doc)
print(per_share_amount)
payment_date = get_pay_date(doc)
print(payment_date)
# Output
# $0.1
# November 30, 2021
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment