Last active
December 16, 2021 16:32
-
-
Save Aditya1001001/f8a6fdd1fc6b8a10e763303090931a7e to your computer and use it in GitHub Desktop.
Mining Financial Stock News Using SpaCy Matcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dividend_info(article):
    """Print dividend details extracted from one news article.

    Args:
        article: mapping with a ``'title'`` key; ``'summary'`` is also read
            when the headline contains the word "date".

    The headline is parsed with ``nlp``. If any headline token equals
    "date" (case-insensitively) and ``get_date`` finds a date, the ticker,
    date, per-share amount and pay date are printed. Otherwise, if
    ``get_amount_headline`` finds an amount in the headline, the ticker and
    that amount are printed. Nothing is returned.
    """
    # NOTE(review): the scraped source lost its indentation; the ``else``
    # below is assumed to belong to the outer "date" check — confirm.
    headline = nlp(article['title'])
    if any(token.lower_ == 'date' for token in headline):
        date = get_date(headline)
        if date:
            ticker = get_ticker(headline)
            # Parse the summary once and reuse the Doc for both extractors.
            summary = nlp(article['summary'])
            amount = get_amount_summary(summary)
            pay_date = get_pay_date(summary)
            print("HEADLINE: " + article['title'])
            print(
                f"\nTICKER: {ticker}"
                f"\nDATE: {date}"
                f"\nAMOUNT: {amount} per share to be paid on {pay_date}\n"
            )
    else:
        dividend = get_amount_headline(headline)
        if dividend:
            ticker = get_ticker(headline)
            print("NEWS HEADLINE: " + article['title'])
            print(f"\nTICKER: {ticker}\nAMOUNT: {dividend}\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run the extractor over every loaded article, pausing 0.2 s between items.
for article in news_articles:
    dividend_info(article)
    time.sleep(0.2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_date(doc):
    """Return the first date-like span in *doc*, or ``False`` if none.

    The pattern is four consecutive tokens: a proper noun, a number-like
    token, a comma, and another number-like token
    (e.g. "November 12, 2021").

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The matched ``Span``, or ``False`` when nothing matches.
    """
    date_matcher = Matcher(nlp.vocab)
    pattern = [
        {"POS": "PROPN"},
        {"LIKE_NUM": True},
        {"ORTH": ","},
        {"LIKE_NUM": True},
    ]
    # spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
    # ``add(key, None, pattern)`` call raises on v3.
    date_matcher.add("EX_DATE", [pattern])

    # Run the matcher once and reuse the result.
    matches = date_matcher(doc)
    if not matches:
        return False
    _, start, end = matches[0]
    return doc[start:end]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_amount_headline(doc):
    """Return the first ``US$<number>`` span in *doc*, or ``False`` if none.

    The pattern is the literal token "US$" followed by a number-like token.

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The matched ``Span``, or ``False`` when nothing matches.
    """
    dividend_matcher = Matcher(nlp.vocab)
    pattern = [{"ORTH": "US$"}, {"LIKE_NUM": True}]
    # spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
    # ``add(key, None, pattern)`` call raises on v3.
    dividend_matcher.add("USD", [pattern])

    # Run the matcher once and reuse the result.
    matches = dividend_matcher(doc)
    if not matches:
        return False
    _, start, end = matches[0]
    return doc[start:end]
# Example: pull the dividend amount out of a headline.
doc = nlp(
    "There's A Lot To Like About ConnectOne Bancorp's (NASDAQ:CNOB) "
    "Upcoming US$0.13 Dividend"
)
print(get_amount_headline(doc))
# OUTPUT
# US$0.13
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_org(doc):
    """Return the span preceding an opening parenthesis that names an org.

    The pattern is: one or more proper nouns, an optional coordinating
    conjunction, zero or more proper nouns, an optional "'" and/or "'s",
    then one or more "(" tokens. When several spans match, the longest is
    used (the first one found wins ties).

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The matched ``Span`` with its final token (a "(") removed, or the
        string ``"<doc text> -> NO MATCH FOUND"`` when nothing matches.
    """
    org_matcher = Matcher(nlp.vocab)
    pattern = [
        {'POS': 'PROPN', 'OP': '+'},
        {'POS': 'CCONJ', 'OP': '?'},
        {'POS': 'PROPN', 'OP': '*'},
        {'ORTH': '\'', 'OP': '?'},
        {'ORTH': '\'s', 'OP': '?'},
        {'ORTH': '(', 'OP': '+'},
    ]
    # spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
    # ``add(key, None, pattern)`` call raises on v3.
    org_matcher.add("ORG", [pattern])

    matches = org_matcher(doc)
    if not matches:
        return f"{doc.text} -> NO MATCH FOUND"

    # Each match is (match_id, start, end); keep the longest span.
    _, start, end = max(matches, key=lambda m: m[2] - m[1])
    # Drop the last token of the match, i.e. the trailing "(".
    return doc[start:end - 1]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_pay_date(doc):
    """Return the date following "paid on" in *doc*, or ``False`` if none.

    The pattern is the tokens "paid", "on", a proper noun, a number-like
    token, ",", a number-like token and a closing "."
    (e.g. "paid on November 30, 2021.").

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The ``Span`` covering only the date part of the first match, or
        ``False`` when nothing matches.
    """
    pay_date_matcher = Matcher(nlp.vocab)
    pattern = [
        {"ORTH": "paid"},
        {"ORTH": "on"},
        {"POS": "PROPN"},
        {"LIKE_NUM": True},
        {"ORTH": ","},
        {"LIKE_NUM": True},
        {"ORTH": "."},
    ]
    # spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
    # ``add(key, None, pattern)`` call raises on v3.
    pay_date_matcher.add("PAY_DATE", [pattern])

    matches = pay_date_matcher(doc)
    if not matches:
        # Avoid an IndexError on summaries without a pay date; ``False``
        # mirrors what get_date returns on no match.
        return False
    _, start, end = matches[0]
    # Skip the leading "paid on" (2 tokens) and the trailing "." (1 token).
    return doc[start + 2:end - 1]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_amount_summary(doc):
    """Return the ``$<number>`` preceding "per share", or ``False`` if none.

    The pattern is "$", a number-like token, then "per" and "share"
    (case-insensitive).

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The ``Span`` covering only the amount of the first match, or
        ``False`` when nothing matches.
    """
    per_share_matcher = Matcher(nlp.vocab)
    pattern = [
        {"ORTH": "$"},
        {"LIKE_NUM": True},
        {"LOWER": "per"},
        {"LOWER": "share"},
    ]
    # spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
    # ``add(key, None, pattern)`` call raises on v3.
    per_share_matcher.add("AMOUNT", [pattern])

    matches = per_share_matcher(doc)
    if not matches:
        # Avoid an IndexError on summaries without a per-share amount.
        return False
    _, start, end = matches[0]
    # Drop the trailing "per share" (2 tokens), keeping only the amount.
    return doc[start:end - 2]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_ticker(doc):
    """Return the first parenthesised alphabetic group in *doc*.

    The pattern is "(", an alphabetic token, any number of ":" tokens, any
    number of alphabetic tokens, then ")" — e.g. "(BIGZ)" or
    "(NASDAQ:CNOB)".

    Args:
        doc: a spaCy ``Doc`` produced by ``nlp``.

    Returns:
        The matched ``Span`` (parentheses included), or the string
        ``"<doc text> -> NO MATCH FOUND"`` when nothing matches.
    """
    ticker_matcher = Matcher(nlp.vocab)
    pattern = [
        {'ORTH': '('},
        {'IS_ALPHA': True},
        {'ORTH': ':', 'OP': '*'},
        {'IS_ALPHA': True, 'OP': '*'},
        {'ORTH': ')'},
    ]
    # spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
    # ``add(key, None, pattern)`` call raises on v3.
    ticker_matcher.add("TICKER", [pattern])

    matches = ticker_matcher(doc)
    if not matches:
        return f"{doc.text} -> NO MATCH FOUND"
    _, start, end = matches[0]
    return doc[start:end]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add the pattern to the matcher. spaCy v3 (and v2.2.2+) expects a list of
# patterns; the legacy ``add(key, None, pattern)`` call raises on v3.
matcher.add("HELLO_WORLD", [pattern])
# Create a doc of the string to be 'queried'.
doc = nlp("hello world!\nHello World.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]  # The matched span
    print(match_id, start, end, span.text)
# Output
# 2008415248711360438 0 3 hello world!
# 2008415248711360438 4 7 Hello World.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Token pattern: "hello", then "world" (both case-insensitive), followed by
# one or more punctuation tokens.
pattern = [
    {"LOWER": "hello"},
    {"LOWER": "world"},
    {"IS_PUNCT": True, "OP": "+"},
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import time | |
import spacy | |
from spacy.matcher import Matcher |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
from spacy.matcher import Matcher | |
# Load the small English pipeline and create a matcher on its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match every token whose lemma is "run", whatever its inflected form.
run_matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA": "run"}]
# spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
# ``add(key, None, pattern)`` call raises on v3.
run_matcher.add("RUN", [pattern])
doc = nlp("Only when it dawned on him that he had nowhere left to run to, he finally stopped running.")
matches = run_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 12 13 run
# 18 19 running
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the scraped articles. JSON is read as UTF-8 explicitly so the result
# does not depend on the platform's default encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    news_articles = json.load(f)
print(news_articles[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match an acquire/buy/purchase verb (by lemma) followed by one or more
# nouns or adjectives.
buy_matcher = Matcher(nlp.vocab)
pattern = [
    {"LEMMA": {"IN": ["acquire", "buy", "purchase"]}},
    {"POS": {"IN": ["NOUN", "ADJ"]}, "OP": "+"},
]
# spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
# ``add(key, None, pattern)`` call raises on v3.
buy_matcher.add("BUY", [pattern])
doc = nlp("While his friends were buying things they didn't need, Charlie was busy acquiring productive assets.")
matches = buy_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 4 6 buying things
# 14 16 acquiring productive
# 14 17 acquiring productive assets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print each token next to its coarse part-of-speech tag, in fixed-width
# columns.
doc = nlp(
    "BlackRock Innovation and Growth Trust (BIGZ) will begin trading "
    "ex-dividend on November 12, 2021."
)
for token in doc:
    print("{:15}, {:<10}".format(token.text, token.pos_))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print every article title on its own line.
print(*(art['title'] for art in news_articles), sep="\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Match both spellings, "color" and "colour", case-insensitively.
regex_matcher = Matcher(nlp.vocab)
# REGEX uses search semantics, so anchor the expression to match the whole
# token rather than any token that merely contains "color"/"colour".
pattern = [{"LOWER": {"REGEX": "^colou?r$"}}]
# spaCy v3 (and v2.2.2+) expects a list of patterns; the legacy
# ``add(key, None, pattern)`` call raises on v3.
regex_matcher.add("COLOR", [pattern])
doc = nlp("Color is the spelling used in the United States. Colour is used in other English-speaking countries.")
matches = regex_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(start, end, span.text)
# Output
# 0 1 Color
# 10 11 Colour
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: extract the organisation and the ticker from a headline.
doc = nlp(
    "BlackRock Energy and Resources Trust (BGR) Ex-Dividend Date "
    "Scheduled for November 12, 2021"
)
for extract in (get_org, get_ticker):
    print(extract(doc))
# Output
# BlackRock Energy and Resources Trust
# (BGR)
# NOTE(review): this listing previously showed "NYSE", which does not occur
# in the headline; get_ticker returns the parenthesised span, so "(BGR)" is
# expected — confirm by running.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: extract the per-share amount and the pay date from a summary.
doc = nlp(
    "BlackRock Innovation and Growth Trust (BIGZ) will begin trading ex-dividend on November 12, 2021. "
    "A cash dividend payment of $0.1 per share is scheduled to be paid on November 30, 2021. "
    "Shareholders who purchased BIGZ prior to the ex-dividend date are eligible for the cash dividend payment. "
    "This marks the 6th quarter that BIGZ has paid the same dividend. "
    "At the current stock price of $17.99, the dividend yield is 6.67%."
)
for extract in (get_amount_summary, get_pay_date):
    print(extract(doc))
# Output
# $0.1
# November 30, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment