Skip to content

Instantly share code, notes, and snippets.

@NewscatcherAPI
Last active July 26, 2023 01:13
Show Gist options
  • Save NewscatcherAPI/501ba40acebd17bf2660f8919f50550e to your computer and use it in GitHub Desktop.
def is_company_acquisition(headline_doc):
    """Return True when *headline_doc* reads like a company acquisition.

    Two cheap filters: some token must lemmatize to 'acquire' (the lemma
    also matches 'acquires'/'acquired'/'acquiring'), and the headline
    must mention at least two ORG entities (acquirer plus target).
    """
    # BUG FIX: the original looped over tokens, rebuilt the full lemma
    # list on every iteration (O(n^2)) and effectively decided on the
    # first token; it also returned True for an empty doc.
    if not any(token.lemma_ == 'acquire' for token in headline_doc):
        return False
    # Need at least two organizations: one acquirer and one acquired.
    org_count = sum(1 for ent in headline_doc.ents if ent.label_ == 'ORG')
    return org_count >= 2
def detect_acquisition(headline_doc):
    """Print "<headline> --> X acquires Y" when both sides are found,
    otherwise a no-detection notice; prints nothing when the headline
    fails the is_company_acquisition() pre-filter (as the original did).
    """
    if is_company_acquisition(headline_doc):
        # BUG FIX: call each finder once (the original re-ran them up to
        # three times) and treat an empty list the same as None — the
        # old `t is not None` check let `[]` count as a match.
        acquirer = find_acquirer(headline_doc)
        acquired = find_acquired(headline_doc)
        if acquirer and acquired:
            print(str(headline_doc) + " --> " + str(acquirer) + " acquires " + str(acquired))
        else:
            print(str(headline_doc) + " --> " + "no acquisition detected")
# Parse one example headline and serve an interactive dependency-parse
# visualization (displacy blocks here until the server is stopped).
doc = nlp("PayPal decided to acquire Paidy Inc. and Microsoft")
spacy.displacy.serve(doc, style= "dep")
def find_acquired(headline_doc):
    """Return noun chunks naming the acquired company/companies.

    A target is an ORG token that is the direct object (or attribute)
    of the lemma 'acquire'; ORG tokens conjoined to a token's right
    ("Neustar and Newstar") are collected too. Returns [] when nothing
    matches (this variant never returns None).
    """
    acquired_list = []

    def _add_chunk_of(tok):
        # Map the token to its enclosing noun chunk, skipping duplicates.
        # BUG FIX: the original iterated the module-level `doc` here
        # instead of the `headline_doc` argument.
        for noun_chunk in headline_doc.noun_chunks:
            if tok in noun_chunk and noun_chunk not in acquired_list:
                acquired_list.append(noun_chunk)

    for token in headline_doc:
        if (token.ent_type_ == 'ORG') and (token.head.lemma_ == 'acquire') and (token.dep_ in ('attr', 'dobj')):
            _add_chunk_of(token)
        else:
            # Conjoined ORGs hanging off this token, e.g. "X and Y".
            for tright in token.rights:
                if (tright.ent_type_ == 'ORG') and (tright.dep_ == 'conj'):
                    _add_chunk_of(tright)
    return acquired_list
def find_acquirer(headline_doc):
    """Return noun chunks naming the acquiring company/companies, or None.

    An acquirer is an ORG token serving as the subject (or root) of the
    headline; ORG tokens conjoined to a token's right are collected too.
    Returns None (not []) when nothing is found, as the original did.
    """
    acquirer_list = []

    def _add_chunk_of(tok):
        # Map the token to its enclosing noun chunk, skipping duplicates.
        # BUG FIX: the original iterated the module-level `doc` here
        # instead of the `headline_doc` argument.
        for noun_chunk in headline_doc.noun_chunks:
            if tok in noun_chunk and noun_chunk not in acquirer_list:
                acquirer_list.append(noun_chunk)

    for token in headline_doc:
        if (token.ent_type_ == 'ORG') and (token.dep_ in ('nsubj', 'ROOT')):
            _add_chunk_of(token)
        else:
            # Conjoined ORGs hanging off this token, e.g. "X and Y".
            for tright in token.rights:
                if (tright.ent_type_ == 'ORG') and (tright.dep_ == 'conj'):
                    _add_chunk_of(tright)
    if acquirer_list:
        return acquirer_list
import spacy
# Large English pipeline — used for both NER (ORG entities) and the
# dependency parse the finder functions rely on.
nlp = spacy.load('en_core_web_lg')
# 2. preset headlines
# A mix of genuine company acquisitions and trap headlines (sports
# trades, real estate, "acquire insurance") to exercise the detector.
headlines = ["PayPal to acquire Paidy Inc.",
"Pump solutions and water technology company Grundfos has entered into an agreement to acquire Mechanical Equipment Company (MECO) for an undisclosed sum.",
"Cavaliers acquire Lauri Markkanen",
"Mastercard to acquire CipherTrace",
"TransUnion to acquire Neustar and Newstar",
"Microsoft enters into agreement to acquire Myhotelshop to help hotels optimize guest acquisition",
"CDK to help car buyers acquire insurance",
"Tyber Medical reaches agreement to acquire CatapultMD",
"Rebels acquire forward Stevenson from Blades",
"Google to acquire 1.3M-SF Manhattan office for $2.1B",
"PayPal decided to acquire Paidy Inc. and Microsoft",
]
docs = [nlp(headline) for headline in headlines]
# Print the (text, label) entity pairs NER finds in each headline.
for doc in docs:
    print([(ent.text, ent.label_) for ent in doc.ents])
def is_company_acquisition(headline_doc):
    """Return True when *headline_doc* reads like a company acquisition.

    Requires some token to lemmatize to 'acquire' (lemma also matches
    'acquires'/'acquired'/'acquiring') and at least two ORG entities
    (an acquirer plus a target).
    """
    # BUG FIX: the original looped over tokens and rebuilt the lemma
    # list on every iteration (O(n^2)), deciding on the first token;
    # it also returned True for an empty doc.
    if not any(token.lemma_ == 'acquire' for token in headline_doc):
        return False
    # check that at least 2 ORG entities are present
    org_count = sum(1 for ent in headline_doc.ents if ent.label_ == 'ORG')
    return org_count >= 2
# 4. find the dependency
# 4.1 the acquired company should sit in an object dependency of "acquire"
def find_acquired(headline_doc):
    """Return noun chunks naming the acquired company/companies, or None.

    A target is an ORG token that is the direct object (or attribute)
    of the lemma 'acquire'; ORG tokens conjoined to a token's right
    ("Neustar and Newstar") are collected too. Returns None (not [])
    when nothing matches, as the original did.
    """
    acquired_list = []

    def _add_chunk_of(tok):
        # Map the token to its enclosing noun chunk, skipping duplicates.
        # BUG FIX: the original iterated the module-level `doc` here
        # instead of the `headline_doc` argument.
        for noun_chunk in headline_doc.noun_chunks:
            if tok in noun_chunk and noun_chunk not in acquired_list:
                acquired_list.append(noun_chunk)

    for token in headline_doc:
        if (token.ent_type_ == 'ORG') and (token.head.lemma_ == 'acquire') and (token.dep_ in ('attr', 'dobj')):
            _add_chunk_of(token)
        else:
            # Conjoined ORGs hanging off this token, e.g. "X and Y".
            for tright in token.rights:
                if (tright.ent_type_ == 'ORG') and (tright.dep_ == 'conj'):
                    _add_chunk_of(tright)
    if acquired_list:
        return acquired_list
# 5. find the acquirer
def find_acquirer(headline_doc):
    """Return noun chunks naming the acquiring company/companies, or None.

    An acquirer is an ORG token serving as the subject (or root) of the
    headline; ORG tokens conjoined to a token's right are collected too.
    Returns None (not []) when nothing is found, as the original did.
    """
    acquirer_list = []

    def _add_chunk_of(tok):
        # Map the token to its enclosing noun chunk, skipping duplicates.
        # BUG FIX: the original iterated the module-level `doc` here
        # instead of the `headline_doc` argument.
        for noun_chunk in headline_doc.noun_chunks:
            if tok in noun_chunk and noun_chunk not in acquirer_list:
                acquirer_list.append(noun_chunk)

    for token in headline_doc:
        if (token.ent_type_ == 'ORG') and (token.dep_ in ('nsubj', 'ROOT')):
            _add_chunk_of(token)
        else:
            # Conjoined ORGs hanging off this token, e.g. "X and Y".
            for tright in token.rights:
                if (tright.ent_type_ == 'ORG') and (tright.dep_ == 'conj'):
                    _add_chunk_of(tright)
    if acquirer_list:
        return acquirer_list
# 6. combine the pieces to report who is acquiring whom
def detect_acquisition(headline_doc):
    """Print "<headline> --> X acquires Y" when both sides are found,
    otherwise a no-detection notice; prints nothing when the headline
    fails the is_company_acquisition() pre-filter (as the original did).
    """
    if is_company_acquisition(headline_doc):
        # BUG FIX: call each finder once (the original re-ran them up to
        # three times) and treat an empty list the same as None — the
        # old `t is not None` check let `[]` count as a match.
        acquirer = find_acquirer(headline_doc)
        acquired = find_acquired(headline_doc)
        if acquirer and acquired:
            print(str(headline_doc) + " --> " + str(acquirer) + " acquires " + str(acquired))
        else:
            print(str(headline_doc) + " --> " + "no acquisition detected")
# showcase
# Run the full detector over each preset headline and print its verdict.
for headline in headlines:
    doc = nlp(headline)
    detect_acquisition(doc)
# Same preset headlines as above (gist cell repeated in the scrape):
# genuine acquisitions plus trap headlines for the detector.
headlines = ["PayPal to acquire Paidy Inc.",
"Pump solutions and water technology company Grundfos has entered into an agreement to acquire Mechanical Equipment Company (MECO) for an undisclosed sum.",
"Cavaliers acquire Lauri Markkanen",
"Mastercard to acquire CipherTrace",
"TransUnion to acquire Neustar and Newstar",
"Microsoft enters into agreement to acquire Myhotelshop to help hotels optimize guest acquisition",
"CDK to help car buyers acquire insurance",
"Tyber Medical reaches agreement to acquire CatapultMD",
"Rebels acquire forward Stevenson from Blades",
"Google to acquire 1.3M-SF Manhattan office for $2.1B",
"PayPal decided to acquire Paidy Inc. and Microsoft",
]
docs = [nlp(headline) for headline in headlines]
# Print the (text, label) entity pairs NER finds in each headline.
for doc in docs:
    print([(ent.text, ent.label_) for ent in doc.ents])
from newscatcherapi import NewsCatcherApiClient
# NOTE(review): placeholder credential — substitute a real API key.
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY')
# Fetch up to 100 English articles from the last 24 hours whose title
# contains "acquire", restricted to two PR-wire sources.
acquisition_articles = newscatcherapi.get_search(q='acquire',
search_in = 'title',
lang='en',
from_='24 hours ago',
sources='prnewswire.com, businesswire.com',
page_size=100,
page=1)
# Show only the titles of the returned articles.
for article in acquisition_articles['articles']:
    print(article['title'])
import spacy
import time
# Reload the large English pipeline (self-contained gist cell).
nlp = spacy.load('en_core_web_lg')
# Run the detector over the headlines, pausing half a second between
# iterations.
for headline in headlines:
    doc = nlp(headline)
    detect_acquisition(doc)
    time.sleep(0.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment