Name Matching: News Data Annotation Using spaCy PhraseMatcher
# register company names and ticker symbols as two separate match rules
patterns = [nlp.make_doc(name) for name in names]
matcher.add("COMPANY", patterns)
patterns = [nlp.make_doc(symbol) for symbol in data['Symbol']]
matcher.add("SYMBOL", patterns)
from cleanco import basename
# apply basename twice, since some names carry more than one legal suffix
combined_list['Cleaned Name'] = combined_list['Company Name'].apply(basename)
combined_list['Cleaned Name'] = combined_list['Cleaned Name'].apply(basename)
names = pd.concat([combined_list['Company Name'], combined_list['Cleaned Name']], ignore_index=True).drop_duplicates()
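As a quick illustration of what cleanco strips (the sample strings and expected outputs are my own):

from cleanco import basename
print(basename("Microsoft Corporation"))     # -> "Microsoft"
print(basename("Activision Blizzard, Inc"))  # -> "Activision Blizzard"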
# merge the NASDAQ and S&P 500 listings, dropping duplicate rows
combined_list = pd.concat([data, SP500], ignore_index=True).drop_duplicates()
# map names that were over-truncated during cleaning back to usable forms
name_corrections = {"A": "A-Mark", "Federal": "Federal-Mogul",
                    "Global": "Global-Tech Advanced Innovations",
                    "G": "G-III Apparel", "Heritage": "Heritage Crystal Clean",
                    "II": "II-VI", "Mid": "Microchip Technology",
                    "Pro": "Pro-Dex", "Perma": "Perma-Fix Environmental Services",
                    "Park": "Park-Ohio Holdings", "Bio": "Bio-Techne",
                    "ROBO": "ROBO Global Robotics and Automation Index ETF",
                    "United": "United-Guardian", "Uni": "Uni-Pixel",
                    "Popular": "Banco Popular", "News": "News Corp",
                    }
names = [name_corrections.get(name, name) for name in names]
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
from spacy import displacy
# displacy options
colors = {"COMPANY": "#F67DE3", "SYMBOL": "#7DF6D9"}
options = {"colors": colors}
plot_data = {
    "text": doc.text,
    "ents": [],
    "title": None
}
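For reference, displacy's manual entity format uses character offsets into "text"; a hand-filled example (offsets are valid for this sample string only):

example = {
    "text": "Tesla (TSLA) rallied.",
    "ents": [{"start": 0, "end": 5, "label": "COMPANY"},
             {"start": 7, "end": 11, "label": "SYMBOL"}],
    "title": None,
}
displacy.render(example, style="ent", options=options, manual=True, jupyter=True)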
matches_with_dup = {"COMPANY": {}, "SYMBOL": {}}
for match_id, span_start, span_end in matches:
    rule_id = nlp.vocab.strings[match_id]
    span = doc[span_start:span_end]
    # use the span's own character offsets; searching doc.text with index()
    # would return the first occurrence, not this match's position
    matches_with_dup[rule_id][span.text] = {"start": span.start_char,
                                            "end": span.end_char,
                                            "label": rule_id}
# substring names will appear multiple times but the expanded
# names will appear only once
for ent_type in matches_with_dup.keys():
    matches = matches_with_dup[ent_type]
    keys = matches.keys()
    counts = {text: 0 for text in keys}
    for text in keys:
        for key in keys:
            if text in key:
                counts[text] += 1
    for text, count in counts.items():
        if count == 1:
            plot_data['ents'].append(matches[text])
# sort the matches by start index
plot_data['ents'] = sorted(plot_data['ents'], key=lambda ent: ent["start"])
displacy.render(plot_data, style="ent", options=options, manual=True, jupyter=True)
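spaCy also ships spacy.util.filter_spans, which resolves positional overlaps by keeping the longest span; a sketch of the same dedup idea using it (assuming matches still holds the raw PhraseMatcher output):

from spacy.util import filter_spans

spans = [doc[start:end] for _, start, end in matches]
for span in filter_spans(spans):
    print(span.start_char, span.end_char, span.text)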
# List of Patterns To Match For
phrases = ["Sergio Mattarella", "Mario Draghi", "president", "prime minister"]
# Create Doc Objects For The Phrases
patterns = [nlp(text) for text in phrases]
matcher.add("PatternList", patterns)
!wget https://datahub.io/core/nasdaq-listings/r/nasdaq-listed-symbols.csv
import requests                # library to handle requests
from bs4 import BeautifulSoup  # library to parse HTML documents
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': "wikitable"})
# read_html returns a list of DataFrames; the first is the constituents table
df = pd.read_html(str(table))[0]
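A quick sanity check (my own addition) that the scraped table has the columns selected later:

print(df.columns.tolist())  # should include 'Symbol' and 'Security'
print(df.head(3))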
pip install cleanco requests beautifulsoup4
import pandas as pd
data = pd.read_csv("nasdaq-listed-symbols.csv")
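Another quick look (my own addition); the pattern-building code assumes this file has 'Symbol' and 'Company Name' columns:

print(data.columns.tolist())
print(data.head(3))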
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
name = ["Sergio Mattarella", "Mario Draghi"]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(name) for name in names]
matcher.add("Names", patterns)
matches = matcher(doc)
for match_id, start, end in matches:
span = doc[start:end]
print(span.text)
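Because the matcher above was created with attr="LOWER", matching is case-insensitive; a small check (the sample sentence is my own):

lower_doc = nlp("reporters asked mario draghi about the vote.")
for match_id, start, end in matcher(lower_doc):
    print(lower_doc[start:end].text)  # matches despite the lowercase text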
# drop empty and whitespace-only entries left over after cleaning
names = [name for name in cleaned_names if name.strip()]
def remove_parenthesis(name):
    # drop a parenthesized segment, e.g. share-class annotations in listings
    if "(" in name:
        l_paren_idx = name.index("(")
        r_paren_idx = name.index(")")
        return name[:l_paren_idx] + name[r_paren_idx + 1:]
    else:
        return name

combined_list['Company Name'] = combined_list['Company Name'].apply(remove_parenthesis)
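For example (my own illustration; note the trailing space the slicing leaves behind):

print(remove_parenthesis("Alphabet Inc. (Class A)"))  # -> "Alphabet Inc. "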
SP500 = df[['Symbol', 'Security']]
SP500 = SP500.rename(columns={"Security": "Company Name"})
doc = nlp("A joint session of Italian parliament and some regional delegates, \
known as “great electors,” began a secret ballot on Monday to elect the next \
Italian president to replace the current officeholder, Sergio Mattarella. \
It is a focus of special attention because a top contender for the job is \
the prime minister, Mario Draghi, a titan of Europe who in just a year in \
power has stabilized Italy’s politics and initiated long-overdue overhauls.")
# Find Matches
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)
Text = "Microsoft (MSFT) dipped 2.4% after announcing the software giant will \
buy video game company Activision Blizzard, Inc (ATVI) in an all-cash transaction \
valued at $68.7 billion. \nThe shortened trading week will feature quarterly \
reports from 35 companies in the S&P 500, including Bank of America (BAC), \
UnitedHealth Group(UNH), and Netflix (NFLX). General Motors (GM) said it \
will invest roughly $6.6 billion in its home state of Michigan through \
2024. GM has projected it will overtake Tesla (TSLA) as the \
top U.S.-based seller of electric vehicles by mid-decade. Retailer Gap (GPS) \
shares fell 6.7% after Morgan Stanley downgraded the retailer."
doc = nlp(Text)
matches = matcher(doc)
for match_id, start, end in matches:
    rule_id = nlp.vocab.strings[match_id]  # get the rule's string name, e.g. 'COMPANY'
    span = doc[start:end]                  # the matched slice of the doc
    print(rule_id, span.text)
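In spaCy v3 the matcher can also return labeled Span objects directly, which makes this loop shorter; a brief sketch using the same doc and matcher:

for span in matcher(doc, as_spans=True):
    print(span.label_, span.text)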