Last active
January 27, 2022 05:42
-
-
Save Aditya1001001/6a16ade072662c2ffe6cb782dad2b610 to your computer and use it in GitHub Desktop.
Name Matching: News Data Annotation Using Spacy PhraseMatcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Register every entry of `names` as a phrase pattern under the "COMPANY"
# label. make_doc is used to build each pattern Doc.
patterns = list(map(nlp.make_doc, names))
matcher.add("COMPANY", patterns)

# Do the same for every value of the `Symbol` column, under the "SYMBOL" label.
patterns = list(map(nlp.make_doc, data['Symbol']))
matcher.add("SYMBOL", patterns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from cleanco import basename | |
# Derive a cleaned variant of each company name. basename is applied twice,
# presumably so that stacked legal suffixes are both removed — TODO confirm.
combined_list['Cleaned Name'] = (
    combined_list['Company Name'].apply(basename).apply(basename)
)

# Stack the raw and the cleaned names into one de-duplicated Series.
names = pd.concat(
    [combined_list['Company Name'], combined_list['Cleaned Name']],
    ignore_index=True,
).drop_duplicates()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stack the two listings on top of each other with a fresh index, then drop
# rows that are exact duplicates.
combined_list = pd.concat(
    objs=[data, SP500],
    ignore_index=True,
).drop_duplicates()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manual overrides: when an entry of `names` is exactly one of these short
# keys, it is replaced by the fuller name on the right. Most replacements are
# hyphenated names, which suggests the keys are names that were cut at the
# hyphen during cleaning — TODO confirm.
name_corrections = {
    "A": "A-Mark",
    "Federal": "Federal-Mogul",
    "Global": "Global-Tech Advanced Innovations",
    "G": "G-III Apparel",
    "Heritage": "Heritage Crystal Clean",
    "II": "II-VI",
    # NOTE(review): this pair does not follow the prefix pattern of the other
    # entries — confirm the intended replacement.
    "Mid": "Microchip Technology",
    "Pro": "Pro-Dex",
    "Perma": "Perma-Fix Environmental Services",
    "Park": "Park-Ohio Holdings",
    "Bio": "Bio-Techne",
    # No leading space here: a pattern that starts with whitespace would be
    # tokenized with a leading whitespace token and fail to match normal text.
    "ROBO": "ROBO Global Robotics and Automation Index ETF",
    "United": "United-Guardian",
    "Uni": "Uni-Pixel",
    "Popular": "Banco Popular",
    "News": "News Corp",
}

# Swap in the correction when one exists; otherwise keep the name unchanged.
names = [name_corrections.get(name, name) for name in names]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
from spacy.matcher import PhraseMatcher | |
# Load the "en_core_web_sm" spaCy pipeline; it must already be installed.
nlp = spacy.load("en_core_web_sm")
# Build a PhraseMatcher on top of the pipeline's vocabulary.
matcher = PhraseMatcher(nlp.vocab)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
from spacy.matcher import PhraseMatcher | |
# Load the "en_core_web_sm" spaCy pipeline; it must already be installed.
nlp = spacy.load("en_core_web_sm")
# Build a PhraseMatcher on top of the pipeline's vocabulary.
matcher = PhraseMatcher(nlp.vocab)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from spacy import displacy | |
# displaCy options: one highlight colour per entity label.
colors = {"COMPANY": "#F67DE3", "SYMBOL": "#7DF6D9"}
options = {"colors": colors}

# Payload for displaCy's manual mode: the raw text plus a list of
# {"start", "end", "label"} dicts expressed in character offsets.
plot_data = {
    "text": doc.text,
    "ents": [],
    "title": None,
}

# Group the matches by label, keyed by their character span. The offsets come
# from the matched Span itself, so every mention is located exactly where it
# occurs (searching the text for the first token's string would always land
# on the first occurrence of that string, even inside another word).
matches_with_dup = {"COMPANY": {}, "SYMBOL": {}}
for match_id, span_start, span_end in matches:
    rule_id = nlp.vocab.strings[match_id]
    span = doc[span_start:span_end]
    matches_with_dup[rule_id][(span.start_char, span.end_char)] = {
        "start": span.start_char,
        "end": span.end_char,
        "label": rule_id,
    }

# A short name can match inside a longer one (the short form and the expanded
# form are both patterns). Keep only the spans that are not covered by another
# span with the same label, i.e. the most expanded match at each position.
for label_matches in matches_with_dup.values():
    char_spans = list(label_matches)
    for start, end in char_spans:
        covered = any(
            other_start <= start and end <= other_end
            for other_start, other_end in char_spans
            if (other_start, other_end) != (start, end)
        )
        if not covered:
            plot_data["ents"].append(label_matches[(start, end)])

# Sort the entities by start offset before rendering.
plot_data["ents"] = sorted(plot_data["ents"], key=lambda ent: ent["start"])
displacy.render(plot_data, style="ent", options=options, manual=True, jupyter=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Phrases the matcher should look for
phrases = [
    "Sergio Mattarella",
    "Mario Draghi",
    "president",
    "prime minister",
]
# Run each phrase through the pipeline to obtain its Doc object
patterns = list(map(nlp, phrases))
# Register all pattern Docs under a single "PatternList" rule
matcher.add("PatternList", patterns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Notebook shell command: download nasdaq-listed-symbols.csv from datahub.io.
!wget https://datahub.io/core/nasdaq-listings/r/nasdaq-listed-symbols.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests # library to handle requests | |
from bs4 import BeautifulSoup # library to parse HTML documents | |
# Page holding the S&P 500 constituents table.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# Without a timeout a stalled connection would hang indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page later on.
response.raise_for_status()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import StringIO

# read_html returns a list of DataFrames; the markup passed in here is a
# single table, so the first element is the one we want. The markup is wrapped
# in StringIO because passing a literal HTML string is deprecated in recent
# pandas releases.
df = pd.read_html(StringIO(str(table)))[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the packages used for name cleaning (cleanco) and for fetching and
# parsing the web page (requests, beautifulsoup4).
pip install cleanco requests beautifulsoup4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Load the listings CSV from the working directory; other snippets read its
# 'Symbol' column.
data = pd.read_csv("nasdaq-listed-symbols.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# attr="LOWER" makes the matcher compare lowercased token text, so matching is
# case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# This list must be bound to `names`, the name the comprehension below
# iterates over.
names = ["Sergio Mattarella", "Mario Draghi"]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(name) for name in names]
matcher.add("Names", patterns)

# Each match is a (match_id, start_token, end_token) triple; print its text.
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep every cleaned name except the empty string and a lone space.
names = []
for candidate in cleaned_names:
    if candidate == " " or len(candidate) == 0:
        continue
    names.append(candidate)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_parenthesis(name: str) -> str:
    """Remove the first parenthesised segment from *name*.

    The text from the first "(" up to and including the next ")" is cut out;
    surrounding whitespace is left untouched. Only the first such segment is
    removed.

    Args:
        name: The string to clean.

    Returns:
        *name* without its first "(...)" segment, or *name* unchanged when it
        has no "(" or when that "(" is never closed.
    """
    head, l_paren, rest = name.partition("(")
    if not l_paren:
        return name
    # Look for the closing parenthesis only *after* the opening one, so a
    # stray ")" earlier in the string cannot be paired with it.
    _, r_paren, tail = rest.partition(")")
    if not r_paren:
        # Unbalanced "(": leave the name as is rather than raising.
        return name
    return head + tail
# Run remove_parenthesis over every company name, overwriting the column.
combined_list['Company Name'] = combined_list['Company Name'].apply(remove_parenthesis)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the two columns of interest and rename "Security" to
# "Company Name".
SP500 = df[['Symbol', 'Security']].rename(
    columns={"Security": "Company Name"},
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse the downloaded page with the built-in HTML parser.
soup = BeautifulSoup(response.text, 'html.parser')
# Grab the first <table> element whose class is "wikitable".
table = soup.find('table', attrs={'class': "wikitable"})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample news paragraph to search; the trailing backslashes join the source
# lines into one single-line string.
doc = nlp("A joint session of Italian parliament and some regional delegates, \
known as “great electors,” began a secret ballot on Monday to elect the next \
Italian president to replace the current officeholder, Sergio Mattarella. \
It is a focus of special attention because a top contender for the job is \
the prime minister, Mario Draghi, a titan of Europe who in just a year in \
power has stabilized Italy’s politics and initiated long-overdue overhauls.")

# Run the matcher and print the text of every matched token span.
matches = matcher(doc)
for _, start, end in matches:
    print(doc[start:end].text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample market-news text; the trailing backslashes join the source lines and
# the only real line break is the explicit \n.
Text = "Microsoft (MSFT) dipped 2.4% after announcing the software giant will \
buy video game company Activision Blizzard, Inc (ATVI) in an all-cash transaction \
valued at $68.7 billion. \nThe shortened trading week will feature quarterly \
reports from 35 companies in the S&P 500, including Bank of America (BAC), \
UnitedHealth Group(UNH), and Netflix (NFLX). General Motors (GM) said it \
will invest roughly $6.6 billion in its home state of Michigan through \
2024. GM has projected it will overtake Tesla (TSLA) as the \
top U.S.-based seller of electric vehicles by mid-decade. Retailer Gap (GPS) \
shares fell 6.7% after Morgan Stanley downgraded the retailer."
doc = nlp(Text)

# Print "<rule name> <matched text>" for every match.
matches = matcher(doc)
for match_id, start, end in matches:
    # Look the hash back up to get the rule's string name, e.g. 'COMPANY'.
    rule_id = nlp.vocab.strings[match_id]
    print(rule_id, doc[start:end].text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment