Created
June 5, 2020 20:49
-
-
Save aniruddha27/83d917d9061fe525d8b90f35b9addb43 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# to extract initiatives using pattern matching | |
def all_schemes(text, check):
    """Extract initiative names (schemes, programmes, ...) from a sentence.

    The sentence is run through the module-level spaCy pipeline ``nlp`` and
    scanned with a token ``Matcher`` for this shape: a determiner, two
    proper nouns tagged as ``compound``, up to three further proper nouns,
    and then one or more initiative keywords ("scheme", "programme", ...).

    Args:
        text: Sentence to search; handed to ``nlp`` as-is.
        check: Flag for the sentence. When it equals 0 the sentence is
            skipped and an empty list is returned without parsing ``text``.

    Returns:
        A list of matched initiative names as strings, with the leading
        determiner stripped. When a match textually contains the previously
        recorded name, it replaces that name instead of being appended.
    """
    # Guard first: a skipped sentence should not pay for a spaCy parse.
    if check == 0:
        return []

    # Keywords that terminate an initiative name.
    prog_list = ['programme', 'scheme',
                 'initiative', 'campaign',
                 'agreement', 'conference',
                 'alliance', 'plan']

    # Pattern to match initiative names.
    pattern = [{'POS': 'DET'},
               {'POS': 'PROPN', 'DEP': 'compound'},
               {'POS': 'PROPN', 'DEP': 'compound'},
               {'POS': 'PROPN', 'OP': '?'},
               {'POS': 'PROPN', 'OP': '?'},
               {'POS': 'PROPN', 'OP': '?'},
               {'LOWER': {'IN': prog_list}, 'OP': '+'}]

    doc = nlp(text)

    matcher = Matcher(nlp.vocab)
    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the legacy ``add(key, None, pattern)`` signature is rejected.
    matcher.add("matching", [pattern])

    schemes = []
    # Each match is a (match_id, start, end) tuple of token offsets.
    for _, start, end in matcher(doc):
        # Drop the leading determiner from the reported name.
        if doc[start].pos_ == 'DET':
            start += 1
        span = str(doc[start:end])
        # The optional tokens make the matcher emit overlapping matches for
        # one name; keep only the longest by replacing the previous entry
        # whenever the new span contains it.
        if schemes and schemes[-1] in span:
            schemes[-1] = span
        else:
            schemes.append(span)
    return schemes
# Row by row, pass the sentence and its check flag to all_schemes and store
# the returned list in a new 'Schemes1' column.
df2['Schemes1'] = df2.apply(
    lambda row: all_schemes(row.Sent, row.Check_Schemes),
    axis=1,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment