Skip to content

Instantly share code, notes, and snippets.

@rsalaza4
Last active March 20, 2024 18:15
Show Gist options
  • Save rsalaza4/a923e5591401bccb770b244b766efd17 to your computer and use it in GitHub Desktop.
Save rsalaza4/a923e5591401bccb770b244b766efd17 to your computer and use it in GitHub Desktop.
# Regex fragment for a single capitalised word (e.g. "Boston"); every "{w}"
# placeholder in the templates below expands to it
_WORD = '[A-Z][a-z]*'

# "<0-3 words> University of <place>[ at <campus>]" family: each leading-word
# prefix is combined with each trailing form, longest first, so the
# alternation tries the most specific name before its shorter variants
_LEADING_WORDS = ['{w} {w} {w} ', '{w} {w} ', '{w} ', '']
_UNIVERSITY_OF_TAILS = [
    '{w} {w} at {w} {w}',
    '{w} {w} at {w}',
    '{w} at {w} {w}',
    '{w} at {w}',
    '{w} {w}',
    '{w}',
]
_templates = [
    leading + 'University of ' + tail
    for leading in _LEADING_WORDS
    for tail in _UNIVERSITY_OF_TAILS
]

# Remaining institution name shapes (order matters: earlier alternatives win)
_templates += [
    '{w} {w} University',
    '{w} University',
    '[A-Z]* University',
    '{w} Institute of {w} & {w}',
    '{w} Institute of {w} {w}',
    '{w} Institute of {w}',
    '{w} {w} Community College',
    '{w} Community College',
    'College of {w} & {w}',
    '{w} {w} College of {w}',
    '[A-Z]* College of {w}',
    '{w} College of {w}',
    '{w} {w} College',
    '{w} College',
    '{w} {w} High School',
    '{w} High School',
    '{w} {w} Military Academy',
    '{w} Military Academy',
    'Universidad de los {w}',
    'Universidad de las {w}',
    'Universidad de {w}',
    'Universidad {w} de {w} {w}',
    'Universidad {w} de los {w}',
    'Universidad {w} de las {w}',
    'Universidad {w} de {w}',
]

# Expand the templates into the concrete university name patterns
sub_patterns = [template.format(w=_WORD) for template in _templates]

# Combine every pattern into one capturing alternation group
university_patterns = '(' + '|'.join(sub_patterns) + ')'
def get_univerisities(self):
    """Extract the unique university names mentioned in ``self.raw_text``.

    Every substring of ``self.raw_text`` that matches ``university_patterns``
    is collected, duplicates are dropped, and bare keywords that do not name
    an institution on their own (such as ``" University"``) are discarded.
    The result is stored in ``self.universities``; nothing is returned.

    Names are kept in the order they first appear in the text, so running
    the method again on the same text yields the same list.

    NOTE: the misspelled method name is kept as-is so existing callers
    continue to work.
    """
    # Bare keywords to drop if the patterns happen to capture them alone
    universities_to_remove = {
        "University",
        "University ",
        " University",
        " University ",
        "College",
    }
    # Find all strings in text that follow university names patterns
    university_matches = re.findall(university_patterns, self.raw_text)
    # dict.fromkeys de-duplicates while preserving first-seen order; the
    # membership filter replaces the remove()/bare-except loop, so absent
    # keywords need no exception handling and real errors are not hidden
    self.universities = [
        name
        for name in dict.fromkeys(university_matches)
        if name not in universities_to_remove
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment