Skip to content

Instantly share code, notes, and snippets.

@badzil
Created February 16, 2011 23:50
Show Gist options
  • Save badzil/830586 to your computer and use it in GitHub Desktop.
Save badzil/830586 to your computer and use it in GitHub Desktop.
ads_affiliations_splitter.py
import re
# Raw strings are used for every pattern so that regex escapes such as \s and
# \( are not interpreted as (invalid) Python string escapes.

# A run of two or more whitespace characters.
_RE_MULTIPLE_SPACES = re.compile(r'\s\s+')
# A label of two or more capital letters followed by an opening parenthesis,
# e.g. 'AA('. Group 1 captures the label.
_RE_AFFILIATION_PREFIX = re.compile(r'([A-Z][A-Z]+)\(')
# A closing parenthesis, an optional ',' or ';', one space, then the prefix of
# the next affiliation, e.g. '), AB('.
_RE_AFFILIATION_SUFFIX = re.compile(r'\)[,;]? [A-Z][A-Z]+\(')
# An <EMAIL>...</EMAIL> element together with an optional ';' and one optional
# whitespace character around it. Group 1 captures the email itself.
_RE_EMAIL = re.compile(r';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')
def clean_affiliations_string(affiliations_string):
    """
    Trims the leading and trailing whitespace, then replaces every run of two
    or more whitespace characters with a single space.
    """
    trimmed = affiliations_string.strip()
    collapsed = _RE_MULTIPLE_SPACES.sub(' ', trimmed)
    return collapsed
def get_affiliations(affiliations_string):
    """
    Parses a string of labelled affiliations such as 'AA(aff1), AB(aff2)'.

    Returns a tuple of two dictionaries, both keyed by the integer index of
    the affiliation label:
        * index -> affiliation text, with any <EMAIL> element removed.
        * index -> email, only for the affiliations that contain one.

    Raises an Exception when the same label appears more than once.
    """
    affiliation_by_index = {}
    email_by_index = {}
    # Normalise the whitespace of the affiliations string before parsing it.
    remaining = clean_affiliations_string(affiliations_string)
    # Each pass consumes one affiliation from the front of the string.
    while remaining:
        remaining, index, text = _extract_first_affiliation(remaining)
        if index in affiliation_by_index:
            raise Exception('Double label.')
        if '<EMAIL>' in text:
            text, email_by_index[index] = _extract_email_from_affiliation(text)
        affiliation_by_index[index] = text
    return (affiliation_by_index, email_by_index)
def _extract_email_from_affiliation(affiliation):
    """
    Separates the first <EMAIL>...</EMAIL> element from an affiliation.

    Returns a tuple:
        * affiliation with the matched email markup removed.
        * email found inside the markup.

    Raises an Exception when no <EMAIL> element can be parsed.
    """
    found = _RE_EMAIL.search(affiliation)
    if found is None:
        raise Exception(
            'Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)
    # group(0) is the whole markup (including the surrounding ';' and space
    # taken by the pattern); group(1) is only the email.
    email_markup = found.group(0)
    without_email = affiliation.replace(email_markup, '')
    return (without_email, found.group(1))
def _extract_first_affiliation(affiliations_string):
    """
    Extracts the first affiliation from the affiliations string.

    The string must start with a label of two or more capital letters followed
    by an opening parenthesis, e.g. 'AA(CERN), AB(CfA (Cambridge))'. The
    affiliation ends at the parenthesis balancing the one that follows the
    label, so parentheses nested inside the affiliation are kept.

    Returns a tuple of:
        * the affiliations string without the first affiliation.
        * the index of the first affiliation.
        * the first affiliation, stripped of surrounding spaces.

    Raises an Exception when the string does not start with a label, when the
    parentheses are unbalanced, or when the closing parenthesis is followed by
    anything other than the end of the string or another labelled affiliation.
    """
    match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
    if match is None:
        raise Exception('Prefix not found: %s' % affiliations_string)
    index = get_index_from_label(match.group(1))
    # The match is anchored at the start, so slicing at its end drops exactly
    # the label and its opening parenthesis.
    affiliations_string = affiliations_string[match.end():]

    # Now we count the parentheses and when we find them balanced, we consider
    # that we got the full affiliation string. The count starts at 1 for the
    # parenthesis that was removed together with the label.
    opened_parenthesis = 1
    idx = 0
    for idx, char in enumerate(affiliations_string):
        if char == '(':
            opened_parenthesis += 1
        elif char == ')':
            opened_parenthesis -= 1
            if opened_parenthesis == 0:
                break
    if opened_parenthesis > 0:
        raise Exception('Problem of affiliation with unbalanced parenthesis.')

    # idx is the position of the closing parenthesis: everything before it is
    # the affiliation, everything from it onwards is what is left to parse.
    affiliation = affiliations_string[:idx].strip()
    affiliations_string = affiliations_string[idx:].strip()

    if affiliations_string == ')':
        # This was the last affiliation.
        return ('', index, affiliation)
    if _RE_AFFILIATION_SUFFIX.match(affiliations_string) is None:
        # The parentheses are balanced here; the problem is what follows them.
        raise Exception(
            'Unexpected text after affiliation: %s' % affiliations_string)
    # Another affiliation follows: drop the closing parenthesis, the optional
    # separator and the space so that the string starts with the next label.
    affiliations_string = re.sub(r'\)[;,]? ', '', affiliations_string, count=1)
    return (affiliations_string, index, affiliation)
# Dictionary used to cache the results of the computation for the labels.
_LABEL_INDEX = {}


def get_index_from_label(label):
    """
    Returns an integer index for an affiliation label, ie:
        AA -> 1
        AB -> 2
        BA -> 27
        AAA -> 677

    The index of each label is computed once and then served from the
    _LABEL_INDEX cache.
    """
    index = _LABEL_INDEX.get(label)
    if index is None:
        # The label is a base-26 representation of the index in which 'A' is
        # worth 1 (ord('A') is 65). Walking it in reverse makes the position
        # of each letter equal to its power of 26.
        index = 0
        for idx, char in enumerate(reversed(label)):
            index += (ord(char) - 64) * (26 ** idx)
        # Because we consider 'A' as 1 and not 0, we need to offset by 26.
        index -= 26
        # Store the result so that the cache is actually populated.
        _LABEL_INDEX[label] = index
    return index
# Each entry is (input string, expected (affiliations, emails) result).
TESTS = [
    # One affiliation carrying the first label.
    ('AA(aff1)', ({1: 'aff1'}, {})),
    # One affiliation whose label is not AA.
    ('AB(aff1)', ({2: 'aff1'}, {})),
    # Two affiliations in label order, separated by ', ', '; ' or ' '.
    ('AA(aff1), AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
    ('AA(aff1); AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
    ('AA(aff1) AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
    # Three affiliations in label order.
    (
        'AA(aff1), AB(aff2), AC(aff3)',
        ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {}),
    ),
    # Spaces inside the affiliation.
    ('AA(aff with space)', ({1: 'aff with space'}, {})),
    # Nested parentheses, with the labels in order and out of order.
    (
        'AA(CERN, Switzerland), AB(CfA (Cambridge) USA)',
        ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge) USA'}, {}),
    ),
    (
        'AB(CERN, Switzerland), AA(CfA (Cambridge) USA)',
        ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {}),
    ),
    # Email element inside an affiliation.
    (
        'AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB(CfA (Cambridge))',
        (
            {1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'},
            {1: 'bthiell@cfa.harvard.edu'},
        ),
    ),
    # Same, with a space right after an opening parenthesis.
    (
        'AA(CERN, Switzerland <EMAIL>bthiell@cfa.harvard.edu</EMAIL>), AB( CfA (Cambridge))',
        (
            {1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'},
            {1: 'bthiell@cfa.harvard.edu'},
        ),
    ),
    # Padding spaces around the affiliations and a ';' after the email.
    (
        'AA( CERN Geneva ), AB( Another affiliation <EMAIL>me@me.com </EMAIL>;)',
        (
            {1: 'CERN Geneva', 2: 'Another affiliation'},
            {2: 'me@me.com'},
        ),
    ),
    # Labels of three and four letters.
    (
        'AA(aff1), AAA(aff2), AAAA(aff3)',
        ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {}),
    ),
]
def test_get_affiliations():
    """
    Runs get_affiliations on every input of TESTS and prints the input and the
    expected output of each case whose result differs, then prints the number
    of cases that were run.
    """
    for aff_string, output in TESTS:
        if get_affiliations(aff_string) != output:
            # print is given a single parenthesised argument so that the line
            # is valid, with the same output, on both Python 2 and Python 3.
            print('Test failed:\n\t%s\n\t%s' % (aff_string, output))
    print('All %d tests finished.' % len(TESTS))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment