# langner/similar_pages.py

## similar_pages.py
def similar_pages(pages1, pages2):
    """Determine whether two pages strings are similar.

    The comparison is case-insensitive and ignores surrounding whitespace.
    Each string is normalized before comparing:

    - Redundant digits in the end page are ignored -- for example, 1660-1661
      is reduced to 1660-1 (only when start and end have the same length).
    - The end page (and hyphen) is dropped if it's a single page (start == end).
    - For some journals, WoK replaces the end page with something else, for
      example 241-+ instead of 241-247.e9 (supp info), or O1125-U144. In these
      cases only the start page is kept.
    - Sometimes WoK adds 'UNSP' before the article number when it is given as
      a page; that prefix is removed.

    Finally, if both strings are still ranges and either end page starts with
    'u' (e.g. '540-U32'), only the start pages are compared.

    Args:
        pages1: First pages string; may be empty or None.
        pages2: Second pages string; may be empty or None.

    Returns:
        True if both strings are non-blank and normalize to the same value,
        False otherwise.
    """
    # Local import: the function should not rely on a module-level `import os`.
    import os.path

    if not (pages1 and pages2):
        return False

    text1 = pages1.strip().lower()
    text2 = pages2.strip().lower()

    # Whitespace-only input carries no page information, same as empty input.
    if not (text1 and text2):
        return False

    def standardize_pages(p):
        """Reduce a lower-cased pages string to its short canonical form."""
        if p.count('-') == 1:
            start, end = p.split('-')
            if start == end:
                p = start
            elif start.startswith('o'):
                # startswith() instead of start[0]: start may be empty ("-5").
                p = start
            elif end == "+" or ".e" in end:
                p = start
            elif len(start) == len(end):
                shared = len(os.path.commonprefix([start, end]))
                p = start + "-" + end[shared:]
        words = p.split()
        if len(words) > 1 and words[0].upper() == "UNSP":
            p = ' '.join(words[1:])
        return p

    sp1 = standardize_pages(text1)
    sp2 = standardize_pages(text2)

    # Sometimes WoK has pages like '540-U32', but the actual pages are regular, so we
    # won't detect the 'U' in our database. In this case we need to adjust the pages
    # for both cases (taking just the first page for matching).
    if '-' in sp1 and '-' in sp2:
        start1, _, end1 = sp1.partition('-')
        start2, _, end2 = sp2.partition('-')
        # startswith() tolerates an empty end page such as '540-'.
        if end1.startswith('u') or end2.startswith('u'):
            sp1, sp2 = start1, start2

    return sp1 == sp2
	def similar_pages(pages1, pages2):
		"""Determine whether two pages strings are similar.

		The comparison is case-insensitive and ignores surrounding whitespace.
		Each string is normalized before comparing:

		- Redundant digits in the end page are ignored -- for example, 1660-1661
		  is reduced to 1660-1 (only when start and end have the same length).
		- The end page (and hyphen) is dropped if it's a single page (start == end).
		- For some journals, WoK replaces the end page with something else, for
		  example 241-+ instead of 241-247.e9 (supp info), or O1125-U144. In these
		  cases only the start page is kept.
		- Sometimes WoK adds 'UNSP' before the article number when it is given as
		  a page; that prefix is removed.

		Finally, if both strings are still ranges and either end page starts with
		'u' (e.g. '540-U32'), only the start pages are compared.

		Args:
			pages1: First pages string; may be empty or None.
			pages2: Second pages string; may be empty or None.

		Returns:
			True if both strings are non-blank and normalize to the same value,
			False otherwise.
		"""
		# Local import: the function should not rely on a module-level `import os`.
		import os.path

		if not (pages1 and pages2):
			return False

		text1 = pages1.strip().lower()
		text2 = pages2.strip().lower()

		# Whitespace-only input carries no page information, same as empty input.
		if not (text1 and text2):
			return False

		def standardize_pages(p):
			"""Reduce a lower-cased pages string to its short canonical form."""
			if p.count('-') == 1:
				start, end = p.split('-')
				if start == end:
					p = start
				elif start.startswith('o'):
					# startswith() instead of start[0]: start may be empty ("-5").
					p = start
				elif end == "+" or ".e" in end:
					p = start
				elif len(start) == len(end):
					shared = len(os.path.commonprefix([start, end]))
					p = start + "-" + end[shared:]
			words = p.split()
			if len(words) > 1 and words[0].upper() == "UNSP":
				p = ' '.join(words[1:])
			return p

		sp1 = standardize_pages(text1)
		sp2 = standardize_pages(text2)

		# Sometimes WoK has pages like '540-U32', but the actual pages are regular, so we
		# won't detect the 'U' in our database. In this case we need to adjust the pages
		# for both cases (taking just the first page for matching).
		if '-' in sp1 and '-' in sp2:
			start1, _, end1 = sp1.partition('-')
			start2, _, end2 = sp2.partition('-')
			# startswith() tolerates an empty end page such as '540-'.
			if end1.startswith('u') or end2.startswith('u'):
				sp1, sp2 = start1, start2

		return sp1 == sp2