Skip to content

Instantly share code, notes, and snippets.

@langner
Last active May 7, 2021 09:55
Show Gist options
  • Save langner/9e732e662b81d45af52a to your computer and use it in GitHub Desktop.
Save langner/9e732e662b81d45af52a to your computer and use it in GitHub Desktop.
Python function for testing similarity of two article title fields
import difflib
import string
def _prepare_title(title, debug=None):
    """Return *title* as lowercased text, decoding UTF-8 bytes when needed.

    Byte strings are decoded first and lowercased afterwards, so that
    non-ASCII letters are lowercased too. Text input is only lowercased.
    If the bytes are not valid UTF-8, the problem is reported through
    *debug* (when given) and the title is decoded on a best-effort basis
    with undecodable bytes replaced, so the comparison can still proceed.
    """
    if isinstance(title, bytes):
        try:
            title = title.decode('utf-8')
        except UnicodeDecodeError as e:
            if debug:
                debug("There was a problem decoding a title: %s" % e)
                debug("Offending title: %s" % title)
            title = title.decode('utf-8', 'replace')
    return title.lower()


def similar_titles(t1, t2, accuracy=1.00, debug=None):
    """Determine whether two titles are similar.

    As a rule, we want titles to be identical after removing whitespace
    and punctuation. Other discrepancies should be dealt with manually by
    ensuring the titles are correct, or by replacing strings in all titles,
    in this function, before comparing them.

    Parameters:
        t1, t2: the titles to compare, as text or UTF-8 encoded bytes.
        accuracy: minimum ``difflib.SequenceMatcher`` ratio for the
            normalized titles to count as similar; the default of 1.00
            requires them to be identical after normalization.
        debug: optional callable taking one message string, used to report
            titles that could not be decoded cleanly.

    Returns:
        True if the similarity ratio of the normalized titles is at least
        *accuracy*, False otherwise.

    Relies on ``greek_alphabet`` (a mapping merged into the replacement
    table) and ``unidecode`` being available at module level.
    """
    t1 = _prepare_title(t1, debug)
    t2 = _prepare_title(t2, debug)

    to_replace = {
        # Remove some prefixes that are sometimes prepended to titles.
        "letter to the editor: " : "", "tech sight. " : "",
        # Expand Unicode symbols and some signs.
        u"α" : "alpha", u"β" : "beta", u"γ" : "gamma",
        u"κ" : "kappa", u"δ": "delta",
        # NOTE(review): titles are lowercased before this table is applied,
        # so the uppercase key below normally cannot match any more; the
        # lowercase form is presumably handled by unidecode -- confirm.
        u"Å" : "a", "angstrom" : "a", 'angstroms' : 'angstrom',
        "+" : "plus",
        # Several names are often shortened.
        "h. pylori" : "helicobacter pylori",
        # The formatting of isotopes varies.
        "h-1" : "1h", "c-13" : "13c", "n-15" : "15n",
        # WoS is not capable of printing vertical bars for some odd reason,
        # a response I received from the Customer Support.
        '|' : ' vertical bar ',
    }
    to_replace.update(greek_alphabet)

    # Apply longer keys first (ties broken alphabetically) so the outcome
    # does not depend on dict ordering: "angstroms" must be rewritten
    # before its prefix "angstrom", otherwise it would end up as "as".
    for old in sorted(to_replace, key=lambda k: (-len(k), k)):
        new = to_replace[old]
        t1 = t1.replace(old, new)
        t2 = t2.replace(old, new)

    # Replace any remaining Unicode with ASCII equivalents
    t1 = unidecode(t1)
    t2 = unidecode(t2)

    # Whitespace and punctuation are ignored in the comparison.
    exclude = set(string.whitespace + string.punctuation)
    t1 = ''.join([c for c in t1 if c not in exclude])
    t2 = ''.join([c for c in t2 if c not in exclude])

    return difflib.SequenceMatcher(None, t1, t2).ratio() >= accuracy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment