Skip to content

Instantly share code, notes, and snippets.

@schcriher
Last active November 9, 2017 01:14
Show Gist options
  • Save schcriher/7388722 to your computer and use it in GitHub Desktop.
Save schcriher/7388722 to your computer and use it in GitHub Desktop.
Calcula la diferencia entre dos textos
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
#
# Copyright (C) 2013-2017 Cristian Hernán Schmidt
#
# texdiff.py is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# texdiff.py is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with texdiff.py. If not, see <http://www.gnu.org/licenses/>.
import re
import unicodedata
# Matches any single character that is NOT a regex "word" character.
# A raw string is used so the backslash reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
NO_WORD_RE = re.compile(r'[^\w]')
def remove_diacritics(string):
    """Return the string in NFKD form with all nonspacing marks dropped.

    The text is decomposed with Unicode NFKD normalization, then every
    character whose general category is "Mn" (Mark, Nonspacing) is
    discarded. The result is not recomposed afterwards.
    """
    kept = []
    for char in unicodedata.normalize('NFKD', string):
        if unicodedata.category(char) == 'Mn':
            # Combining accent split off by the decomposition: skip it.
            continue
        kept.append(char)
    return ''.join(kept)
def get_positions(string, letter, offset=0):
    """Lazily yield every index where the letter occurs in the string.

    Each index is shifted by "offset" (an integer added to every position)
    before being yielded. Nothing is yielded when the letter never occurs.
    """
    yield from (
        index + offset
        for index, char in enumerate(string)
        if char == letter
    )
def texdiff(a, b, insensitive=True, accents=False, onlyword=False):
    """Return the fraction of difference between the texts "a" and "b".

    An incorrect letter is penalized more in a short word than in a long one.

    Parameters:
        a, b              The two strings to compare.
        insensitive=True  Case-insensitive comparison: both texts are
                          lowercased before comparing.
        accents=False     When False, diacritics are stripped from both
                          texts, so words with and without accents are not
                          distinguished. When True, accents are kept and
                          count as differences.
        onlyword=False    When True, only the characters considered "word"
                          characters by regular expressions are analyzed;
                          everything matched by NO_WORD_RE is removed first.

    Returns:
        A float "fraction" with 0 <= fraction <= 1:
            0.0  Zero difference, "a" and "b" are the same (this includes
                 the case where both texts are empty after preprocessing)
            0.5  Half difference, example: the same letters in another order
            1.0  Full difference, no letter matches

    Use:
        fraction = texdiff(a, b)

    Design: Schmidt Cristian Hernán <schcriher@gmail.com>
    """
    if insensitive:
        a = a.lower()
        b = b.lower()
    if not accents:
        a = remove_diacritics(a)
        b = remove_diacritics(b)
    if onlyword:
        a = NO_WORD_RE.sub('', a)
        b = NO_WORD_RE.sub('', b)

    n = len(a) + len(b)
    if n == 0:
        # Two empty texts are identical; returning early also avoids the
        # division by zero the final expression would otherwise hit.
        return 0.0

    # When one text is contained in the other, shift its positions so the
    # shared part lines up. str.find returns -1 when there is no match,
    # which is clamped to an offset of 0.
    a_offset = max(b.find(a), 0)
    b_offset = max(a.find(b), 0)

    diff_quantity = 0  # letters present a different number of times
    diff_position = 0  # letters present at positions the other text lacks
    for letter in set(a + b):
        pos_a = set(get_positions(a, letter, a_offset))
        pos_b = set(get_positions(b, letter, b_offset))
        # Positions are unique, so each set's size is the letter's count.
        diff_quantity += abs(len(pos_a) - len(pos_b))
        diff_position += len(pos_a ^ pos_b)

    # Each of the two terms is at most n, hence the 2 * n normalization.
    return (diff_quantity + diff_position) / (2 * n)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment