schcriher/texdiff.py

## texdiff.py
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
#
# Copyright (C) 2013-2017 Cristian Hernán Schmidt
#
# texdiff.py is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# texdiff.py is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with texdiff.py. If not, see <http://www.gnu.org/licenses/>.

import re
import unicodedata

# Matches any single character that is not a regular-expression "word"
# character.  Written as a raw string so the backslash reaches the regex
# engine untouched instead of being an invalid string escape.
NO_WORD_RE = re.compile(r'[^\w]')

def remove_diacritics(string):
    """Strip nonspacing marks (Unicode category "Mn") from the string.

    The text is first decomposed with NFKD normalization, which splits
    combining marks off their base characters; the decomposed text is
    returned with those marks left out.
    """
    decomposed = unicodedata.normalize('NFKD', string)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

def get_positions(string, letter, offset=0):
    """Yield, in ascending order, each index where letter occurs in string.

    Every index is shifted by offset (an integer) before being yielded.
    """
    yield from (
        index + offset
        for index, char in enumerate(string)
        if char == letter
    )

def texdiff(a, b, insensitive=True, accents=False, onlyword=False):
    """ Returns the fraction of the difference between "a" and "b".

        Two penalties are added up over every distinct character of the
        two (normalized) strings:

            quantity    how many more times the character occurs in one
                        string than in the other
            position    how many indexes hold the character in exactly
                        one of the two strings

        The total is divided by twice the combined length, so an incorrect
        letter is more penalized in a short word than in a long one. When
        one string is contained in the other, its indexes are shifted to
        the place where it first occurs there, so the shared part lines up.

        Parameters:
            insensitive=True    Lowercase both strings before comparing
            accents=False       When false, diacritics are removed, so there
                                is no distinction between words with and
                                without accents
            onlyword=False      When true, it analyzes only the characters
                                considered word in the regular expressions

        Use:
            fraction = texdiff(a, b)

        where:
            0 <= fraction <= 1

            0.0 Zero difference, "a" and "b" are the same (this includes
                the case where both are empty after normalization)
            0.5 Half difference, example: the same letters in another order
            1.0 Full difference, no letter matches

       Design: Schmidt Cristian Hernán <schcriher@gmail.com>
    """
    if insensitive:
        a = a.lower()
        b = b.lower()

    if not accents:
        a = remove_diacritics(a)
        b = remove_diacritics(b)

    if onlyword:
        a = NO_WORD_RE.sub('', a)
        b = NO_WORD_RE.sub('', b)

    n = len(a) + len(b)
    if n == 0:
        # Two empty strings are identical; returning here also avoids the
        # division by zero in the final expression.
        return 0.0

    # str.find() returns -1 when there is no match, so clamping at zero
    # leaves the indexes unshifted unless one string contains the other.
    a_offset = max(b.find(a), 0)
    b_offset = max(a.find(b), 0)

    diff_quantity = 0
    diff_position = 0

    for letter in set(a + b):
        pos_a = set(get_positions(a, letter, a_offset))
        pos_b = set(get_positions(b, letter, b_offset))
        # Every occurrence has its own index, so each set's size is the
        # number of times the letter occurs in that string.
        diff_quantity += abs(len(pos_a) - len(pos_b))
        diff_position += len(pos_a ^ pos_b)

    return (diff_quantity + diff_position) / (2 * n)
	#!/usr/bin/env python3
	#-*- coding: utf-8 -*-
	#
	# Copyright (C) 2013-2017 Cristian Hernán Schmidt
	#
	# texdiff.py is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# texdiff.py is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with texdiff.py. If not, see <http://www.gnu.org/licenses/>.

	import re
	import unicodedata

	# Matches any single character that is not a regular-expression "word"
	# character.  Written as a raw string so the backslash reaches the regex
	# engine untouched instead of being an invalid string escape.
	NO_WORD_RE = re.compile(r'[^\w]')

	def remove_diacritics(string):
	    """Strip nonspacing marks (Unicode category "Mn") from the string.

	    The text is first decomposed with NFKD normalization, which splits
	    combining marks off their base characters; the decomposed text is
	    returned with those marks left out.
	    """
	    decomposed = unicodedata.normalize('NFKD', string)
	    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
	    return ''.join(kept)

	def get_positions(string, letter, offset=0):
	    """Yield, in ascending order, each index where letter occurs in string.

	    Every index is shifted by offset (an integer) before being yielded.
	    """
	    yield from (
	        index + offset
	        for index, char in enumerate(string)
	        if char == letter
	    )

	def texdiff(a, b, insensitive=True, accents=False, onlyword=False):
	    """ Returns the fraction of the difference between "a" and "b".

	        Two penalties are added up over every distinct character of the
	        two (normalized) strings:

	            quantity    how many more times the character occurs in one
	                        string than in the other
	            position    how many indexes hold the character in exactly
	                        one of the two strings

	        The total is divided by twice the combined length, so an incorrect
	        letter is more penalized in a short word than in a long one. When
	        one string is contained in the other, its indexes are shifted to
	        the place where it first occurs there, so the shared part lines up.

	        Parameters:
	            insensitive=True    Lowercase both strings before comparing
	            accents=False       When false, diacritics are removed, so there
	                                is no distinction between words with and
	                                without accents
	            onlyword=False      When true, it analyzes only the characters
	                                considered word in the regular expressions

	        Use:
	            fraction = texdiff(a, b)

	        where:
	            0 <= fraction <= 1

	            0.0 Zero difference, "a" and "b" are the same (this includes
	                the case where both are empty after normalization)
	            0.5 Half difference, example: the same letters in another order
	            1.0 Full difference, no letter matches

	       Design: Schmidt Cristian Hernán <schcriher@gmail.com>
	    """
	    if insensitive:
	        a = a.lower()
	        b = b.lower()

	    if not accents:
	        a = remove_diacritics(a)
	        b = remove_diacritics(b)

	    if onlyword:
	        a = NO_WORD_RE.sub('', a)
	        b = NO_WORD_RE.sub('', b)

	    n = len(a) + len(b)
	    if n == 0:
	        # Two empty strings are identical; returning here also avoids the
	        # division by zero in the final expression.
	        return 0.0

	    # str.find() returns -1 when there is no match, so clamping at zero
	    # leaves the indexes unshifted unless one string contains the other.
	    a_offset = max(b.find(a), 0)
	    b_offset = max(a.find(b), 0)

	    diff_quantity = 0
	    diff_position = 0

	    for letter in set(a + b):
	        pos_a = set(get_positions(a, letter, a_offset))
	        pos_b = set(get_positions(b, letter, b_offset))
	        # Every occurrence has its own index, so each set's size is the
	        # number of times the letter occurs in that string.
	        diff_quantity += abs(len(pos_a) - len(pos_b))
	        diff_position += len(pos_a ^ pos_b)

	    return (diff_quantity + diff_position) / (2 * n)