# racheltwu/remove_longest_duplicate_substring.py

## remove_longest_duplicate_substring.py
import re

def longestDuplicateNonOverlappingSubstring(string, minimum_length_of_substring=4):
    """Return the longest substring that repeats in ``string`` without overlapping.

    A candidate qualifies when it occurs again at or after the position where
    its first occurrence ends, e.g. ``'na'`` in ``'banana'``.  ``'ana'`` does
    not qualify there because its two occurrences share a character.

    Args:
        string: Text to search.
        minimum_length_of_substring: Candidates shorter than this are ignored.

    Returns:
        The longest qualifying substring, or ``False`` when there is none.
        When several candidates share the longest length, the one found
        furthest to the right in ``string`` is returned.
    """
    total = len(string)
    # Substrings are never empty, so a minimum below 1 behaves like 1.
    shortest = max(minimum_length_of_substring, 1)
    # Try the longest candidates first.  The whole string cannot repeat
    # inside itself, so the search starts one character shorter.
    for length in range(total - 1, shortest - 1, -1):
        rejected = set()
        # Scan right-to-left so ties on length go to the rightmost candidate.
        for start in range(total - length, -1, -1):
            candidate = string[start:start + length]
            if candidate in rejected:
                continue
            # Positional check: look for another occurrence that begins at or
            # after the end of the first one.  Unlike comparing rewritten
            # strings, this cannot be fooled by unrelated runs of spaces.
            first_end = string.find(candidate) + length
            if string.find(candidate, first_end) != -1:
                return candidate
            rejected.add(candidate)
    return False


def removeLongestDuplicateSubstringAfterFirstInstance(string, dup):
    """Replace every repeat of ``dup`` after its first occurrence with a space.

    The first occurrence of ``dup`` is kept.  Each later, non-overlapping
    occurrence is replaced by a single space, and runs of spaces in the
    result are then collapsed to one space.

    Args:
        string: Text to clean up.
        dup: Substring whose repeats should be removed.

    Returns:
        The rewritten string, or ``False`` when nothing was removed: ``dup``
        is empty, does not occur in ``string``, does not occur again after
        its first occurrence (e.g. ``'ana'`` in ``'banana'``), or the rewrite
        leaves the text unchanged.
    """
    first_at = string.find(dup)
    # An empty or missing dup has no "first instance" to split after.
    if not dup or first_at == -1:
        return False
    # Split the string right after the first instance of dup.
    split_at = first_at + len(dup)
    head, tail = string[:split_at], string[split_at:]
    # No later occurrence means the only repeats (if any) overlap the first.
    if dup not in tail:
        return False
    # Remove all instances of dup after the first instance.
    new_string = head + tail.replace(dup, ' ')
    # Collapse the runs of spaces the removals may have produced.
    new_string = re.sub(r' +', ' ', new_string)
    return new_string if new_string != string else False


# Rewrite trans.csv into out.csv, removing repeated text from the second
# column (the description) of every row.
newlines = []
with open('trans.csv', 'r') as infile:
    # Iterating the file (rather than splitting on '\n') avoids a phantom
    # empty row after a trailing newline.
    for raw_line in infile:
        line = raw_line.rstrip('\n')
        # NOTE(review): plain comma splitting does not understand quoted
        # fields; switch to the csv module if descriptions can contain commas.
        cells = line.split(',')
        if len(cells) < 2:
            # Blank lines and rows without a description pass through as-is.
            newlines.append(line)
            continue
        description = cells[1]
        dup = longestDuplicateNonOverlappingSubstring(description)
        new_description = (
            removeLongestDuplicateSubstringAfterFirstInstance(description, dup)
            if dup else False
        )
        if new_description:
            cells[1] = new_description
            newlines.append(','.join(cells))
        else:
            # Nothing to remove: keep the row exactly as it was read.
            newlines.append(line)

with open('out.csv', 'w') as outfile:
    outfile.writelines(line + '\n' for line in newlines)
	import re

	def longestDuplicateNonOverlappingSubstring(string, minimum_length_of_substring=4):
		"""Return the longest substring that repeats in ``string`` without overlapping.

		NOTE(review): this looks like a second, whitespace-mangled copy of the
		definition of the same name earlier in the file -- consider deleting
		the duplicate.

		A candidate qualifies when it occurs again at or after the position
		where its first occurrence ends, e.g. ``'na'`` in ``'banana'``.
		``'ana'`` does not qualify there because its two occurrences share a
		character.

		Args:
			string: Text to search.
			minimum_length_of_substring: Candidates shorter than this are ignored.

		Returns:
			The longest qualifying substring, or ``False`` when there is none.
			When several candidates share the longest length, the one found
			furthest to the right in ``string`` is returned.
		"""
		total = len(string)
		# Substrings are never empty, so a minimum below 1 behaves like 1.
		shortest = max(minimum_length_of_substring, 1)
		# Try the longest candidates first.  The whole string cannot repeat
		# inside itself, so the search starts one character shorter.
		for length in range(total - 1, shortest - 1, -1):
			rejected = set()
			# Scan right-to-left so ties on length go to the rightmost candidate.
			for start in range(total - length, -1, -1):
				candidate = string[start:start + length]
				if candidate in rejected:
					continue
				# Positional check: look for another occurrence that begins at
				# or after the end of the first one.
				first_end = string.find(candidate) + length
				if string.find(candidate, first_end) != -1:
					return candidate
				rejected.add(candidate)
		return False


	def removeLongestDuplicateSubstringAfterFirstInstance(string, dup):
		"""Replace every repeat of ``dup`` after its first occurrence with a space.

		NOTE(review): this looks like a second, whitespace-mangled copy of the
		definition of the same name earlier in the file -- consider deleting
		the duplicate.

		The first occurrence of ``dup`` is kept.  Each later, non-overlapping
		occurrence is replaced by a single space, and runs of spaces in the
		result are then collapsed to one space.

		Args:
			string: Text to clean up.
			dup: Substring whose repeats should be removed.

		Returns:
			The rewritten string, or ``False`` when nothing was removed:
			``dup`` is empty, does not occur in ``string``, does not occur
			again after its first occurrence (e.g. ``'ana'`` in ``'banana'``),
			or the rewrite leaves the text unchanged.
		"""
		first_at = string.find(dup)
		# An empty or missing dup has no "first instance" to split after.
		if not dup or first_at == -1:
			return False
		# Split the string right after the first instance of dup.
		split_at = first_at + len(dup)
		head, tail = string[:split_at], string[split_at:]
		# No later occurrence means the only repeats (if any) overlap the first.
		if dup not in tail:
			return False
		# Remove all instances of dup after the first instance.
		new_string = head + tail.replace(dup, ' ')
		# Collapse the runs of spaces the removals may have produced.
		new_string = re.sub(r' +', ' ', new_string)
		return new_string if new_string != string else False


	# NOTE(review): this looks like a second, whitespace-mangled copy of the
	# script section earlier in the file -- consider deleting the duplicate.
	# Rewrite trans.csv into out.csv, removing repeated text from the second
	# column (the description) of every row.
	newlines = []
	with open('trans.csv', 'r') as infile:
		# Iterating the file (rather than splitting on '\n') avoids a phantom
		# empty row after a trailing newline.
		for raw_line in infile:
			line = raw_line.rstrip('\n')
			# NOTE(review): plain comma splitting does not understand quoted
			# fields; switch to the csv module if descriptions can contain commas.
			cells = line.split(',')
			if len(cells) < 2:
				# Blank lines and rows without a description pass through as-is.
				newlines.append(line)
				continue
			description = cells[1]
			dup = longestDuplicateNonOverlappingSubstring(description)
			new_description = (
				removeLongestDuplicateSubstringAfterFirstInstance(description, dup)
				if dup else False
			)
			if new_description:
				cells[1] = new_description
				newlines.append(','.join(cells))
			else:
				# Nothing to remove: keep the row exactly as it was read.
				newlines.append(line)

	with open('out.csv', 'w') as outfile:
		outfile.writelines(line + '\n' for line in newlines)