Skip to content

Instantly share code, notes, and snippets.

@racheltwu
Last active December 24, 2015 00:40
Show Gist options
  • Save racheltwu/6718367 to your computer and use it in GitHub Desktop.
Save racheltwu/6718367 to your computer and use it in GitHub Desktop.
import re
def longestDuplicateNonOverlappingSubstring(string, minimum_length_of_substring=4):
    """Return the longest substring that occurs twice without overlapping.

    A substring qualifies when a second copy of it starts at or after the
    end of its first occurrence in ``string``.  For example, in ``'banana'``
    the substring ``'ana'`` occurs twice but the copies overlap, so the
    answer (with a minimum length of 2) is ``'na'``.

    Args:
        string: The text to search.
        minimum_length_of_substring: Shortest substring length to consider.
            Values below 1 are treated as 1.

    Returns:
        The longest qualifying substring, or ``False`` when there is none.
        When several substrings of the same maximal length qualify, the one
        whose last occurrence starts furthest to the right is returned.
    """
    # Two non-overlapping copies can never be longer than half the string,
    # so there is no point in examining anything longer.
    longest_possible = len(string) // 2
    shortest_allowed = max(minimum_length_of_substring, 1)

    # Try lengths from longest to shortest so the first hit is the answer.
    for length in range(longest_possible, shortest_allowed - 1, -1):
        first_start = {}  # substring -> index of its first occurrence
        winner = -1       # right-most start of a non-overlapping repeat
        for start in range(len(string) - length + 1):
            candidate = string[start:start + length]
            earliest = first_start.setdefault(candidate, start)
            # The copy at ``start`` is usable only if it begins at or after
            # the end of the first occurrence.
            if earliest + length <= start:
                winner = start
        if winner >= 0:
            return string[winner:winner + length]
    return False
def removeLongestDuplicateSubstringAfterFirstInstance(string, dup):
    """Blank out every copy of ``dup`` that follows its first occurrence.

    The text up to and including the first occurrence of ``dup`` is kept
    as-is.  In the remainder, each occurrence of ``dup`` is replaced by a
    single space, and then every run of spaces in the whole result is
    collapsed to one space.

    Args:
        string: The text to clean up.
        dup: The literal substring whose later copies should be removed.

    Returns:
        The cleaned string, or ``False`` when nothing was removed: ``dup``
        is empty, does not occur in ``string``, has no further copy after
        its first occurrence (e.g. the copies overlap, as with ``'ana'`` in
        ``'banana'``), or the result is identical to ``string``.
    """
    first_at = string.find(dup)
    # ``find`` returns -1 for a missing substring; slicing with that value
    # would split the string at a meaningless position.
    if not dup or first_at < 0:
        return False

    # Split string after first instance of dup in string
    split_at = first_at + len(dup)
    first, rest = string[:split_at], string[split_at:]

    # Without a later copy there is nothing to remove.  Bail out before the
    # space-collapsing step, which could otherwise alter the string on its
    # own and make it look as if a duplicate had been removed.
    if dup not in rest:
        return False

    # Remove all instances of dup after first instance
    new_string = first + rest.replace(dup, ' ')
    new_string = re.sub(r' +', ' ', new_string)
    return new_string if new_string != string else False
# Read trans.csv, strip repeated text from the second column of every row,
# and write the result to out.csv.
newlines = []
with open('trans.csv', 'r') as infile:
    lines = infile.read().split('\n')

# A file that ends with a newline produces one trailing empty string here;
# drop it so it is not treated as a row (it has no second column) and does
# not add an extra blank line to the output.
if lines and lines[-1] == '':
    lines.pop()

for line in lines:
    # NOTE(review): plain comma splitting does not understand quoted fields
    # that contain commas; confirm the input never has them.
    cells = line.split(',')
    if len(cells) < 2:
        # No second column (blank line or single-cell row): pass through.
        newlines.append(line)
        continue
    description = cells[1]
    dup = longestDuplicateNonOverlappingSubstring(description)
    # Only rewrite the row when a duplicate was found AND removing it
    # actually produced a new description string.
    new_description = dup and removeLongestDuplicateSubstringAfterFirstInstance(description, dup)
    if new_description:
        cells[1] = new_description
        newlines.append(','.join(cells))
    else:
        newlines.append(line)

with open('out.csv', 'w') as outfile:
    for line in newlines:
        outfile.write(line + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment