Skip to content

Instantly share code, notes, and snippets.

@codecademydev
Created April 9, 2020 12:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save codecademydev/77da1523c1031ea0e5f67f537caa7941 to your computer and use it in GitHub Desktop.
Save codecademydev/77da1523c1031ea0e5f67f537caa7941 to your computer and use it in GitHub Desktop.
Codecademy export
# These are the emails you will be censoring. Each text file is opened inside
# a `with` block so its handle is closed as soon as the contents have been
# read into the corresponding variable.
with open("email_one.txt", "r") as email_file:
    email_one = email_file.read()
with open("email_two.txt", "r") as email_file:
    email_two = email_file.read()
with open("email_three.txt", "r") as email_file:
    email_three = email_file.read()
with open("email_four.txt", "r") as email_file:
    email_four = email_file.read()

# Placeholder text substituted for every censored word or phrase.
redact_replacer = "[REDACTED]"

# Words and phrases passed to the redaction functions as the proprietary list.
proprietary_terms = ["she", "personality matrix", "sense of self", "self-preservation", "learning algorithm", "her", "herself"]

# Words and phrases passed to the redaction functions as the negatives list.
negative_words = ["concerned", "behind", "dangerous", "danger", "alarming", "alarmed", "out of control", "help", "unhappy", "bad", "upset", "awful", "broken", "damaging", "damage", "dismal", "distressed", "distressing", "concerning", "horrible", "horribly", "questionable"]
# Note that the order of negative words has been changed such that terms that are 'subset' of other
# terms come after the full term. For example "danger" is part of "dangerous", so "dangerous" needs
# to come first or you'll end up with "[REDACTED]ous" in the output.
# For my current skill level I'm really happy with this one. Word subsets like "her" are a little bit
# troublesome, as they result in some "[REDACTED]e"s etc.
def redact_proprietary(document, proprietary_list, redacted_word):
    """Return *document* with every listed term replaced by *redacted_word*.

    Matching is case-insensitive and restricted to whole words/phrases, so a
    term such as "her" is censored on its own but left alone inside "there".
    Longer terms are tried first, which keeps "herself" from being clipped to
    "<redacted>self" when "her" is also listed. Inflected forms (plurals,
    etc.) are only censored if they appear in the list themselves.

    Args:
        document: The text to censor.
        proprietary_list: Iterable of words or phrases to censor. Empty or
            whitespace-only entries are ignored.
        redacted_word: The replacement text, inserted literally.

    Returns:
        The censored text. The input is returned unchanged when there is
        nothing to search for.
    """
    import re

    # De-duplicate case-insensitively and order longest-first so the regex
    # alternation always prefers the most specific term at a given position.
    terms = sorted(
        {term.lower() for term in proprietary_list if term.strip()},
        key=lambda term: (-len(term), term),
    )
    if not terms:
        return document

    # Any run of whitespace inside a phrase may match any run of whitespace
    # in the document, so phrases wrapped across lines are still found.
    alternation = "|".join(
        r"\s+".join(re.escape(part) for part in term.split())
        for term in terms
    )
    pattern = re.compile(
        r"(?<!\w)(?:" + alternation + r")(?!\w)",
        re.IGNORECASE,
    )

    # A callable replacement keeps backslashes in redacted_word literal.
    return pattern.sub(lambda _match: redacted_word, document)
# print(redact_proprietary(document, proprietary_terms, redact_replacer))
# I think this handles pretty much all cases. Word subsets like "her" are a little bit troublesome,
# as they result in some "[REDACTED]e"s etc.
def redact_negative(document, negatives_list, redacted_word):
    """Censor negative terms in *document* after the first two occurrences.

    The document is scanned once from left to right. The first two matches of
    any term in *negatives_list* are left in place; every later match is
    replaced by *redacted_word*. Matching is case-insensitive and restricted
    to whole words/phrases, and overlapping terms (e.g. "danger" and
    "dangerous") count as a single occurrence because the longest term wins.

    Args:
        document: The text to censor.
        negatives_list: Iterable of words or phrases treated as negative.
            Empty or whitespace-only entries are ignored.
        redacted_word: The replacement text, inserted literally.

    Returns:
        The censored text. The input is returned unchanged when there is
        nothing to search for.
    """
    import re

    # Number of negative-term occurrences that are allowed to remain visible.
    allowed_occurrences = 2

    # Longest-first ordering makes "dangerous" win over "danger".
    terms = sorted(
        {term.lower() for term in negatives_list if term.strip()},
        key=lambda term: (-len(term), term),
    )
    if not terms:
        return document

    # Whitespace inside a phrase matches any run of whitespace in the text.
    alternation = "|".join(
        r"\s+".join(re.escape(part) for part in term.split())
        for term in terms
    )
    pattern = re.compile(
        r"(?<!\w)(?:" + alternation + r")(?!\w)",
        re.IGNORECASE,
    )

    occurrences_seen = 0

    def _keep_or_redact(match):
        # Called once per match in document order, so the counter reflects
        # how many negative terms precede (and include) this one.
        nonlocal occurrences_seen
        occurrences_seen += 1
        if occurrences_seen <= allowed_occurrences:
            return match.group(0)
        return redacted_word

    return pattern.sub(_keep_or_redact, document)
# print(redact_negative(redact_proprietary(email_three, proprietary_terms, redact_replacer), negative_words, redact_replacer))
# This somewhat works, but has problems with phrases such as "personality matrix", as .split()
# can't really handle them.
def redact_all(document, proprietary_list, negatives_list, redact_replacer):
    """Censor every listed term plus the word directly before and after it.

    The document is split into tokens with a single regular expression: a
    token is either a listed term (matched case-insensitively as a whole
    word/phrase, longest term first) or an ordinary word. Each term token and
    its immediate neighbouring tokens are replaced by *redact_replacer*; all
    text between tokens (spaces, line breaks, punctuation) is copied through
    untouched. A term at the very start or end of the document simply has no
    neighbour on that side.

    Args:
        document: The text to censor.
        proprietary_list: Iterable of proprietary words or phrases.
        negatives_list: Iterable of negative words or phrases.
        redact_replacer: The replacement text, inserted literally.

    Returns:
        The censored text. The input is returned unchanged when both lists
        are empty (ignoring blank entries).
    """
    import re

    # Merge both lists, de-duplicate case-insensitively, longest term first
    # so multi-word phrases beat any shorter term that starts the same way.
    terms = sorted(
        {
            term.lower()
            for term in [*proprietary_list, *negatives_list]
            if term.strip()
        },
        key=lambda term: (-len(term), term),
    )
    if not terms:
        return document

    # Whitespace inside a phrase matches any run of whitespace in the text,
    # so "personality matrix" is found even when wrapped across lines.
    alternation = "|".join(
        r"\s+".join(re.escape(part) for part in term.split())
        for term in terms
    )
    # The "term" branch is listed first so a term is preferred over a plain
    # word at the same position. Words may contain internal apostrophes so
    # contractions such as "don't" are treated as one neighbour.
    token_pattern = re.compile(
        r"(?P<term>(?<!\w)(?:" + alternation + r")(?!\w))"
        r"|(?P<word>\w+(?:['’]\w+)*)",
        re.IGNORECASE,
    )
    tokens = list(token_pattern.finditer(document))

    # Collect the indices of every term and of its two neighbours. Indices
    # that fall outside the token list are never visited below, which is
    # what prevents wrap-around at the start and overflow at the end.
    to_redact = set()
    for index, token in enumerate(tokens):
        if token.lastgroup == "term":
            to_redact.update((index - 1, index, index + 1))

    # Rebuild the text, swapping redacted tokens for the replacer and copying
    # everything else (including line breaks and punctuation) verbatim.
    pieces = []
    cursor = 0
    for index, token in enumerate(tokens):
        if index in to_redact:
            pieces.append(document[cursor:token.start()])
            pieces.append(redact_replacer)
            cursor = token.end()
    pieces.append(document[cursor:])
    return "".join(pieces)
# Show the fourth email as loaded, then the same email after full redaction
# (listed terms plus the word on either side of each).
print(email_four)
print(
    redact_all(
        document=email_four,
        proprietary_list=proprietary_terms,
        negatives_list=negative_words,
        redact_replacer=redact_replacer,
    )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment