-
-
Save codecademydev/77da1523c1031ea0e5f67f537caa7941 to your computer and use it in GitHub Desktop.
Codecademy export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _read_text(path):
    """Return the entire contents of the text file at *path*.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as the contents have been read, even if reading raises.
    """
    with open(path, "r") as handle:
        return handle.read()


# These are the emails to be censored. Each file is read completely into
# memory and its contents are stored in the corresponding variable.
email_one = _read_text("email_one.txt")
email_two = _read_text("email_two.txt")
email_three = _read_text("email_three.txt")
email_four = _read_text("email_four.txt")

# Text that takes the place of every censored word or phrase.
redact_replacer = "[REDACTED]"

# Terms that must always be censored.
proprietary_terms = ["she", "personality matrix", "sense of self", "self-preservation", "learning algorithm", "her", "herself"]

# Words with a negative tone.
negative_words = ["concerned", "behind", "dangerous", "danger", "alarming", "alarmed", "out of control", "help", "unhappy", "bad", "upset", "awful", "broken", "damaging", "damage", "dismal", "distressed", "distressing", "concerning", "horrible", "horribly", "questionable"]
# Note that the order of negative_words has been changed so that any term that is a substring of
# another term comes after the longer term. For example, "danger" is part of "dangerous", so
# "dangerous" needs to come first or you'll end up with "[REDACTED]ous" in the output.
# Terms are matched as whole words/phrases only, so a short term such as
# "her" does not clip longer words like "there" into "t[REDACTED]e".
def redact_proprietary(document, proprietary_list, redacted_word):
    """Replace every proprietary term in *document* with *redacted_word*.

    Matching is case-insensitive and restricted to whole words/phrases: a
    term only matches when it is not directly preceded or followed by a
    word character. When several terms could match at the same position
    the longest one wins (e.g. "herself" rather than "her").

    Args:
        document: The text to censor.
        proprietary_list: Iterable of words/phrases to censor. Empty
            strings are ignored.
        redacted_word: The text inserted in place of each match.

    Returns:
        A new string with every matched term replaced. *document* is
        returned unchanged when there is nothing to search for.
    """
    import re  # local import: the file has no top-level import block

    unique_terms = {term.lower() for term in proprietary_list if term}
    if not unique_terms:
        # An empty alternation would match the empty string everywhere.
        return document

    # Longest first so that a term is preferred over its own prefix;
    # the secondary sort key only makes the pattern deterministic.
    ordered_terms = sorted(unique_terms, key=lambda term: (-len(term), term))
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(map(re.escape, ordered_terms)) + r")(?!\w)",
        re.IGNORECASE,
    )

    # A callable replacement keeps any backslashes in redacted_word literal.
    return pattern.sub(lambda _match: redacted_word, document)
# print(redact_proprietary(document, proprietary_terms, redact_replacer)) | |
# The first occurrences of negative words are left readable; only the
# occurrences after the allowed number are censored.
def redact_negative(document, negatives_list, redacted_word, keep_first=2):
    """Censor negative words once enough of them have already appeared.

    The document is scanned left to right. The first *keep_first* matches
    of any term from *negatives_list* are left untouched; every later
    match is replaced with *redacted_word*. Matching is case-insensitive
    and restricted to whole words/phrases, and when several terms could
    match at the same position the longest one wins, so "dangerous" is
    counted once rather than also being counted as "danger".

    Args:
        document: The text to censor.
        negatives_list: Iterable of negative words/phrases. Empty strings
            are ignored.
        redacted_word: The text inserted in place of each censored match.
        keep_first: How many matches to leave intact before censoring
            starts. Defaults to 2.

    Returns:
        A new string with the surplus negative terms replaced. *document*
        is returned unchanged when there is nothing to search for.
    """
    import re  # local import: the file has no top-level import block

    unique_terms = {term.lower() for term in negatives_list if term}
    if not unique_terms:
        # An empty alternation would match the empty string everywhere.
        return document

    # Longest first so that a term is preferred over its own prefix;
    # the secondary sort key only makes the pattern deterministic.
    ordered_terms = sorted(unique_terms, key=lambda term: (-len(term), term))
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(map(re.escape, ordered_terms)) + r")(?!\w)",
        re.IGNORECASE,
    )

    matches_seen = 0

    def _keep_or_redact(match):
        # re.sub calls this once per match in document order, so the
        # counter reflects how many negative terms came before this one.
        nonlocal matches_seen
        matches_seen += 1
        if matches_seen <= keep_first:
            return match.group(0)
        return redacted_word

    return pattern.sub(_keep_or_redact, document)
# print(redact_negative(redact_proprietary(email_three, proprietary_terms, redact_replacer), negative_words, redact_replacer)) | |
# Works on character spans of the untouched text, so multi-word phrases
# such as "personality matrix", punctuation and line breaks are all handled.
def redact_all(document, proprietary_list, negatives_list, redact_replacer):
    """Censor every listed term together with the word on either side of it.

    Terms from both lists are matched case-insensitively as whole
    words/phrases (the longest term wins when several could match at the
    same position). For each match, the match itself, the nearest word
    before it and the nearest word after it are each replaced with
    *redact_replacer*. A match at the very start or end of the document
    simply has no neighbour on that side. All whitespace and punctuation
    outside the replaced spans, including line breaks, is preserved.

    A "word" is a run of word characters, optionally joined by single
    hyphens or apostrophes (e.g. "self-preservation", "don't").

    Args:
        document: The text to censor.
        proprietary_list: Sequence of proprietary words/phrases.
        negatives_list: Sequence of negative words/phrases.
        redact_replacer: The text inserted in place of each censored span.

    Returns:
        A new string with the terms and their neighbouring words replaced.
        *document* is returned unchanged when both lists are empty.
    """
    import re  # local imports: the file has no top-level import block
    from bisect import bisect_left, bisect_right

    unique_terms = {
        term.lower() for term in [*proprietary_list, *negatives_list] if term
    }
    if not unique_terms:
        # An empty alternation would match the empty string everywhere.
        return document

    # Longest first so that a term is preferred over its own prefix;
    # the secondary sort key only makes the pattern deterministic.
    ordered_terms = sorted(unique_terms, key=lambda term: (-len(term), term))
    term_pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(map(re.escape, ordered_terms)) + r")(?!\w)",
        re.IGNORECASE,
    )

    # (start, end) offsets of every word, in document order.
    word_spans = [found.span() for found in re.finditer(r"\w+(?:[-']\w+)*", document)]
    word_starts = [start for start, _ in word_spans]
    word_ends = [end for _, end in word_spans]

    spans_to_redact = set()
    for hit in term_pattern.finditer(document):
        hit_start, hit_end = hit.span()
        spans_to_redact.add((hit_start, hit_end))

        # Last word that ends at or before the start of the match.
        before_index = bisect_right(word_ends, hit_start) - 1
        if before_index >= 0:
            spans_to_redact.add(word_spans[before_index])

        # First word that starts at or after the end of the match.
        after_index = bisect_left(word_starts, hit_end)
        if after_index < len(word_spans):
            spans_to_redact.add(word_spans[after_index])

    pieces = []
    cursor = 0
    for start, end in sorted(spans_to_redact):
        if start < cursor:
            # Overlaps a span that was already replaced; fold it into that
            # replacement instead of emitting a second marker.
            cursor = max(cursor, end)
            continue
        pieces.append(document[cursor:start])
        pieces.append(redact_replacer)
        cursor = end
    pieces.append(document[cursor:])
    return "".join(pieces)
# Show the fourth email as it was read from disk ...
print(email_four)
# ... followed by the same email with every proprietary term, every negative
# word and the words next to them censored.
print(
    redact_all(
        document=email_four,
        proprietary_list=proprietary_terms,
        negatives_list=negative_words,
        redact_replacer=redact_replacer,
    )
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment