Last active
June 10, 2020 09:21
-
-
Save MrN00b0t/7f6838655d0963c1c15f168f80dc1bf8 to your computer and use it in GitHub Desktop.
Codecademy: Censor Dispenser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# These are the emails you will be censoring. A small helper opens each text
# file, reads its contents, and closes the handle (open().read() alone would
# leak the file object).
def _read_email(path):
    """Return the full contents of the text file at *path*."""
    with open(path, "r") as handle:
        return handle.read()

email_one = _read_email("email_one.txt")
email_two = _read_email("email_two.txt")
email_three = _read_email("email_three.txt")
email_four = _read_email("email_four.txt")
# list of terms provided in challenge 2
proprietary_terms = ["she", "personality matrix", "sense of self", "self-preservation", "learning algorithm", "her", "herself"]
# list of terms provided in challenge three. I added 'distressing'
# NOTE(review): "distressed" appears twice — likely a typo; the duplicate makes
# positive() count each occurrence twice. Left as-is to preserve behaviour.
negative_words = ["concerned", "behind", "danger", "dangerous", "alarming", "alarmed", "out of control", "help", "unhappy", "bad", "upset", "awful", "broken", "damage", "damaging", "dismal", "distressed", "distressed", "concerning", "horrible", "horribly", "questionable", 'distressing']
# create a large list for the final challenge
biglist = proprietary_terms + negative_words
# create a list of common punctuation that appears after a word
end = ['.', '?', '!', ')', ';', ':', ',', ' ']
#this simple function takes in a phrase and replaces it with censored | |
def censor(phrase, text):
    """Replace every occurrence of *phrase* in *text* with a same-length
    run of 'X'.

    Also censors the sentence-initial form (first letter capitalised),
    e.g. 'she' -> 'She'. Returns the censored text.
    """
    # Guard: an empty phrase has nothing to censor and would otherwise
    # raise IndexError on phrase[0] below.
    if not phrase:
        return text
    mask = 'X' * len(phrase)
    text = text.replace(phrase, mask)
    # check also for instances where phrase starts a sentence (capitalised)
    capitalised = phrase[0].upper() + phrase[1:]
    text = text.replace(capitalised, mask)
    return text
#print(censor('learning algorithms', email_one)) | |
#take a list of words/phrases and censor them from document | |
def censorlist(phraselist, text, punctuation=('.', '?', '!', ')', ';', ':', ',', ' ')):
    """Censor every word/phrase in *phraselist* from *text*.

    Each match is replaced by a same-length run of 'X'; trailing punctuation
    is preserved. Matching requires the phrase to be followed by a space or
    punctuation so that e.g. 'her' inside 'herself' is left alone.

    :param phraselist: words/phrases to censor
    :param text: document to censor
    :param punctuation: characters accepted after a phrase (defaults to the
        module-level ``end`` list's contents, kept here so the function is
        self-contained)
    :return: the censored text
    """
    for phrase in phraselist:
        if not phrase:
            continue  # nothing to censor; also protects phrase[0] below
        mask = 'X' * len(phrase)
        capitalised = phrase[0].upper() + phrase[1:]
        # phrase followed by a space — avoids 'herself' becoming 'XXXself'
        text = text.replace(phrase + ' ', mask + ' ')
        text = text.replace((phrase + ' ').title(), mask + ' ')
        text = text.replace(capitalised + ' ', mask + ' ')
        # handle case where the whole text is exactly the searched phrase
        # (fixed: original compared only the lengths, not the contents)
        if text == phrase:
            text = mask
        # punctuated cases: censor the phrase but keep the punctuation mark
        # (fixed: original masked len(phrase + punc) characters AND appended
        # the punctuation again, growing the text by one char per match)
        for punc in punctuation:
            text = text.replace(phrase + punc, mask + punc)
            text = text.replace(capitalised + punc, mask + punc)
    return text
#print(email_two) | |
#print(censorlist(proprietary_terms, email_two)) | |
#take a list of negative words and censor after ANY TWO occurrences | |
#ALSO censor everything from a phraselist | |
def positive(negwords, phraselist, text):
    """Censor negative words after the first two occurrences, and censor
    every phrase in *phraselist* throughout.

    :param negwords: negative words — occurrences beyond the second are
        replaced with 'XXXXXX'
    :param phraselist: phrases censored everywhere via censorlist()
    :param text: document to censor
    :return: the rebuilt, censored document
    """
    # split the document into individual words
    split = text.split(' ')
    titlelist = []
    punclist = []
    # create expanded list which includes capitalised negative words
    for i in negwords:
        titlelist.append(i)
        titlelist.append(i.title())
    # expand list further to create punctuated words to search
    for i in titlelist:
        punclist.append(i)
        # also create cases where negword begins a new paragraph
        for j in end:
            punclist.append(i + j)
            punclist.append('\n\n' + i + j)
    count = 0
    # check each word in split to see if it is a negword
    for i in range(len(split)):
        for j in punclist:
            if split[i] == j:
                count += 1
                # leave the first two detected negwords uncensored
                if count < 3:
                    continue
                else:
                    split[i] = 'XXXXXX'
    # The following catches multi-word negwords ('out of control'), but only
    # in the section AFTER the third single-word negword was found.
    # Handle cases where fewer than 3 negwords have been found.
    try:
        splitter = split.index('XXXXXX')
    except ValueError:
        # no word was censored above — clean the whole document
        # (fixed: was a bare except, which would hide unrelated errors)
        splitter = 0
    toclean = split[splitter:]
    partform = ' '.join(split[:splitter])
    toclean = ' '.join(toclean)
    # use censorlist() to catch any phrases in negwords in that section
    toclean = censorlist(negwords, toclean)
    # rebuild the document and censor all proprietary phrases everywhere
    reform = partform + ' ' + toclean
    reform = censorlist(phraselist, reform)
    return reform
#print(positive(negative_words, proprietary_terms, email_three)) | |
#Final challenge, handle punctuatio, case and preserve length | |
#Censor ALL negative words and ALL instances of defined phrases | |
#AND censor all words before and after a negword/defined phrase | |
#For this challenge, negative words and defined phrases are combined into biglist above | |
def bigcensor(phraselist, text):
    """Censor ALL phrases in *phraselist* (any casing, with punctuation,
    preserving length) AND the word immediately before and after each match.

    :param phraselist: words/phrases to censor (e.g. the combined biglist)
    :param text: document to censor
    :return: the censored document
    """
    split = text.split(' ')
    titlelist = []
    punclist = []
    filtered = []
    # as before, create an expanded list to include punctuation and casing
    for i in phraselist:
        titlelist.append(i)
        titlelist.append(i.title())
        titlelist.append(i.upper())
    for j in titlelist:
        punclist.append(j)
        for k in end:
            punclist.append(j + k)
            punclist.append('\n\n' + j + k)
    # build the censored document as we work through the split
    for l in range(len(split)):
        filtered.append(split[l])
        for m in punclist:
            # extract and censor the matched word plus its neighbours
            if split[l] == m:
                current = filtered.pop()
                # guard: a match on the very first word has no predecessor
                # (fixed: unconditional pop raised IndexError here)
                if filtered:
                    before = filtered.pop()
                    filtered.append('X' * len(before))
                filtered.append('X' * len(current))
                # guard: a match on the very last word has no successor
                # (fixed: unconditional split[l + 1] raised IndexError here)
                if l + 1 < len(split):
                    split[l + 1] = 'X' * len(split[l + 1])
    # reassemble the document
    reform = ' '.join(filtered)
    reform = censorlist(phraselist, reform)
    # do a final pass to deal with matches adjacent to '\n'
    # This does not filter the word before/after, unfortunately
    for q in punclist:
        reform = censor(q, reform)
    return reform
#print(email_four) | |
#print(bigcensor(biglist, email_four)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment