This is my solution to the Codecademy Pro project "Censor Dispenser", which uses Python to search and manipulate text documents. It's probably not the most elegant solution, but it really helped me learn more about catching variations in searches, for example capitalised and punctuated words/phrases.
# These are the emails you will be censoring. The open() function is opening the text file that the emails are contained in and the .read() method is allowing us to save their contents to the following variables:
email_one = open("email_one.txt", "r").read()
email_two = open("email_two.txt", "r").read()
email_three = open("email_three.txt", "r").read()
email_four = open("email_four.txt", "r").read()

#list of terms provided in challenge 2
proprietary_terms = ["she", "personality matrix", "sense of self", "self-preservation", "learning algorithm", "her", "herself"]

#list of terms provided in challenge 3. I added 'distressing'
negative_words = ["concerned", "behind", "danger", "dangerous", "alarming", "alarmed", "out of control", "help", "unhappy", "bad", "upset", "awful", "broken", "damage", "damaging", "dismal", "distressed", "concerning", "horrible", "horribly", "questionable", "distressing"]

#create a large list for the final challenge
biglist = proprietary_terms + negative_words

#create a list of common punctuation that appears after a word
end = ['.', '?', '!', ')', ';', ':', ',', ' ']

#this simple function takes in a phrase and replaces it with X's of the same length
def censor(phrase, text):
    text = text.replace(phrase, 'X'*len(phrase))
    #check also for instances where the phrase starts a sentence (capitalised)
    title = phrase[0].upper() + phrase[1:]
    text = text.replace(title, 'X'*len(phrase))
    return text

#print(censor('learning algorithms', email_one))

#take a list of words/phrases and censor them all from a document
def censorlist(phraselist, text):
    for phrase in phraselist:
        #handle cases of how a word/phrase would appear if NOT punctuated
        #the trailing space also avoids 'herself' becoming 'XXXself'
        middle = phrase + ' '
        title = middle.title()
        firstword = middle[0].upper() + middle[1:]
        text = text.replace(middle, 'X'*(len(middle)-1) + ' ')
        text = text.replace(title, 'X'*(len(title)-1) + ' ')
        text = text.replace(firstword, 'X'*(len(firstword)-1) + ' ')
        #handle the case where the whole text is a searched phrase in isolation
        if len(phrase) == len(text):
            text = text.replace(phrase, 'X'*len(phrase))
        #check for punctuated cases and keep the same punctuation in the censored output
        for punc in end:
            punctuated = phrase + punc
            text = text.replace(punctuated, 'X'*len(phrase) + punc)
    return text

#print(email_two)
#print(censorlist(proprietary_terms, email_two))
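As a quick sanity check of the two helpers above, here is what censorlist() produces on a made-up sample sentence (not one of the Codecademy emails):

sample = "She relies on her learning algorithm. Her sense of self is growing."
print(censorlist(proprietary_terms, sample))
#XXX relies on XXX XXXXXXXXXXXXXXXXXX. XXX XXXXXXXXXXXXX is growing.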
#take a list of negative words and censor them after ANY TWO occurrences
#ALSO censor everything from a phraselist
def positive(negwords, phraselist, text):
    #split the document into individual words
    split = text.split(' ')
    titlelist = []
    punclist = []
    #create an expanded list which includes capitalised negative words
    for i in negwords:
        titlelist.append(i)
        titlelist.append(i.title())
    #expand the list further to create punctuated words to search for
    for i in titlelist:
        punclist.append(i)
        #also create cases where a negword begins a new line
        for j in end:
            punclist.append(i + j)
            punclist.append('\n\n' + i + j)
    count = 0
    #check each word in split to see if it is a negword
    for i in range(len(split)):
        for j in punclist:
            if split[i] == j:
                count += 1
                #only start censoring once two negwords have already been seen
                if count < 3:
                    continue
                else:
                    split[i] = 'XXXXXX'
    #added the following to catch any multi-word phrase from negwords ('out of control')
    #However, it only works after two other negwords have been found
    #Handle cases where fewer than 3 negwords have been found
    try:
        splitter = split.index('XXXXXX')
    except ValueError:
        splitter = 0
    toclean = split[splitter:]
    partform = ' '.join(split[:splitter])
    toclean = ' '.join(toclean)
    #use censorlist() to catch any phrases in negwords, but only operate on the section
    #AFTER 3 negwords have been found
    toclean = censorlist(negwords, toclean)
    #rebuild the document
    reform = partform + ' ' + toclean
    reform = censorlist(phraselist, reform)
    return reform

#print(positive(negative_words, proprietary_terms, email_three))

#Final challenge: handle punctuation, case and preserve length
#Censor ALL negative words and ALL instances of the defined phrases
#AND censor the words before and after a negword/defined phrase
#For this challenge, negative words and defined phrases are combined into biglist above
def bigcensor(phraselist, text):
    split = text.split(' ')
    titlelist = []
    punclist = []
    filtered = []
    #as before, create an expanded list to include punctuation and casing
    for i in phraselist:
        titlelist.append(i)
        titlelist.append(i.title())
        titlelist.append(i.upper())
    for j in titlelist:
        punclist.append(j)
        for k in end:
            punclist.append(j + k)
            punclist.append('\n\n' + j + k)
    #build the new censored document as we work through the split
    for l in range(len(split)):
        filtered.append(split[l])
        for m in punclist:
            #handle extraction and censoring of the words before and after a match
            if split[l] == m:
                current = filtered.pop()
                #censor the word before, guarding against a match on the very first word
                if filtered:
                    before = filtered.pop()
                    filtered.append('X' * len(before))
                filtered.append('X' * len(current))
                #censor the word after, guarding against a match on the very last word
                if l + 1 < len(split):
                    split[l + 1] = 'X' * len(split[l + 1])
                #no need to keep checking punclist once this word is handled
                break
    #reassemble the document
    reform = ' '.join(filtered)
    reform = censorlist(phraselist, reform)
    #do a final pass to deal with matches next to \n
    #This does not filter the word before/after, unfortunately
    for q in punclist:
        reform = censor(q, reform)
    return reform

#print(email_four)
#print(bigcensor(biglist, email_four))
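Looking back, most of the hand-built variant lists above (casing, punctuation, word boundaries) could be matched in one step with the re module. This is just a sketch of that idea, not part of my submitted solution:

import re

def censor_re(phraselist, text):
    #sort longest-first so multi-word phrases are tried before their sub-words
    phrases = sorted(phraselist, key=len, reverse=True)
    pattern = r'\b(' + '|'.join(re.escape(p) for p in phrases) + r')\b'
    #replace every match, whatever its casing, with X's of the same length
    return re.sub(pattern, lambda m: 'X' * len(m.group()), text, flags=re.IGNORECASE)

#print(censor_re(proprietary_terms, email_two))

The \b word boundaries make the trailing-space trick unnecessary ('her' no longer matches inside 'herself'), and re.IGNORECASE covers the title-case and upper-case variants in one go.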