This is my solution to the Codecademy Pro project "Censor Dispenser", which uses Python to search and manipulate text documents. It's probably not the most elegant solution, but it really helped me learn more about catching variations in searches, for example capitalised and punctuated words/phrases.
# These are the emails you will be censoring. The open() function is opening the text file that the emails are contained in and the .read() method is allowing us to save their contents to the following variables:
email_one = open("email_one.txt", "r").read()
email_two = open("email_two.txt", "r").read()
email_three = open("email_three.txt", "r").read()
email_four = open("email_four.txt", "r").read()

#list of terms provided in challenge 2
proprietary_terms = ["she", "personality matrix", "sense of self", "self-preservation", "learning algorithm", "her", "herself"]

#list of terms provided in challenge 3. I added 'distressing'
negative_words = ["concerned", "behind", "danger", "dangerous", "alarming", "alarmed", "out of control", "help", "unhappy", "bad", "upset", "awful", "broken", "damage", "damaging", "dismal", "distressed", "concerning", "horrible", "horribly", "questionable", "distressing"]

#create a large list for the final challenge
biglist = proprietary_terms + negative_words

#create a list of common punctuation that appears after a word
end = ['.', '?', '!', ')', ';', ':', ',', ' ']

#this simple function takes in a phrase and replaces it with X's of the same length
def censor(phrase, text):
    text = text.replace(phrase, 'X'*len(phrase))
    #check also for instances where the phrase starts a sentence (capitalised)
    title = phrase[0].upper() + phrase[1:]
    text = text.replace(title, 'X'*len(phrase))
    return text

#print(censor('learning algorithms', email_one))

#take a list of words/phrases and censor them all from a document
def censorlist(phraselist, text):
    for phrase in phraselist:
        #handle cases of how a word/phrase would appear if NOT punctuated
        #the trailing space also avoids 'herself' becoming 'XXXself'
        middle = phrase + ' '
        title = middle.title()
        firstword = middle[0].upper() + middle[1:]
        text = text.replace(middle, 'X'*(len(middle)-1) + ' ')
        text = text.replace(title, 'X'*(len(title)-1) + ' ')
        text = text.replace(firstword, 'X'*(len(firstword)-1) + ' ')
        #handle the case where the whole text is a searched phrase in isolation
        if len(phrase) == len(text):
            text = text.replace(phrase, 'X'*len(phrase))
        #check for punctuated cases and keep the same punctuation in the censored output
        for punc in end:
            punctuated = phrase + punc
            text = text.replace(punctuated, 'X'*len(phrase) + punc)
    return text

#print(email_two)
#print(censorlist(proprietary_terms, email_two))
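As a quick sanity check of the two helpers above, here is what censorlist() produces on a made-up sample sentence (not one of the Codecademy emails):

sample = "She relies on her learning algorithm. Her sense of self is growing."
print(censorlist(proprietary_terms, sample))
#XXX relies on XXX XXXXXXXXXXXXXXXXXX. XXX XXXXXXXXXXXXX is growing.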
#take a list of negative words and censor them after ANY TWO occurrences
#ALSO censor everything from a phraselist
def positive(negwords, phraselist, text):
    #split the document into individual words
    split = text.split(' ')
    titlelist = []
    punclist = []
    #create an expanded list which includes capitalised negative words
    for i in negwords:
        titlelist.append(i)
        titlelist.append(i.title())
    #expand the list further to create punctuated words to search for
    for i in titlelist:
        punclist.append(i)
        #also create cases where a negword begins a new line
        for j in end:
            punclist.append(i + j)
            punclist.append('\n\n' + i + j)
    count = 0
    #check each word in split to see if it is a negword
    for i in range(len(split)):
        for j in punclist:
            if split[i] == j:
                count += 1
                #only start censoring once two negwords have already been seen
                if count < 3:
                    continue
                else:
                    split[i] = 'XXXXXX'
    #added the following to catch any multi-word phrase from negwords ('out of control')
    #However, it only works after two other negwords have been found
    #Handle cases where fewer than 3 negwords have been found
    try:
        splitter = split.index('XXXXXX')
    except ValueError:
        splitter = 0
    toclean = split[splitter:]
    partform = ' '.join(split[:splitter])
    toclean = ' '.join(toclean)
    #use censorlist() to catch any phrases in negwords, but only operate on the section
    #AFTER 3 negwords have been found
    toclean = censorlist(negwords, toclean)
    #rebuild the document
    reform = partform + ' ' + toclean
    reform = censorlist(phraselist, reform)
    return reform

#print(positive(negative_words, proprietary_terms, email_three))

#Final challenge: handle punctuation, case and preserve length
#Censor ALL negative words and ALL instances of the defined phrases
#AND censor the words before and after a negword/defined phrase
#For this challenge, negative words and defined phrases are combined into biglist above
def bigcensor(phraselist, text):
    split = text.split(' ')
    titlelist = []
    punclist = []
    filtered = []
    #as before, create an expanded list to include punctuation and casing
    for i in phraselist:
        titlelist.append(i)
        titlelist.append(i.title())
        titlelist.append(i.upper())
    for j in titlelist:
        punclist.append(j)
        for k in end:
            punclist.append(j + k)
            punclist.append('\n\n' + j + k)
    #build the new censored document as we work through the split
    for l in range(len(split)):
        filtered.append(split[l])
        for m in punclist:
            #handle extraction and censoring of the words before and after a match
            if split[l] == m:
                current = filtered.pop()
                #censor the word before, guarding against a match on the very first word
                if filtered:
                    before = filtered.pop()
                    filtered.append('X' * len(before))
                filtered.append('X' * len(current))
                #censor the word after, guarding against a match on the very last word
                if l + 1 < len(split):
                    split[l + 1] = 'X' * len(split[l + 1])
                #no need to keep checking punclist once this word is handled
                break
    #reassemble the document
    reform = ' '.join(filtered)
    reform = censorlist(phraselist, reform)
    #do a final pass to deal with matches next to \n
    #This does not filter the word before/after, unfortunately
    for q in punclist:
        reform = censor(q, reform)
    return reform

#print(email_four)
#print(bigcensor(biglist, email_four))
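Looking back, most of the hand-built variant lists above (casing, punctuation, word boundaries) could be matched in one step with the re module. This is just a sketch of that idea, not part of my submitted solution:

import re

def censor_re(phraselist, text):
    #sort longest-first so multi-word phrases are tried before their sub-words
    phrases = sorted(phraselist, key=len, reverse=True)
    pattern = r'\b(' + '|'.join(re.escape(p) for p in phrases) + r')\b'
    #replace every match, whatever its casing, with X's of the same length
    return re.sub(pattern, lambda m: 'X' * len(m.group()), text, flags=re.IGNORECASE)

#print(censor_re(proprietary_terms, email_two))

The \b word boundaries make the trailing-space trick unnecessary ('her' no longer matches inside 'herself'), and re.IGNORECASE covers the title-case and upper-case variants in one go.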