Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
I use this to parse badwords files from several sources
"""
badwords source: https://github.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en
badwords source 2: http://urbanoalvarez.es/blog/2008/04/04/bad-words-list/
"""
def normalize_words(lines):
    """Return the cleaned, de-duplicated words found in *lines*.

    Each line is processed as follows:

    1. Leading and trailing whitespace (including the newline) is stripped.
    2. Spaces and hyphens inside the entry are removed, so multi-word
       entries collapse into a single token.
    3. The entry is dropped unless what remains is purely alphabetic; this
       discards blank lines and anything containing digits or punctuation.
    4. The survivor is lower-cased.

    Only the first occurrence of each resulting word is kept, in input
    order.

    Args:
        lines: An iterable of strings, e.g. an open text file.

    Returns:
        A list of unique lower-case alphabetic words.
    """
    # A new list is built instead of calling ``lines.remove()`` while
    # iterating: removing during iteration skips the element that follows
    # each removal, so a single pass would leave unwanted entries behind.
    seen = set()  # O(1) membership test for de-duplication
    words = []
    for line in lines:
        word = line.strip().replace(" ", "").replace("-", "")
        if not word.isalpha():
            continue
        word = word.lower()
        if word not in seen:
            seen.add(word)
            words.append(word)
    return words


def main(source="badwords.txt", target="badwords2.txt"):
    """Clean the word list in *source* and write it to *target*.

    Args:
        source: Path of the input file, one entry per line.
        target: Path of the output file; it is overwritten and receives
            one cleaned word per line.
    """
    # Read everything before opening the output so the input handle is
    # closed (even on error) before any writing starts.
    with open(source) as infile:
        words = normalize_words(infile)
    with open(target, "w") as outfile:
        outfile.writelines(word + "\n" for word in words)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment