Skip to content

Instantly share code, notes, and snippets.

@paultopia
Last active October 15, 2015 19:45
Show Gist options
  • Save paultopia/4dae0e73e22370a3f45a to your computer and use it in GitHub Desktop.
Save paultopia/4dae0e73e22370a3f45a to your computer and use it in GitHub Desktop.
Quick and dirty stopword eliminator for word clouds. Copy and paste a Word document into a text file. Call the script from the command line with the text file as its argument. Open the new text file (written as new-<name of the input file>). Copy and paste its contents into a word cloud maker like Wordle or https://www.jasondavies.com/wordcloud/ . Tweak settings. Profit.
import os
import string
import sys

import nltk
# Stopword eliminator for word clouds.
#
# Usage: python <this script> TEXTFILE
#
# Reads TEXTFILE, deletes punctuation, digits and non-printable/non-ASCII
# characters, drops stopwords and words shorter than three characters,
# lowercases what is left, folds simple "-s" plurals into their singular,
# and writes the result (space separated) to "new-<TEXTFILE name>" in the
# same directory as the input.

# Filler words dropped in addition to NLTK's English stopword list.
EXTRA_STOPWORDS = (
    'this', 'the', 'chapter', 'chapters', 'also', 'however', 'even', 'since',
    'therefore', 'thus', 'moreover', 'whether', 'first', 'second', 'third',
    'often', 'within', 'rather', 'made', 'argue', 'still', 'used', 'much',
    'many', 'some', 'think', 'other', 'others', 'must', 'claim', 'may',
    'idea', 'well', 'likely',
)

# Spelled-out numbers, also treated as stopwords.  The digit strings
# '0'..'9' are not needed: digits are deleted from the text before it is
# split into words.
CHAPTER_NUMBERS = (
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
)

# Characters that survive cleaning: printable ASCII minus punctuation and
# digits (letters and whitespace).
_KEEP_CHARS = (
    frozenset(string.printable)
    - frozenset(string.punctuation)
    - frozenset(string.digits)
)


def strip_noise(text):
    """Return *text* without punctuation, digits or non-printable characters.

    Characters are deleted, not replaced by spaces, so "don't" becomes
    "dont" and "well-known" becomes "wellknown".
    """
    return ''.join(ch for ch in text if ch in _KEEP_CHARS)


def build_stopwords(base_words):
    """Return the set of lowercase stopwords used for filtering.

    Combines *base_words* (e.g. NLTK's English list) with EXTRA_STOPWORDS
    and CHAPTER_NUMBERS.  Each word is also added in its punctuation-free
    form, because the text is stripped of punctuation before comparison and
    a contraction such as "don't" would otherwise never match "dont".
    """
    stop = set()
    for word in list(base_words) + list(EXTRA_STOPWORDS) + list(CHAPTER_NUMBERS):
        lowered = word.lower()
        stop.add(lowered)
        stop.add(strip_noise(lowered))
    return stop


def filter_words(text, stopwords):
    """Return the lowercase words of *text* that are worth keeping.

    The text is cleaned with strip_noise() and lowercased *before* the
    stopword test, so the test is case-insensitive ("THE" is dropped just
    like "the").  Words of one or two characters are dropped as well.
    *stopwords* must contain lowercase entries.
    """
    tokens = strip_noise(text).lower().split()
    return [tok for tok in tokens if len(tok) > 2 and tok not in stopwords]


def fold_plurals(words):
    """Replace simple plurals with their singular form.

    A word is treated as a plural when it ends in "s" and the same word
    without that "s" also occurs in *words*.  Each word is rewritten at most
    once, in a single pass over a set-based lookup, so the result does not
    depend on the order in which words appear.
    """
    vocabulary = set(words)
    plurals = {w for w in vocabulary if w.endswith('s') and w[:-1] in vocabulary}
    return [w[:-1] if w in plurals else w for w in words]


def output_path(input_path):
    """Return the output file path: "new-" prefixed to the input's file name.

    The prefix is applied to the base name only, so an input such as
    "docs/notes.txt" maps to "docs/new-notes.txt" rather than to a path in
    a non-existent "new-docs" directory.
    """
    directory, filename = os.path.split(input_path)
    return os.path.join(directory, 'new-' + filename)


def main(argv=None):
    """Run the stopword eliminator on the file named by the first argument.

    *argv* defaults to the command-line arguments (without the script name).
    Exits with a usage message when no file is given.  Requires NLTK's
    "stopwords" corpus to be installed (nltk.download('stopwords')).
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit('usage: %s TEXTFILE' % sys.argv[0])
    source = args[0]

    # Imported here so the helpers above stay importable without NLTK.
    from nltk.corpus import stopwords

    # NOTE(review): assumes the pasted text is UTF-8 or another ASCII
    # superset.  Undecodable bytes are ignored; every non-ASCII character is
    # deleted by strip_noise() anyway.
    with open(source, encoding='utf-8', errors='ignore') as infile:
        text = infile.read()

    stop = build_stopwords(stopwords.words('english'))
    words = fold_plurals(filter_words(text, stop))

    with open(output_path(source), 'w', encoding='utf-8') as outfile:
        outfile.write(' '.join(words))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment