Skip to content

Instantly share code, notes, and snippets.

@LyndonArmitage
Created July 28, 2016 18:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LyndonArmitage/c0c716e824a99a534eb542d4c1b4fdd2 to your computer and use it in GitHub Desktop.
Save LyndonArmitage/c0c716e824a99a534eb542d4c1b4fdd2 to your computer and use it in GitHub Desktop.
import java.io.{BufferedWriter, File, FileWriter}
import scala.collection.mutable
/** Builds a de-duplicated word list from one or more UTF-8 text files.
  *
  * Each input line is lower-cased, stripped of ASCII punctuation and split on
  * spaces/tabs; the distinct words found across all inputs are written, one per
  * line, to `wordlist.txt` in the current working directory.
  */
object WordListGenerator {

  /** ASCII punctuation/symbol characters (codes 33-47, 58-64, 91-96 and 123-126)
    * that [[sanitizeLine]] replaces with a space. Built once instead of per call.
    */
  private val specialChars: Set[Char] =
    (Range.inclusive(33, 47) ++ Range.inclusive(58, 64) ++
      Range.inclusive(91, 96) ++ Range.inclusive(123, 126)).map(_.toChar).toSet

  /** Entry point.
    *
    * @param args paths of the text files to read (decoded as UTF-8); at least
    *             one path is required
    */
  def main(args: Array[String]): Unit = {
    assert(args.length > 0, "usage: WordListGenerator <file> [<file> ...]")

    val words = new mutable.HashSet[String]

    // Process the inputs one at a time and always release the file handle,
    // even when reading fails part-way through.
    args.foreach { path =>
      val source = scala.io.Source.fromFile(path, "UTF-8")
      try {
        source.getLines()
          .filterNot(_.isEmpty)
          .map(sanitizeLine)
          .filterNot(_.isEmpty)
          .foreach(line => words ++= getWords(line))
      } finally {
        source.close()
      }
    }

    println("Outputting word list")
    val outputFile = new File("wordlist.txt")
    // Write with the same encoding the inputs were read with, rather than the
    // platform default charset, so non-ASCII words survive the round trip.
    val writer: BufferedWriter = java.nio.file.Files.newBufferedWriter(
      outputFile.toPath,
      java.nio.charset.StandardCharsets.UTF_8
    )
    try {
      words.foreach { word =>
        writer.write(word)
        writer.newLine()
      }
    } finally {
      writer.close()
    }
    println("Finished writing word list")
  }

  /** Normalises a line of text for word extraction.
    *
    * The line is lower-cased (locale-independently), trimmed, and every ASCII
    * punctuation/symbol character is then replaced by a single space. Because
    * trimming happens before the replacement, the result may still start or end
    * with spaces when the line began or ended with punctuation.
    *
    * @param line the raw input line
    * @return the normalised line
    */
  def sanitizeLine(line: String): String =
    line
      .toLowerCase(java.util.Locale.ROOT)
      .trim
      .map(c => if (specialChars(c)) ' ' else c)

  /** Splits a line into its distinct, non-empty words.
    *
    * @param line text whose words are separated by spaces and/or tabs
    * @return the set of words in `line`; empty when the line has no words
    */
  def getWords(line: String): Set[String] = {
    val separators = Array(' ', '\t')
    line
      .split(separators)
      .map(_.trim)
      .filterNot(_.isEmpty)
      .toSet
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment