-
-
Save LyndonArmitage/c0c716e824a99a534eb542d4c1b4fdd2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.{BufferedWriter, File, FileWriter}
import java.nio.charset.StandardCharsets
import java.nio.file.Files

import scala.collection.mutable
/**
 * Command-line tool that collects the distinct words found in one or more
 * UTF-8 text files and writes them, one per line, to `wordlist.txt` in the
 * current working directory.
 */
object WordListGenerator {

  /**
   * Characters that [[sanitizeLine]] replaces with a space: the ASCII code
   * points 33-47, 58-64, 91-96 and 123-126. Computed once rather than on
   * every call.
   */
  private val SpecialChars: Set[Char] =
    (Range.inclusive(33, 47) ++
      Range.inclusive(58, 64) ++
      Range.inclusive(91, 96) ++
      Range.inclusive(123, 126)).map(_.toChar).toSet

  /**
   * Reads every file named in `args`, gathers the distinct words and writes
   * them to `wordlist.txt`.
   *
   * @param args paths of the input files (read as UTF-8); at least one is required
   */
  def main(args: Array[String]): Unit = {
    assert(args.length > 0, "expected at least one input file path")

    val words = new mutable.HashSet[String]

    // Process the files one at a time so that only a single handle is open at
    // any moment, and always release it, even if reading fails part-way.
    args.foreach { path =>
      val source = scala.io.Source.fromFile(path, "UTF-8")
      try {
        source.getLines()
          .map(sanitizeLine)
          .filterNot(_.isEmpty)
          .foreach(line => words ++= getWords(line))
      } finally {
        source.close()
      }
    }

    println("Outputting word list")
    val outputFile = new File("wordlist.txt")
    // Write with the same charset the input was read with; FileWriter would
    // fall back to the platform default and could corrupt non-ASCII words.
    val writer = Files.newBufferedWriter(outputFile.toPath, StandardCharsets.UTF_8)
    try {
      words.foreach { word =>
        writer.write(word)
        writer.newLine()
      }
    } finally {
      writer.close()
    }
    println("Finished writing word list")
  }

  /**
   * Lower-cases and trims `line`, then replaces every character in
   * [[SpecialChars]] with a single space.
   *
   * Trimming happens before the replacement, so the result may still begin
   * or end with spaces that stand in for replaced characters.
   *
   * @param line the raw input line
   * @return the normalised line, the same length as the trimmed input
   */
  def sanitizeLine(line: String): String =
    line.toLowerCase.trim.map(c => if (SpecialChars.contains(c)) ' ' else c)

  /**
   * Splits `line` on spaces and tabs and returns the distinct non-empty tokens.
   *
   * @param line a line of text, typically already passed through [[sanitizeLine]]
   * @return the set of words in the line; empty when the line has no tokens
   */
  def getWords(line: String): Set[String] = {
    val separators = Array(' ', '\t')
    line.split(separators)
      .map(_.trim)
      .filter(_.nonEmpty)
      .toSet
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment