Skip to content

Instantly share code, notes, and snippets.

@abdolence
Created August 8, 2020 11:43
Show Gist options
  • Save abdolence/a9808fbb4dc439a758c06639d6a3e743 to your computer and use it in GitHub Desktop.
Save abdolence/a9808fbb4dc439a758c06639d6a3e743 to your computer and use it in GitHub Desktop.
import $ivy.`com.github.pemistahl:lingua:1.0.1`, com.github.pemistahl.lingua.api._, com.github.pemistahl.lingua.api.Language._
// Default regular expression that a file extension must fully match to be scanned.
private val DefaultPattern: String = "txt"

// Matches a double-quoted ("...") or single-quoted ('...') literal on a line; the text between
// the quotes is captured in the named group `dqt` or `sqt` respectively.
private val QuotePattern: scala.util.matching.Regex =
  """("(?<dqt>[^"]*)")|('(?<sqt>[^']*)')""".r

// Separator characters (dash, comma, slash) on which a quoted literal is split first.
private val SplitMatched: scala.util.matching.Regex =
  """([-,/])""".r

// Zero-width camel-case boundaries: lower-to-upper ("fooBar") and the end of an
// upper-case run that is followed by a capitalised word ("HTTPServer").
private val SplitCameCase: scala.util.matching.Regex =
  """(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])""".r

// Keywords shorter than this many characters are never reported.
private val MinKeywordLen: Int = 3
// Directory in which the downloaded English word list is cached between runs.
private val CacheDir: os.Path = os.Path("/tmp/.detect-non-english-cache")

// Remote source of the English word list and the file name it is stored under in CacheDir.
private val EnglishDictWordsUrl: String =
  "https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"
private val EnglishDictCacheFileName: String = "english-words.txt"

// Lower-cased technical terms that must never be reported, whatever the detector says.
private val CustomStopWordsSet : Set[String] =
  Seq(
    "lodash", "http", "https", "css", "csv",
    "url", "svg", "html", "ssl", "tls"
  ).map(_.toLowerCase).toSet

// Language detector built for English and Swedish only.
private val detector: LanguageDetector =
  LanguageDetectorBuilder
    .fromLanguages(ENGLISH, SWEDISH)
    .build()
/**
 * Script entry point: walks `path` recursively and prints quoted string fragments that do not
 * look English.
 *
 * For every regular file whose extension fully matches `extensionPatternFilter`, each line is
 * scanned for single- or double-quoted literals. Every literal is split into keywords; a keyword
 * is reported when it has at least `MinKeywordLen` characters, starts with a letter, consists
 * only of letters and digits, is not a custom stop word and is classified as non-English.
 *
 * @param path                   root of the tree to scan; defaults to the current working directory
 * @param extensionPatternFilter regular expression that the whole file extension must match
 */
@main
def main(path: os.Path = os.pwd, extensionPatternFilter : String = DefaultPattern) = {
  println(s"Detecting non english string in starting from '${path}'. Pattern: ${extensionPatternFilter}")

  // Idempotent: creates the directory (and missing parents) and is a no-op when it already
  // exists, so no separate existence check is needed.
  os.makeDir.all(CacheDir)

  os.walk(path)
    .filter(os.isFile)
    .filter(_.ext.matches(extensionPatternFilter))
    .foreach { filePath =>
      // `linesIterator` rather than `lines`: on JDK 11+ `String.lines` resolves to the Java
      // method returning a java.util.stream.Stream, which has no `zipWithIndex`.
      os.read(filePath).linesIterator.zipWithIndex.foreach { case (fileLine, index) =>
        QuotePattern.findAllMatchIn(fileLine).foreach { matched =>
          // Exactly one of the two named groups is set; the other one is null.
          val quoted = Option(matched.group("dqt")).getOrElse(matched.group("sqt"))
          splitToKeywords(quoted)
            .filter { keyword =>
              keyword.length >= MinKeywordLen &&
              keyword.headOption.exists(_.isLetter) &&
              keyword.forall(_.isLetterOrDigit)
            }
            // Cheap set lookup first, so the language detector only sees remaining candidates.
            .filter(isNotStopWord)
            .filter(isNonEnglish)
            .foreach { keyword =>
              // zipWithIndex is zero-based, while line numbers are conventionally one-based.
              println(s"""Found: "${keyword}" in ${filePath}:${index + 1}""")
            }
        }
      }
    }
}
/**
 * Breaks the content of a quoted literal into candidate keywords.
 *
 * The text is first cut on the separator characters matched by `SplitMatched`; every resulting
 * fragment is then cut again on the camel-case boundaries matched by `SplitCameCase`.
 *
 * @param matchedStr text found between a pair of quotes
 * @return all fragments, in their original left-to-right order
 */
private def splitToKeywords(matchedStr : String) : List[String] =
  SplitMatched
    .split(matchedStr)
    .toList
    .flatMap(fragment => SplitCameCase.split(fragment).toList)
/**
 * Reads a word list (one entry per line) from `filePath` into a set of lower-cased words.
 *
 * @param filePath location of the dictionary file
 * @return the lower-cased lines of the file, or an empty set when the file does not exist
 */
private def loadEnglishWordSetFromFile(filePath: os.Path) : Set[String] = {
  println(s"Loading english dictionary from '${filePath}'...")
  if (os.exists(filePath)) {
    // `linesIterator` rather than `lines`: on JDK 11+ `String.lines` resolves to the Java
    // method returning a java.util.stream.Stream, which offers no `toSet`.
    os.read(filePath).linesIterator.map(_.toLowerCase).toSet
  }
  else {
    println(s"Absent english dictionary file at: ${filePath}. Only language detector will be used")
    Set.empty
  }
}
/**
 * Returns the English word set, downloading the word list into the cache directory first when
 * no cached copy exists.
 *
 * A failed download (network error, non-success HTTP status, unwritable cache) is reported and
 * otherwise ignored: `loadEnglishWordSetFromFile` then finds no file, prints its own notice and
 * yields an empty set, so only the language detector is used.
 *
 * @return the lower-cased dictionary words, or an empty set when no dictionary is available
 */
private def loadEnglishWordSet() : Set[String] = {
  import scala.util.control.NonFatal

  val englishWordsFilePath = CacheDir / EnglishDictCacheFileName
  if (!os.exists(englishWordsFilePath)) {
    println(s"Downloading english dictionary from ${EnglishDictWordsUrl}")
    try {
      val resp = requests.get(EnglishDictWordsUrl)
      os.write(englishWordsFilePath, resp.bytes)
    } catch {
      // Degrade gracefully instead of aborting the whole scan from inside the lazy initialiser.
      case NonFatal(e) =>
        println(s"Unable to download english dictionary from ${EnglishDictWordsUrl}: ${e}")
    }
  }
  loadEnglishWordSetFromFile(englishWordsFilePath)
}
// Dictionary of lower-cased English words; loaded on first access only.
private lazy val englishDictSet : Set[String] = loadEnglishWordSet()

/**
 * Tells whether `keyword` should be treated as non-English.
 *
 * A keyword counts as English when the language detector classifies it as ENGLISH or, failing
 * that, when its lower-cased form is present in the dictionary. The dictionary is consulted only
 * if the detector did not already answer ENGLISH.
 *
 * @param keyword candidate word
 * @return true when neither the detector nor the dictionary recognises the word as English
 */
private def isNonEnglish(keyword : String): Boolean = {
  val detectedAsEnglish = detector.detectLanguageOf(keyword) == ENGLISH
  !(detectedAsEnglish || englishDictSet.contains(keyword.toLowerCase))
}
/**
 * Tells whether `keyword` may be reported, i.e. its lower-cased form is not one of the
 * custom stop words.
 *
 * @param keyword candidate word
 * @return false when the lower-cased keyword is in `CustomStopWordsSet`, true otherwise
 */
private def isNotStopWord(keyword : String): Boolean = {
  val normalized = keyword.toLowerCase
  !CustomStopWordsSet(normalized)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment