Created
August 8, 2020 11:43
-
-
Save abdolence/a9808fbb4dc439a758c06639d6a3e743 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import $ivy.`com.github.pemistahl:lingua:1.0.1`, com.github.pemistahl.lingua.api._, com.github.pemistahl.lingua.api.Language._ | |
// Default regex applied to a file's extension when no filter is given on the command line.
private val DefaultPattern = "txt"

// Matches a double-quoted or a single-quoted literal; the text between the quotes is
// captured in the named group `dqt` (double quotes) or `sqt` (single quotes).
private val QuotePattern = """("(?<dqt>[^"]*)")|('(?<sqt>[^']*)')""".r

// Separator characters on which a quoted literal is split into candidate keywords.
private val SplitMatched = """([-,/])""".r

// Zero-width boundaries used to break camelCase / PascalCase identifiers into words.
private val SplitCameCase = """(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])""".r

// Keywords shorter than this are not checked.
private val MinKeywordLen = 3

// Directory holding the cached dictionary file.
private val CacheDir = os.Path("/tmp/.detect-non-english-cache")

// Remote word list that is downloaded when no cached copy exists.
private val EnglishDictWordsUrl =
  "https://raw.githubusercontent.com/dwyl/english-words/master/words.txt"

// File name of the cached word list inside CacheDir.
private val EnglishDictCacheFileName = "english-words.txt"

// Lower-cased technical terms that are never reported.
private val CustomStopWordsSet: Set[String] =
  Set(
    "lodash",
    "http",
    "https",
    "css",
    "csv",
    "url",
    "svg",
    "html",
    "ssl",
    "tls"
  ).map(_.toLowerCase)

// Language detector restricted to English and Swedish.
private val detector: LanguageDetector =
  LanguageDetectorBuilder.fromLanguages(ENGLISH, SWEDISH).build()
/**
 * Script entry point.
 *
 * Walks `path` recursively, and for every regular file whose extension matches
 * `extensionPatternFilter` scans each line for single- or double-quoted literals.
 * Each literal is split into keywords; keywords that are long enough, start with a
 * letter, consist only of letters/digits, are not stop words and are not recognised
 * as English are printed together with the file path and the 1-based line number.
 *
 * @param path                   directory to start the walk from (defaults to the working directory)
 * @param extensionPatternFilter regex the file extension must fully match
 */
@main
def main(path: os.Path = os.pwd, extensionPatternFilter: String = DefaultPattern) = {
  println(s"Detecting non english string in starting from '${path}'. Pattern: ${extensionPatternFilter}")

  // Idempotent: creates the cache directory (and missing parents) only when needed.
  os.makeDir.all(CacheDir)

  os.walk(path)
    .filter(os.isFile)
    .filter(_.ext.matches(extensionPatternFilter))
    .foreach { filePath =>
      // os.read.lines returns a Scala collection. Calling `.lines` on the String from
      // os.read resolves to java.lang.String#lines on JDK 11+, which yields a
      // java.util.stream.Stream that has no zipWithIndex.
      os.read.lines(filePath).zipWithIndex.foreach { case (fileLine, index) =>
        // Report line numbers 1-based, as editors and `path:line` tooling expect.
        val lineNumber = index + 1
        QuotePattern.findAllMatchIn(fileLine).foreach { matched =>
          // Exactly one of the two named groups participates in a match; the other is null.
          val quoted = Option(matched.group("dqt")).getOrElse(matched.group("sqt"))
          splitToKeywords(quoted)
            .filter { keyword =>
              keyword.length >= MinKeywordLen &&
              keyword.headOption.exists(_.isLetter) &&
              keyword.forall(_.isLetterOrDigit)
            }
            // Cheap set lookup first, so stop words never reach the language detector.
            .filter(isNotStopWord)
            .filter(isNonEnglish)
            .foreach { keyword =>
              println(s"""Found: "${keyword}" in ${filePath}:${lineNumber}""")
            }
        }
      }
    }
}
/**
 * Breaks a quoted literal into candidate keywords.
 *
 * The text is first cut on the separator characters matched by `SplitMatched`, then
 * every resulting fragment is cut again on the camel-case boundaries matched by
 * `SplitCameCase`. Fragments are returned in their original order.
 *
 * @param matchedStr text found between a pair of quotes
 * @return the keywords in order of appearance
 */
private def splitToKeywords(matchedStr: String): List[String] =
  SplitMatched
    .split(matchedStr)
    .toList
    .flatMap(fragment => SplitCameCase.split(fragment).toList)
/**
 * Reads a word list (one word per line) from `filePath` into a lower-cased set.
 *
 * When the file does not exist a notice is printed and an empty set is returned, so
 * callers fall back to the language detector alone.
 *
 * @param filePath location of the word-list file
 * @return the lower-cased words, or an empty set when the file is absent
 */
private def loadEnglishWordSetFromFile(filePath: os.Path): Set[String] = {
  println(s"Loading english dictionary from '${filePath}'...")
  if (os.exists(filePath)) {
    // os.read.lines returns a Scala collection. Calling `.lines` on the String from
    // os.read resolves to java.lang.String#lines on JDK 11+, whose Stream has no toSet.
    os.read.lines(filePath).iterator.map(_.toLowerCase).toSet
  } else {
    println(s"Absent english dictionary file at: ${filePath}. Only language detector will be used")
    Set.empty
  }
}
/**
 * Returns the English word set, downloading the word list into the cache directory
 * on first use.
 *
 * A failed download (network error, non-success HTTP status, write failure) is
 * reported and otherwise ignored: the subsequent load then finds no file and returns
 * an empty set, so the scan continues with the language detector alone instead of
 * aborting.
 *
 * @return the lower-cased English words, or an empty set when no dictionary is available
 */
private def loadEnglishWordSet(): Set[String] = {
  import scala.util.control.NonFatal

  val englishWordsFilePath = CacheDir / EnglishDictCacheFileName

  if (!os.exists(englishWordsFilePath)) {
    println(s"Downloading english dictionary from ${EnglishDictWordsUrl}")
    try {
      val resp = requests.get(EnglishDictWordsUrl)
      // Make sure the target directory exists even when this runs before main created it.
      os.makeDir.all(CacheDir)
      os.write(englishWordsFilePath, resp.bytes)
    } catch {
      case NonFatal(e) =>
        println(s"Unable to download english dictionary: ${e.getMessage}")
    }
  }

  loadEnglishWordSetFromFile(englishWordsFilePath)
}
// English dictionary, initialised on first access via loadEnglishWordSet().
private lazy val englishDictSet: Set[String] =
  loadEnglishWordSet()
/**
 * Tells whether a keyword should be treated as non-English.
 *
 * A keyword counts as English when the language detector classifies it as ENGLISH or,
 * failing that, when its lower-cased form is present in the English dictionary. The
 * dictionary is consulted only if the detector did not answer ENGLISH.
 *
 * @param keyword candidate word
 * @return true when neither the detector nor the dictionary recognises the word as English
 */
private def isNonEnglish(keyword: String): Boolean = {
  val detectedAsEnglish = detector.detectLanguageOf(keyword) == ENGLISH
  !(detectedAsEnglish || englishDictSet.contains(keyword.toLowerCase))
}
/**
 * Tells whether a keyword is absent from the custom stop-word list.
 *
 * @param keyword candidate word; compared in lower case
 * @return true when the lower-cased keyword is not in `CustomStopWordsSet`
 */
private def isNotStopWord(keyword: String): Boolean = {
  val normalized = keyword.toLowerCase
  !CustomStopWordsSet(normalized)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment