-
-
Save ganeshchand/6ff6e408b5144de44055d5ca17516769 to your computer and use it in GitHub Desktop.
Scala CLI script to find the top X largest files by line count in a given directory (including subdirectories)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//> using scala 3
//> using lib "com.lihaoyi::os-lib::0.9.0"
import scala.util.Try | |
// Optional first argument: the directory to scan. args(0) throws
// java.lang.ArrayIndexOutOfBoundsException when the user supplied no argument,
// so it is wrapped in Try and a missing argument becomes a Failure.
val inputDir: Try[String] = Try(args(0))
// A relative argument (e.g. "src" or ".") is resolved against the current working
// directory; an absolute one is used as-is. The path is built outside of Try so that a
// bad path is reported instead of being silently replaced by the working directory.
// With no argument at all, the directory the script is run from is scanned.
val directory: os.Path = inputDir.toOption.map(os.Path(_, os.pwd)).getOrElse(os.pwd)
// Optional second argument: how many of the largest files to show.
// Falls back to 5 when the argument is missing or is not a valid integer.
val topN: Int = Try(args(1).toInt).getOrElse(5)
// Optional third argument: the file type to look for (compared in lower case).
// Falls back to "scala" when the argument is missing.
val fileType: String = Try(args(2)).getOrElse("scala").toLowerCase()
/**
 * Tells whether `path` is a regular file whose extension belongs to the requested file type.
 *
 * @param path     path to test; anything that is not a regular file yields `false`
 * @param fileType either a well-known type name (e.g. "scala", "python", "yaml") or a raw
 *                 extension written with or without a leading dot (e.g. "kt" or ".kt");
 *                 compared case-insensitively
 * @return `true` when `path` is a file carrying one of the extensions mapped to `fileType`
 */
def isFileOfType(path: os.Path, fileType: String): Boolean = {
  // os.Path.ext never includes the leading dot, so strip it from the request rather than
  // comparing the extension against a dotted string (which could never match).
  val requested: String = fileType.toLowerCase().stripPrefix(".")
  val acceptedExtensions: Set[String] = requested match {
    case "scala"      => Set("scala", "sc")
    case "java"       => Set("java")
    case "python"     => Set("py")
    case "sql"        => Set("sql")
    case "text"       => Set("txt")
    case "json"       => Set("json")
    case "xml"        => Set("xml")
    case "yaml"       => Set("yaml", "yml")
    case "markdown"   => Set("md")
    case "html"       => Set("html")
    case "css"        => Set("css")
    case "javascript" => Set("js")
    case "typescript" => Set("ts")
    case "shell"      => Set("sh")
    // Unknown type names are treated as the literal extension to look for.
    case other        => Set(other)
  }
  os.isFile(path) && acceptedExtensions.contains(path.ext.toLowerCase())
}
// Fail with a readable message instead of a stack trace from os.walk when the
// requested location does not exist or is not a directory.
if (!os.isDir(directory)) {
  System.err.println(s"Not a directory: $directory")
  sys.exit(1)
}
println(s"Finding top $topN largest ${fileType.toUpperCase()} files in $directory")
os
  .walk(directory)
  .filter(path => isFileOfType(path, fileType))
  // Stream the lines so a file is counted without holding all of its lines in memory.
  .map(path => (path, os.read.lines.stream(path).count()))
  // Ascending sort by line count, then reverse to put the largest files first.
  .sortBy(_._2)
  .reverse
  .take(topN)
  .foreach { (path, lineCount) =>
    // Show each file relative to the scanned directory.
    val relativeFilePath: String = path.relativeTo(directory).toString
    println(s"$lineCount $relativeFilePath")
  }
/*
 * How to run this script:
 $ scala-cli https://gist.github.com/ganeshchand/6ff6e408b5144de44055d5ca17516769 -- /Users/Shared/repos/opensource/delta 5 scala
 Finding top 5 largest SCALA files in /Users/Shared/repos/opensource/delta
 38153 benchmarks/src/main/scala/benchmark/TPCDSBenchmarkQueries.scala
 5305 core/src/test/scala/org/apache/spark/sql/delta/MergeIntoSuiteBase.scala
 3060 core/src/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala
 2996 core/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala
 2785 core/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala
 */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment