Skip to content

Instantly share code, notes, and snippets.

@timmc
Last active August 9, 2016 13:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timmc/dae6c0b775bca8ac81427e87562646ec to your computer and use it in GitHub Desktop.
Save timmc/dae6c0b775bca8ac81427e87562646ec to your computer and use it in GitHub Desktop.
(ns adhoc.java-types
"Count the amount of space taken up by types in Java code. Problems:
- Not actually a parser! Counts contents of comments!
- Will overcount in code that is heavy in STATIC_FIELDS
- Doesn't capture generics, arrays, and other non-alphabetic type
characters"
(:require [clojure.java.io :as io]
[clojure.string :as str])
(:import java.io.File
java.util.regex.Pattern))
(defn find-files
  "Walk the File `d` and return the non-directory Files at or beneath it.
  Directory entries are visited in sorted order. When `d` itself is not
  a directory, the result is a one-element vector holding `d`."
  [^File d]
  (if-not (.isDirectory d)
    ;; Leaf: anything that is not a directory is reported as-is.
    [d]
    ;; Directory: recurse into each child, in sorted order.
    (->> (.listFiles d)
         sort
         (mapcat find-files))))
(defn stream-code-lines
  "Stream approximate non-comment code lines from every file under
  `dir-path` (anything accepted by `io/file`). Files are read as UTF-8.

  The result is lazy across files, but each file is read in full while
  its reader is open so that the reader is always closed; previously
  every reader was left open, leaking one file handle per file.

  Comment handling is heuristic, not a parse:
  - lines whose first non-blank characters are `*` or `/*` are dropped
    (the usual shape of block-comment lines);
  - everything from the first `//` onward is cut off, including a `//`
    that happens to sit inside a string literal."
  [dir-path]
  (for [f (find-files (io/file dir-path))
        ;; Realize the lines before with-open closes the reader.
        line (with-open [rdr (io/reader f :encoding "UTF-8")]
               (doall (line-seq rdr)))
        ;; Ignore multiline comments
        :when (not (re-matches #"^\s*/?\*.*" line))]
    ;; Strip end of line comments
    (first (.split ^String line "//" 2))))
;;;; State machine
;; A bit slower, and doesn't catch primitives, but I like state machines!
(defn step
  "Advance the tokenizer state machine by one character.

  `state` is one of:
    :punct      - outside any identifier
    :other-word - inside a word that does not start with an uppercase letter
    :type       - inside a word that starts with an uppercase letter

  Returns the state after consuming `next-char`. An unknown `state`
  throws IllegalArgumentException (no default clause in `case`).

  Letters, digits and `_` are treated as identifier characters in every
  state. This keeps the tail of `foo_Bar` and the suffix of numeric
  literals such as `10L` from being mistaken for the start of a type."
  [state ^Character next-char]
  (let [ident-char? (or (Character/isLetterOrDigit (int next-char))
                        (= next-char \_))]
    (case state
      :punct
      (cond
        ;; Only a word that *begins* with an uppercase letter is a type.
        (Character/isUpperCase next-char) :type
        (or ident-char?
            (Character/isAlphabetic (int next-char))) :other-word
        :else :punct)

      :other-word
      (if ident-char? :other-word :punct)

      :type
      (if ident-char? :type :punct))))
(defn count-object-types-streaming
  "Measure the space taken up by *Object* types using the `step` state
  machine. Does not pick up primitive types or classes with lowercase
  first letters.

  Reads the code lines under `dir-path` via `stream-code-lines` and
  returns, as a float, the fraction type-chars / total-chars. Line
  terminators are not part of the stream and are not counted. Returns
  0.0 when no characters were read, instead of dividing by zero."
  [dir-path]
  (loop [state :punct
         total-chars 0
         type-chars 0
         remaining (apply concat (stream-code-lines dir-path))]
    (if-some [next-char (first remaining)]
      (let [next-state (step state next-char)]
        (recur next-state
               (inc total-chars)
               ;; A character counts as a type character when consuming
               ;; it leaves the machine in the :type state.
               (cond-> type-chars (= next-state :type) inc)
               (next remaining)))
      ;; Input exhausted.
      (if (zero? total-chars)
        (float 0)
        (float (/ type-chars total-chars))))))
;;;; Regex
;; Faster, better results.
;;
;; Matches a capitalized identifier or a primitive type name (plus void)
;; that stands alone as a whole word. Both boundaries are negative
;; lookarounds over identifier characters, so:
;; - a type that ends its line still matches (lines are matched one at a
;;   time with the terminator stripped, so a lookahead that demands a
;;   following character would miss e.g. a bare `@Override` line);
;; - a candidate glued to a preceding digit or underscore (`x_int`,
;;   the `L` in `10L`) does not match.
;; The capture group is kept so that `re-seq` yields [match group]
;; vectors, which callers take `first` of.
(def type-regex
  #"(?<![A-Za-z0-9_])([A-Z][A-Za-z0-9_]*?|byte|short|int|long|float|double|char|boolean|void)(?![A-Za-z0-9_])")
(defn count-types-line-regex
  "Return [type-chars total-chars] for the string `strng`: the summed
  length of every `type-regex` match in it, and its overall length."
  [strng]
  (let [matched-chars (transduce (map #(count (first %)))
                                 +
                                 0
                                 (re-seq type-regex strng))
        line-length   (count strng)]
    [matched-chars line-length]))
(defn count-types-regex
  "Measure the space taken up by types using a regex.

  Sums the per-line [type-chars total-chars] pairs produced by
  `count-types-line-regex` over the lines from `stream-code-lines`, and
  returns type-chars / total-chars as a float. Returns 0.0 when no
  characters were read, instead of dividing by zero."
  [dir-path]
  (let [[types total]
        (reduce (fn [totals line-counts]
                  ;; Element-wise sum of the two [types total] pairs.
                  (mapv + totals line-counts))
                [0 0]
                (map count-types-line-regex (stream-code-lines dir-path)))]
    (if (zero? total)
      (float 0)
      (float (/ types total)))))
;; Scratch forms for use at a REPL. Nothing inside `comment` is evaluated
;; when this file is loaded.
(comment
;; Load this file and alias its namespace.
(load-file "java-count-type-chars.clj")
(require '[adhoc.java-types :as jt])
;; Run both counters against the same directory and compare the ratios.
((juxt jt/count-object-types-streaming jt/count-types-regex) "source/dir/")
;; Presumably the result recorded from one such run; the ratios depend on
;; the code base that is measured.
[0.2193496 0.21625592] ;; about 20%
;; Quick debugging call to list type tokens in source:
(take 300 (for [line (jt/stream-code-lines "source/dir")
token (map first (re-seq jt/type-regex line))]
token))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment