timmc/java-count-type-chars.clj

## java-count-type-chars.clj
(ns adhoc.java-types
  "Count the amount of space taken up by types in Java code. Problems:

- Not actually a parser! Counts contents of comments!
- Will overcount in code that is heavy in STATIC_FIELDS
- Doesn't capture generics, arrays, and other non-alphabetic type
  characters"
  (:require [clojure.java.io :as io]
            [clojure.string :as str])
  (:import java.io.File
           java.util.regex.Pattern))

(defn find-files
  "Walk the File `d` depth-first and return every non-directory entry
  beneath it. Directory children are visited in sorted order. When `d`
  itself is not a directory the result is the one-element vector [d]."
  [^File d]
  (if-not (.isDirectory d)
    [d]
    (->> (.listFiles d)
         sort
         (mapcat find-files))))

(defn- read-code-lines
  "Read one file and return a vector of its code lines. The file is read
  eagerly so that the reader can be closed before this function returns."
  [f]
  (with-open [rdr (io/reader f :encoding "UTF-8")]
    ;; `vec` forces the whole lazy `for` while the reader is still open.
    (vec (for [line (line-seq rdr)
               ;; Ignore lines that look like part of a multiline comment,
               ;; i.e. whose first non-blank text is `*` or `/*`.
               :when (not (re-matches #"^\s*/?\*.*" line))]
           ;; Strip end of line comments: keep only the text before the
           ;; first `//`.
           (first (.split ^String line "//" 2))))))

(defn stream-code-lines
  "Stream non-comment code lines from every file under `dir-path`.

  The sequence is lazy across files, but the lines of each individual
  file are read eagerly so its reader is closed as soon as the file has
  been consumed (the previous implementation never closed its readers).

  Comment removal is heuristic: lines starting with `*` or `/*` are
  dropped and everything from the first `//` on a line is cut off."
  [dir-path]
  (mapcat read-code-lines (find-files (io/file dir-path))))

;;;; State machine

;; A bit slower, and doesn't catch primitives, but I like state machines!

(defn step
  "Advance the scanner by one character and return the next state.

  States:
  - :punct      outside any word
  - :other-word inside a word that began with a non-uppercase letter
  - :type       inside a word that began with an uppercase letter

  Any other `state` makes `case` throw, since there is no default clause."
  [state ^Character next-char]
  (case state
    :punct
    (cond
      (Character/isUpperCase next-char)        :type
      (Character/isAlphabetic (int next-char)) :other-word
      :else                                    :punct)

    :other-word
    (if-not (Character/isLetterOrDigit (int next-char))
      :punct
      :other-word)

    :type
    ;; Only the :type state treats an underscore as part of the word.
    (if (and (not (Character/isLetterOrDigit (int next-char)))
             (not= next-char \_))
      :punct
      :type)))

(defn count-object-types-streaming
  "Measure the share of characters taken up by *Object* types using the
  `step` state machine. Does not pick up primitive types or classes with
  lowercase first letters.

  Returns the ratio type-chars / total-chars as a float, where
  total-chars counts every character of every streamed code line (line
  terminators are not included). Returns 0.0 when no characters were
  streamed instead of dividing by zero."
  [dir-path]
  (let [scan-line
        (fn [[total-chars type-chars] line]
          ;; Every line starts in :punct. The line terminator that
          ;; `line-seq` removed separates words, so state must not leak
          ;; from the end of one line into the start of the next.
          (let [[_ total types]
                (reduce (fn [[state total types] ch]
                          (let [next-state (step state ch)]
                            [next-state
                             (inc total)
                             (cond-> types (= next-state :type) inc)]))
                        [:punct total-chars type-chars]
                        line)]
            [total types]))
        [total-chars type-chars]
        (reduce scan-line [0 0] (stream-code-lines dir-path))]
    (if (zero? total-chars)
      (float 0)
      (float (/ type-chars total-chars)))))

;;;; Regex

;; Faster, better results. Lines are matched one at a time with their
;; newline already removed, so the boundaries are negative lookarounds:
;; a type that is the first or last token on a line still matches.

(def type-regex
  "Matches one Java type token: either an identifier starting with an
  uppercase ASCII letter or one of the primitive type keywords (plus
  void). The token must not be preceded or followed by an identifier
  character, so `int` inside `my_int` or `interface` is not a match.
  Group 1 is the token itself."
  #"(?<![A-Za-z0-9_])([A-Z][A-Za-z0-9_]*|byte|short|int|long|float|double|char|boolean|void)(?![A-Za-z0-9_])")

(defn count-types-line-regex
  "Measure a single line of text. Returns the pair
  [type-chars line-length], where type-chars is the summed length of
  every `type-regex` match in `strng`."
  [strng]
  (let [matches    (re-seq type-regex strng)
        type-chars (reduce + 0 (map #(count (first %)) matches))]
    [type-chars (count strng)]))

(defn count-types-regex
  "Measure the share of characters taken up by types using `type-regex`.

  Sums the per-line [type-chars line-length] pairs over every code line
  under `dir-path` and returns type-chars / total-chars as a float.
  Returns 0.0 when no characters were streamed instead of dividing by
  zero."
  [dir-path]
  (let [[types total]
        ;; Element-wise sum of the per-line pairs.
        (reduce (fn [totals line-counts] (mapv + totals line-counts))
                [0 0]
                (map count-types-line-regex (stream-code-lines dir-path)))]
    (if (zero? total)
      (float 0)
      (float (/ types total)))))

;; REPL scratchpad; nothing inside `comment` is evaluated on load.
(comment
  ;; Load this file and alias its namespace.
  (load-file "java-count-type-chars.clj")
  (require '[adhoc.java-types :as jt])
  ;; Run both strategies on one directory; juxt returns
  ;; [state-machine-ratio regex-ratio].
  ((juxt jt/count-object-types-streaming jt/count-types-regex) "source/dir/")
  ;; Presumably a result the author recorded from the call above:
  [0.2193496 0.21625592] ;; about 20%
  ;; Quick debugging call to list type tokens in source:
  (take 300 (for [line (jt/stream-code-lines "source/dir")
                  token (map first (re-seq jt/type-regex line))]
              token))
  )
	(ns adhoc.java-types
	"Count the amount of space taken up by types in Java code. Problems:

	- Not actually a parser! Counts contents of comments!
	- Will overcount in code that is heavy in STATIC_FIELDS
	- Doesn't capture generics, arrays, and other non-alphabetic type
	characters"
	(:require [clojure.java.io :as io]
	[clojure.string :as str])
	(:import java.io.File
	java.util.regex.Pattern))

	(defn find-files
	  "Walk the File `d` depth-first and return every non-directory entry
	  beneath it. Directory children are visited in sorted order. When `d`
	  itself is not a directory the result is the one-element vector [d]."
	  [^File d]
	  (if-not (.isDirectory d)
	    [d]
	    (->> (.listFiles d)
	         sort
	         (mapcat find-files))))

	(defn- read-code-lines
	  "Read one file and return a vector of its code lines. The file is read
	  eagerly so that the reader can be closed before this function returns."
	  [f]
	  (with-open [rdr (io/reader f :encoding "UTF-8")]
	    ;; `vec` forces the whole lazy `for` while the reader is still open.
	    (vec (for [line (line-seq rdr)
	               ;; Ignore lines that look like part of a multiline comment,
	               ;; i.e. whose first non-blank text is `*` or `/*`. (The
	               ;; pattern had lost its `*` characters and filtered
	               ;; almost nothing.)
	               :when (not (re-matches #"^\s*/?\*.*" line))]
	           ;; Strip end of line comments: keep only the text before the
	           ;; first `//`.
	           (first (.split ^String line "//" 2))))))

	(defn stream-code-lines
	  "Stream non-comment code lines from every file under `dir-path`.

	  The sequence is lazy across files, but the lines of each individual
	  file are read eagerly so its reader is closed as soon as the file has
	  been consumed (the previous implementation never closed its readers).

	  Comment removal is heuristic: lines starting with `*` or `/*` are
	  dropped and everything from the first `//` on a line is cut off."
	  [dir-path]
	  (mapcat read-code-lines (find-files (io/file dir-path))))

	;;;; State machine

	;; A bit slower, and doesn't catch primitives, but I like state machines!

	(defn step
	  "Advance the scanner by one character and return the next state.

	  States:
	  - :punct      outside any word
	  - :other-word inside a word that began with a non-uppercase letter
	  - :type       inside a word that began with an uppercase letter

	  Any other `state` makes `case` throw, since there is no default clause."
	  [state ^Character next-char]
	  (case state
	    :punct
	    (cond
	      (Character/isUpperCase next-char)        :type
	      (Character/isAlphabetic (int next-char)) :other-word
	      :else                                    :punct)

	    :other-word
	    (if-not (Character/isLetterOrDigit (int next-char))
	      :punct
	      :other-word)

	    :type
	    ;; Only the :type state treats an underscore as part of the word.
	    (if (and (not (Character/isLetterOrDigit (int next-char)))
	             (not= next-char \_))
	      :punct
	      :type)))

	(defn count-object-types-streaming
	  "Measure the share of characters taken up by *Object* types using the
	  `step` state machine. Does not pick up primitive types or classes with
	  lowercase first letters.

	  Returns the ratio type-chars / total-chars as a float, where
	  total-chars counts every character of every streamed code line (line
	  terminators are not included). Returns 0.0 when no characters were
	  streamed instead of dividing by zero."
	  [dir-path]
	  (let [scan-line
	        (fn [[total-chars type-chars] line]
	          ;; Every line starts in :punct. The line terminator that
	          ;; `line-seq` removed separates words, so state must not leak
	          ;; from the end of one line into the start of the next.
	          (let [[_ total types]
	                (reduce (fn [[state total types] ch]
	                          (let [next-state (step state ch)]
	                            [next-state
	                             (inc total)
	                             (cond-> types (= next-state :type) inc)]))
	                        [:punct total-chars type-chars]
	                        line)]
	            [total types]))
	        [total-chars type-chars]
	        (reduce scan-line [0 0] (stream-code-lines dir-path))]
	    (if (zero? total-chars)
	      (float 0)
	      (float (/ type-chars total-chars)))))

	;;;; Regex

	;; Faster, better results. Lines are matched one at a time with their
	;; newline already removed, so the boundaries are negative lookarounds:
	;; a type that is the first or last token on a line still matches.

	(def type-regex
	  "Matches one Java type token: either an identifier starting with an
	  uppercase ASCII letter or one of the primitive type keywords (plus
	  void). The token must not be preceded or followed by an identifier
	  character, so `int` inside `my_int` or `interface` is not a match.
	  Group 1 is the token itself."
	  ;; The alternation uses plain `|`; escaped `\|` would match a literal
	  ;; pipe character and never a type.
	  #"(?<![A-Za-z0-9_])([A-Z][A-Za-z0-9_]*|byte|short|int|long|float|double|char|boolean|void)(?![A-Za-z0-9_])")

	(defn count-types-line-regex
	  "Measure a single line of text. Returns the pair
	  [type-chars line-length], where type-chars is the summed length of
	  every `type-regex` match in `strng`."
	  [strng]
	  (let [matches    (re-seq type-regex strng)
	        type-chars (reduce + 0 (map #(count (first %)) matches))]
	    [type-chars (count strng)]))

	(defn count-types-regex
	  "Measure the share of characters taken up by types using `type-regex`.

	  Sums the per-line [type-chars line-length] pairs over every code line
	  under `dir-path` and returns type-chars / total-chars as a float.
	  Returns 0.0 when no characters were streamed instead of dividing by
	  zero."
	  [dir-path]
	  (let [[types total]
	        ;; Element-wise sum of the per-line pairs.
	        (reduce (fn [totals line-counts] (mapv + totals line-counts))
	                [0 0]
	                (map count-types-line-regex (stream-code-lines dir-path)))]
	    (if (zero? total)
	      (float 0)
	      (float (/ types total)))))

	;; REPL scratchpad; nothing inside `comment` is evaluated on load.
	(comment
	;; Load this file and alias its namespace.
	(load-file "java-count-type-chars.clj")
	(require '[adhoc.java-types :as jt])
	;; Run both strategies on one directory; juxt returns
	;; [state-machine-ratio regex-ratio].
	((juxt jt/count-object-types-streaming jt/count-types-regex) "source/dir/")
	;; Presumably a result the author recorded from the call above:
	[0.2193496 0.21625592] ;; about 20%
	;; Quick debugging call to list type tokens in source:
	(take 300 (for [line (jt/stream-code-lines "source/dir")
	token (map first (re-seq jt/type-regex line))]
	token))
	)