Last active
August 9, 2016 13:58
-
-
Save timmc/dae6c0b775bca8ac81427e87562646ec to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns adhoc.java-types | |
"Count the amount of space taken up by types in Java code. Problems: | |
- Not actually a parser! Counts contents of comments! | |
- Will overcount in code that is heavy in STATIC_FIELDS | |
- Doesn't capture generics, arrays, and other non-alphabetic type | |
characters" | |
(:require [clojure.java.io :as io] | |
[clojure.string :as str]) | |
(:import java.io.File | |
java.util.regex.Pattern)) | |
(defn find-files
  "Walk `d` depth-first and return every non-directory File beneath it.

  When `d` is not a directory it is returned as a one-element vector.
  Otherwise its children are visited in sorted order and their results
  are concatenated lazily."
  [^File d]
  (if-not (.isDirectory d)
    [d]
    (->> (.listFiles d)
         sort
         (mapcat find-files))))
(defn stream-code-lines
  "Stream approximate non-comment code lines from every file under
  `dir-path`, decoding each file as UTF-8.

  This is a line-based heuristic, not a parser:
  - a line whose first non-whitespace text is `*` or `/*` is dropped
    (treated as part of a block comment);
  - every remaining line is cut at its first `//`, even if that `//`
    sits inside a string literal.

  Files are visited lazily, but each file is read in full inside
  `with-open` so its reader is closed before the next file is opened."
  [dir-path]
  (letfn [(read-lines [f]
            ;; `line-seq` is lazy; realize it with `vec` while the reader
            ;; is still open, otherwise the handle would never be closed.
            (with-open [rdr (io/reader f :encoding "UTF-8")]
              (vec (line-seq rdr))))]
    (for [f (find-files (io/file dir-path))
          line (read-lines f)
          ;; Ignore multiline comments
          :when (not (re-matches #"^\s*/?\*.*" line))]
      ;; Strip end of line comments
      (first (.split ^String line "//" 2)))))
;;;; State machine | |
;; A bit slower, and doesn't catch primitives, but I like state machines! | |
(defn step
  "Advance the character-level state machine by one character and
  return the next state.

  States:
    :punct       outside any word (the start state)
    :type        inside a word that began with an uppercase letter
    :other-word  inside any other word or numeric literal

  Letters, digits and `_` are all word characters, so `my_Var` and
  `10L` stay in :other-word instead of having their uppercase tail
  counted as a type. Throws IllegalArgumentException for an unknown
  state (no default `case` branch)."
  [state ^Character next-char]
  (let [cp (int next-char)
        word-char? (or (Character/isLetterOrDigit cp)
                       (= next-char \_))]
    (case state
      :punct
      (cond
        (Character/isUpperCase cp) :type
        ;; isAlphabetic is kept so every character that used to start a
        ;; word still does.
        (or word-char? (Character/isAlphabetic cp)) :other-word
        :else :punct)

      ;; Once inside a word, stay in the same kind of word until a
      ;; non-word character ends it.
      (:other-word :type)
      (if word-char? state :punct))))
(defn count-object-types-streaming
  "Measure the fraction of code characters that belong to *Object* type
  names, using the `step` state machine. Does not pick up primitive
  types or classes with lowercase first letters.

  Returns a float: type-chars / total-chars over all lines produced by
  `stream-code-lines` for `dir-path`. Returns 0.0 when there are no
  characters at all, rather than dividing by zero."
  [dir-path]
  (let [scan-line
        (fn [totals line]
          ;; Restart in :punct on every line: the stream carries no
          ;; newline characters, so without a reset a word ending one
          ;; line would fuse with a word starting the next.
          (first
           (reduce (fn [[[total-chars type-chars] state] next-char]
                     (let [next-state (step state next-char)]
                       [[(inc total-chars)
                         (cond-> type-chars (= next-state :type) inc)]
                        next-state]))
                   [totals :punct]
                   line)))
        [total-chars type-chars]
        (reduce scan-line [0 0] (stream-code-lines dir-path))]
    (if (zero? total-chars)
      (float 0)
      (float (/ type-chars total-chars)))))
;;;; Regex | |
;; Faster, better results. A type name must stand alone as a whole
;; identifier: it may not be preceded or followed by a letter, digit or
;; underscore. Both boundaries are negative lookarounds, so a type at
;; the very start or very end of a line still matches.
;; The single capturing group is kept on purpose: `re-seq` then yields
;; [whole-match group] vectors, which callers unpack with `first`.
(def type-regex
  #"(?<![A-Za-z0-9_])([A-Z][A-Za-z0-9_]*|byte|short|int|long|float|double|char|boolean|void)(?![A-Za-z0-9_])")
(defn count-types-line-regex
  "Measure a single string with `type-regex`.

  Returns a two-element vector [type-chars total-chars]: the summed
  length of every `type-regex` match in `strng`, and the length of
  `strng` itself."
  [strng]
  (let [match-lengths (for [match (re-seq type-regex strng)]
                        ;; each match is [whole-match group]; measure the whole match
                        (count (first match)))
        type-chars (reduce + 0 match-lengths)]
    [type-chars (count strng)]))
(defn count-types-regex
  "Measure the fraction of code characters taken up by types, using
  `type-regex`.

  Sums the per-line [type-chars total-chars] pairs from
  `count-types-line-regex` over every line produced by
  `stream-code-lines` for `dir-path`, and returns type-chars /
  total-chars as a float. Returns 0.0 when there are no characters at
  all, rather than dividing by zero."
  [dir-path]
  (let [[types total]
        (reduce (fn [accum line-counts]
                  ;; element-wise sum of two [types total] pairs
                  (mapv + accum line-counts))
                [0 0]
                (map count-types-line-regex (stream-code-lines dir-path)))]
    (if (zero? total)
      (float 0)
      (float (/ types total)))))
(comment
  ;; REPL scratchpad -- nothing in this form is evaluated on load.

  ;; Load the file and alias the namespace:
  (load-file "java-count-type-chars.clj")
  (require '[adhoc.java-types :as jt])

  ;; Run both counters over the same tree and compare their ratios:
  ((juxt jt/count-object-types-streaming jt/count-types-regex) "source/dir/")
  ;; sample result recorded by the author:
  [0.2193496 0.21625592] ;; about 20%

  ;; Quick debugging call to list type tokens in source:
  (->> (jt/stream-code-lines "source/dir")
       (mapcat (fn [line] (map first (re-seq jt/type-regex line))))
       (take 300))
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment