eggsyntax/exercise.clj

## exercise.clj
(ns ski.wotcha
  (:require [clojure.string :as string]))

;; Based on exercise at https://drive.google.com/drive/folders/1AQN08ikQZvq0QWn9KLuhiE9kJu1Tp2a5

;;; NB: The top-level code to actually solve the problem is down at the bottom:

;; Clojure relies on Java regex, which means regexes of any
;; complexity are pretty ugly:
(def name-regex #"^(\p{IsAlphabetic}+), (\p{IsAlphabetic}+)")

(defn add-name [names line]
  (if-let [[[full-match lname fname]] (re-seq name-regex line)]
    (conj names [lname fname])
    names))

(defn rotate
  "Given a sequence s, rotate it n steps; eg rotating [1 2 3 4] by
  one step gives [2 3 4 1], and rotating it two steps gives [3 4 1 2]."
  [n s]
  (concat (drop n s) (take n s)))

;; minor utility function
(defn combine-lists
  "(combine-lists [[1 3 5] [2 4 6]]) returns [[1 2] [3 4] [5 6]]. Works for
  any number of sequences (eg passing 3 sequences returns triplets).
  Clojure's `interleave` already does something close to this, but not lazily."
  [lists]
  (apply map vector lists))

;; TODO a bit of inefficiency in the next bit is that we convert the
;; lists-so-far of first-names and last-names to a set every time this fn is
;; called. But in practice, we're only going to take 25 from the list, so it's
;; not worth worrying about. If I were actually going to submit this to a
;; prospective employer, I might deal with that, though. The obvious thing would
;; be just to store them as sets throughout. The downside of using sets for our
;; accumulation of first-names and last-names is that we can't then be sure that
;; we'll traverse those two sets in the same order, so that would complicate
;; things a bit. One decent option, although again less readable, is to create a
;; data structure that maintains (for each of first-namse and last-names) both
;; an ordered list (for later traversal) and a set (for fast membership
;; testing).

(defn add-name-if-fully-unique
  "Note that here names are assumed to be represented as a vector of
  [last-name first-name]"
  [[lnames fnames] [lname fname]]
  (if (or (contains? (set lnames) lname)
          (contains? (set fnames) fname))
    [lnames fnames] ; no change
    [(conj lnames lname) (conj fnames fname)]))

;; I'll give two versions of the following. The first is simpler, and then I'll
;; give a second version that's much more performant.
(defn fully-unique-names-simple
  "Pass a source list of names like ['Smith' 'John] and the number of
  unique-names you want"
  [names num-wanted]
  (let [unique-name-vectors (reduce add-name-if-fully-unique [] names)]
    (take num-wanted
          ;; combine vectors of last names and first names:
          (combine-lists unique-name-vectors))))

;; The trouble with the above is that it reduces eagerly over the complete
;; first-names and last-names. Since we care about efficiency here, we use
;; `reductions`, which is like `reduce`, but returns a lazy sequence of
;; intermediate effects, the nth of which will be what we want.

(defn fully-unique-names
  "Pass a source list of names like ['Smith' 'John] and the number of
  unique-names you want"
  [names num-wanted]
  (let [unique-name-vectors (reductions add-name-if-fully-unique [] names)]
    ;; combine vectors of last names and first names & take the ones we want
    (combine-lists (nth unique-name-vectors (inc num-wanted)))))

(defn rotate-names [names n]
  (let [fnames (map second names)
        lnames (map first  names)]
    (map (partial string/join ", ")
         (combine-lists [(take n lnames)
                         (rotate 1 (take n fnames))]))))

;;; Begin code for the specific problem:

(time
 (do

   ;; TODO A minor inefficiency here is that I'm iterating over
   ;; the sequence of lines in each of the next three lines. If I
   ;; wanted to improve the time, I'd use reduce f with a function that
   ;; output the full match, the last name, and the first name. But
   ;; it's pretty fast anyway, so I haven't bothered; I think this
   ;; way is slightly clearer.

   (def f (string/split-lines (slurp "test-data-v1-8.txt")))

   (def full-names (reduce add-name [] f))

   (def last-names  (map first full-names))

   (def first-names (map second full-names))

   (println "1. The unique count of full names (i.e. duplicates are counted only once)"
            (count (set full-names)))
   (println "2. The unique count of last names"
            (count (set last-names)))
   (println "3. The unique count of first names"
            (count (set first-names)))

   ;; `frequencies` produces (essentially) a list of two-element vectors: a name and
   ;; the number of times it occurs. We sort by the second element of that vector.
   (println "4. The ten most common last names (the names and number of occurrences)\n    "
            (reverse (take-last 10 (sort-by second (frequencies last-names)))))
   (println "5. The ten most common first names (the names and number of occurrences)\n    "
            (reverse (take-last 10 (sort-by second (frequencies first-names)))))

   ;; For the final two steps, I made the code completely lazy so that we'd only
   ;; have to process enough to complete our list of 25 unique names. That does
   ;; come at some cost to readability, but fortunately not too much.

   (def unique-names (reverse ; our result is backward as a quirk of laziness + efficiency
                      (fully-unique-names full-names 25)))

   (def unique-names-output (map (partial string/join ", ") unique-names))

   (println "6. A list of 25 specially unique names (see below for details)")
   (prn unique-names-output)

   (println "7. A list of 25 modified names (see below for details)")
   (prn (rotate-names unique-names 25))

   ))

;; output:
"
1. The unique count of full names (i.e. duplicates are counted only once) 48308
2. The unique count of last names 467
3. The unique count of first names 3007
4. The ten most common last names (the names and number of occurrences)
     ([Barton 143] [Lang 136] [Ortiz 135] [Hilll 134] [Hills 130] [Terry 129] [Batz 129] [Johns 128] [Romaguera 128] [Crist 127])
5. The ten most common first names (the names and number of occurrences)
     ([Keon 31] [Tara 31] [Andreanne 31] [Stephania 31] [Summer 30] [Milo 30] [Baron 30] [Kaycee 30] [Ulices 30] [Heath 30])
6. A list of 25 specially unique names (see below for details)
('Graham, Mckenna' 'Marvin, Garfield' 'McLaughlin, Mariah' 'Lang, Agustina' 'Bradtke, Nikko' 'Adams, Luis' 'Lehner, Matilde' 'Ortiz, Anita' 'Koch, Berry' 'Cartwright, Nicolas' 'Fisher, Elmo' 'Kunze, Gertrude' 'Stanton, Davin' 'Runolfsdottir, Roy' 'Rogahn, Colby' 'Tromp, Ryley' 'Hoppe, Stanley' 'Shanahan, Bethel' 'Hills, Samanta' 'McGlynn, Thad' 'Lynch, Norma' 'Bahringer, Lennie' 'Tillman, Madison' 'Stoltenberg, Donna' 'Dickinson, Sonya')
7. A list of 25 modified names (see below for details)
('Graham, Garfield' 'Marvin, Mariah' 'McLaughlin, Agustina' 'Lang, Nikko' 'Bradtke, Luis' 'Adams, Matilde' 'Lehner, Anita' 'Ortiz, Berry' 'Koch, Nicolas' 'Cartwright, Elmo' 'Fisher, Gertrude' 'Kunze, Davin' 'Stanton, Roy' 'Runolfsdottir, Colby' 'Rogahn, Ryley' 'Tromp, Stanley' 'Hoppe, Bethel' 'Shanahan, Samanta' 'Hills, Thad' 'McGlynn, Norma' 'Lynch, Lennie' 'Bahringer, Madison' 'Tillman, Donna' 'Stoltenberg, Sonya' 'Dickinson, Mckenna')
'Elapsed time: 367.249959 msecs'
"
	(ns ski.wotcha
	(:require [clojure.string :as string]))

	;; Based on exercise at https://drive.google.com/drive/folders/1AQN08ikQZvq0QWn9KLuhiE9kJu1Tp2a5

	;;; NB: The top-level code to actually solve the problem is down at the bottom:

	;; Clojure relies on Java regex, which means regexes of any
	;; complexity are pretty ugly:
	(def name-regex #"^(\p{IsAlphabetic}+), (\p{IsAlphabetic}+)")

	(defn add-name [names line]
	(if-let [[[full-match lname fname]] (re-seq name-regex line)]
	(conj names [lname fname])
	names))

	(defn rotate
	"Given a sequence s, rotate it n steps; eg rotating [1 2 3 4] by
	one step gives [2 3 4 1], and rotating it two steps gives [3 4 1 2]."
	[n s]
	(concat (drop n s) (take n s)))

	;; minor utility function
	(defn combine-lists
	"(combine-lists [[1 3 5] [2 4 6]]) returns [[1 2] [3 4] [5 6]]. Works for
	any number of sequences (eg passing 3 sequences returns triplets).
	Clojure's `interleave` already does something close to this, but not lazily."
	[lists]
	(apply map vector lists))

	;; TODO a bit of inefficiency in the next bit is that we convert the
	;; lists-so-far of first-names and last-names to a set every time this fn is
	;; called. But in practice, we're only going to take 25 from the list, so it's
	;; not worth worrying about. If I were actually going to submit this to a
	;; prospective employer, I might deal with that, though. The obvious thing would
	;; be just to store them as sets throughout. The downside of using sets for our
	;; accumulation of first-names and last-names is that we can't then be sure that
	;; we'll traverse those two sets in the same order, so that would complicate
	;; things a bit. One decent option, although again less readable, is to create a
	;; data structure that maintains (for each of first-namse and last-names) both
	;; an ordered list (for later traversal) and a set (for fast membership
	;; testing).

	(defn add-name-if-fully-unique
	"Note that here names are assumed to be represented as a vector of
	[last-name first-name]"
	[[lnames fnames] [lname fname]]
	(if (or (contains? (set lnames) lname)
	(contains? (set fnames) fname))
	[lnames fnames] ; no change
	[(conj lnames lname) (conj fnames fname)]))

	;; I'll give two versions of the following. The first is simpler, and then I'll
	;; give a second version that's much more performant.
	(defn fully-unique-names-simple
	"Pass a source list of names like ['Smith' 'John] and the number of
	unique-names you want"
	[names num-wanted]
	(let [unique-name-vectors (reduce add-name-if-fully-unique [] names)]
	(take num-wanted
	;; combine vectors of last names and first names:
	(combine-lists unique-name-vectors))))

	;; The trouble with the above is that it reduces eagerly over the complete
	;; first-names and last-names. Since we care about efficiency here, we use
	;; `reductions`, which is like `reduce`, but returns a lazy sequence of
	;; intermediate effects, the nth of which will be what we want.

	(defn fully-unique-names
	"Pass a source list of names like ['Smith' 'John] and the number of
	unique-names you want"
	[names num-wanted]
	(let [unique-name-vectors (reductions add-name-if-fully-unique [] names)]
	;; combine vectors of last names and first names & take the ones we want
	(combine-lists (nth unique-name-vectors (inc num-wanted)))))

	(defn rotate-names [names n]
	(let [fnames (map second names)
	lnames (map first names)]
	(map (partial string/join ", ")
	(combine-lists [(take n lnames)
	(rotate 1 (take n fnames))]))))

	;;; Begin code for the specific problem:

	(time
	(do

	;; TODO A minor inefficiency here is that I'm iterating over
	;; the sequence of lines in each of the next three lines. If I
	;; wanted to improve the time, I'd use reduce f with a function that
	;; output the full match, the last name, and the first name. But
	;; it's pretty fast anyway, so I haven't bothered; I think this
	;; way is slightly clearer.

	(def f (string/split-lines (slurp "test-data-v1-8.txt")))

	(def full-names (reduce add-name [] f))

	(def last-names (map first full-names))

	(def first-names (map second full-names))

	(println "1. The unique count of full names (i.e. duplicates are counted only once)"
	(count (set full-names)))
	(println "2. The unique count of last names"
	(count (set last-names)))
	(println "3. The unique count of first names"
	(count (set first-names)))

	;; `frequencies` produces (essentially) a list of two-element vectors: a name and
	;; the number of times it occurs. We sort by the second element of that vector.
	(println "4. The ten most common last names (the names and number of occurrences)\n "
	(reverse (take-last 10 (sort-by second (frequencies last-names)))))
	(println "5. The ten most common first names (the names and number of occurrences)\n "
	(reverse (take-last 10 (sort-by second (frequencies first-names)))))

	;; For the final two steps, I made the code completely lazy so that we'd only
	;; have to process enough to complete our list of 25 unique names. That does
	;; come at some cost to readability, but fortunately not too much.

	(def unique-names (reverse ; our result is backward as a quirk of laziness + efficiency
	(fully-unique-names full-names 25)))

	(def unique-names-output (map (partial string/join ", ") unique-names))

	(println "6. A list of 25 specially unique names (see below for details)")
	(prn unique-names-output)

	(println "7. A list of 25 modified names (see below for details)")
	(prn (rotate-names unique-names 25))

	))

	;; output:
	"
	1. The unique count of full names (i.e. duplicates are counted only once) 48308
	2. The unique count of last names 467
	3. The unique count of first names 3007
	4. The ten most common last names (the names and number of occurrences)
	([Barton 143] [Lang 136] [Ortiz 135] [Hilll 134] [Hills 130] [Terry 129] [Batz 129] [Johns 128] [Romaguera 128] [Crist 127])
	5. The ten most common first names (the names and number of occurrences)
	([Keon 31] [Tara 31] [Andreanne 31] [Stephania 31] [Summer 30] [Milo 30] [Baron 30] [Kaycee 30] [Ulices 30] [Heath 30])
	6. A list of 25 specially unique names (see below for details)
	('Graham, Mckenna' 'Marvin, Garfield' 'McLaughlin, Mariah' 'Lang, Agustina' 'Bradtke, Nikko' 'Adams, Luis' 'Lehner, Matilde' 'Ortiz, Anita' 'Koch, Berry' 'Cartwright, Nicolas' 'Fisher, Elmo' 'Kunze, Gertrude' 'Stanton, Davin' 'Runolfsdottir, Roy' 'Rogahn, Colby' 'Tromp, Ryley' 'Hoppe, Stanley' 'Shanahan, Bethel' 'Hills, Samanta' 'McGlynn, Thad' 'Lynch, Norma' 'Bahringer, Lennie' 'Tillman, Madison' 'Stoltenberg, Donna' 'Dickinson, Sonya')
	7. A list of 25 modified names (see below for details)
	('Graham, Garfield' 'Marvin, Mariah' 'McLaughlin, Agustina' 'Lang, Nikko' 'Bradtke, Luis' 'Adams, Matilde' 'Lehner, Anita' 'Ortiz, Berry' 'Koch, Nicolas' 'Cartwright, Elmo' 'Fisher, Gertrude' 'Kunze, Davin' 'Stanton, Roy' 'Runolfsdottir, Colby' 'Rogahn, Ryley' 'Tromp, Stanley' 'Hoppe, Bethel' 'Shanahan, Samanta' 'Hills, Thad' 'McGlynn, Norma' 'Lynch, Lennie' 'Bahringer, Madison' 'Tillman, Donna' 'Stoltenberg, Sonya' 'Dickinson, Mckenna')
	'Elapsed time: 367.249959 msecs'
	"