Skip to content

Instantly share code, notes, and snippets.

@luisgabriel
Created April 21, 2013 19:35
Show Gist options
  • Save luisgabriel/5430769 to your computer and use it in GitHub Desktop.
Save luisgabriel/5430769 to your computer and use it in GitHub Desktop.
(ns tsearch.lexer
(:require [clojure.string :as cjstr]))
;; Code points that bound the ASCII lower-case, upper-case and digit ranges.
;; `int` yields the character's code point. Character/getNumericValue must not
;; be used for this: it returns 10 for both \a and \A, 35 for both \z and \Z,
;; and assigns the same values to full-width letters, Roman numerals and
;; non-ASCII digits, so a range check on it accepts non-ASCII characters.
(def a (int \a))
(def z (int \z))
(def A (int \A))
(def Z (int \Z))
(def zero (int \0))
(def nine (int \9))

(defn is-ascii-alpha-num
  "Returns true when character `c` is an ASCII letter (a-z, A-Z) or an ASCII
  digit (0-9), and false for every other character."
  [c]
  (let [n (int c)]
    (or (<= a n z)
        (<= A n Z)
        (<= zero n nine))))
(defn is-valid
  "Returns true when character `c` should survive filtering: it is an ASCII
  letter or digit, a Unicode space character, or an ASCII control whitespace
  character (tab, newline, vertical tab, form feed, carriage return)."
  [c]
  (or (is-ascii-alpha-num c)
      (Character/isSpaceChar c)
      ;; Character/isSpaceChar is false for control whitespace. Only newline
      ;; used to be let through, so a tab or carriage return between two words
      ;; was dropped and the words were glued into a single token. These are
      ;; exactly the control characters matched by the regex class \s.
      (contains? #{\tab \newline \u000B \formfeed \return} c)))
(defn lower-and-replace
  "Normalizes a single character: a newline becomes a space, anything else is
  converted to lower case."
  [c]
  ;; The comparison is done on the string form of `c`.
  (let [line-break? (= "\n" (str c))]
    (if line-break?
      \space
      (Character/toLowerCase c))))
(defn tokenize
  "Splits `content` into a vector of lower-cased word tokens.

  Characters rejected by `is-valid` are removed, the remaining ones are passed
  through `lower-and-replace`, and the resulting text is cut into maximal runs
  of non-whitespace characters. Returns an empty vector when there are no
  tokens (empty or all-whitespace input)."
  [content]
  (let [kept    (filter is-valid content)
        lowered (map lower-and-replace kept)]
    ;; Splitting on #"\s+" yields a spurious "" token when the text starts
    ;; with whitespace, and [""] for empty text. Matching the tokens with
    ;; re-seq never produces empty strings. `vec` keeps the vector return
    ;; type and turns re-seq's nil (no match) into [].
    (vec (re-seq #"\S+" (apply str lowered)))))
(defn process-content
  "Tokenizes `content` and returns a map from each token to the zero-based
  positions at which it occurs in the token sequence.

  Positions are conj'ed onto nil and then onto the resulting list, so each
  value is a list with the most recent occurrence first."
  [content]
  (reduce (fn [index [pos word]]
            (update-in index [word] conj pos))
          {}
          (map-indexed vector (tokenize content))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment