Skip to content

Instantly share code, notes, and snippets.

@usametov
Created July 16, 2023 06:06
Show Gist options
  • Save usametov/3c46a8012d5913e53f2810b28d0c5af6 to your computer and use it in GitHub Desktop.
Save usametov/3c46a8012d5913e53f2810b28d0c5af6 to your computer and use it in GitHub Desktop.
Clojure implementation of LangChain's recursive splitter
(ns astanova.recursive-splitter
"implementation of LangChain's recursive code splitter"
(:require [clojure.string :as s]))
;; Ordered separator patterns for Java source, tried first to last.
;; Keywords are anchored with \b so they only match as whole words:
;; an unanchored #"if" or #"for" would also split inside identifiers
;; such as "notify" or "format".
(defonce java-splitters
  [#"\bclass " #"\bpublic " #"\bprotected "
   #"\bprivate " #"\bstatic " #"\bif\b"
   #"\bfor\b" #"\bwhile\b" #"\bswitch\b"
   #"\bcase\b" #"\r\n" #"\t\t"])
;; Ordered separator patterns for JavaScript source, tried first to last.
;; Keywords are anchored with \b so they only match as whole words:
;; an unanchored #"let " or #"if" would also split inside words such as
;; "outlet " or "notify".
(defonce js-splitters
  [#"\bfunction " #"\bconst " #"\blet "
   #"\bvar " #"\bclass " #"\bif\b" #"\bfor\b"
   #"\bwhile\b" #"\bswitch\b" #"\bcase\b" #"\bdefault "])
;; Ordered separator patterns for Python source, tried first to last.
;; Keywords are anchored with \b so they only match as whole words
;; (e.g. "class " no longer matches inside "subclass ").
(defonce py-splitters
  [#"\bclass " #"\bdef " #"\n\tdef " #"\n\n"])
;; Ordered separator patterns for C++ source, tried first to last.
;; Keywords are anchored with \b so they only match as whole words:
;; an unanchored #"int " would also split inside "print " or "uint ",
;; and #"for" inside "before".
(defonce cpp-splitters
  [#"\bclass " #"\bvoid " #"\bint " #"\bfloat " #"\bdouble " #"\bif\b"
   #"\bfor\b" #"\bwhile\b" #"\bswitch\b" #"\bcase\b" #"\n\n"])
;; Default maximum chunk length, in characters. Strings at or below this
;; length are never split further.
(defonce chunk-size 400)

(defn build-splitter
  "Returns a function of one string that splits it on `regex`, but only
  when the string is longer than `max-size` characters (default:
  `chunk-size`). Shorter strings come back unchanged as a one-element
  vector.

  The split happens at the position just before each match (zero-width
  lookahead), so the matched separator stays at the start of the piece
  that follows it. Plain `s/split` would discard the separator, dropping
  keywords such as `class ` from the resulting chunks. Concatenating the
  returned pieces reproduces the input string."
  ([regex]
   (build-splitter regex chunk-size))
  ([^java.util.regex.Pattern regex max-size]
   ;; Wrap the caller's pattern in a lookahead, preserving its flags.
   (let [boundary (java.util.regex.Pattern/compile
                    (str "(?=" (.pattern regex) ")")
                    (.flags regex))]
     (fn [txt]
       (if (< max-size (count txt))
         (s/split txt boundary)
         [txt])))))
(defn split-step
  "Runs one splitting pass. Takes a state map with `:txt-seq` (the current
  chunks) and `:splitters` (splitter functions still to apply). Applies
  the first splitter to every chunk, discards blank pieces, trims the
  rest, and returns the next state with that splitter removed.
  Returns nil once no splitters remain."
  [{:keys [txt-seq splitters]}]
  (when (pos? (count splitters))
    (let [apply-splitter (first splitters)
          pieces (->> txt-seq
                      (mapcat apply-splitter)
                      (remove s/blank?)
                      (map s/trim))]
      {:txt-seq pieces
       :splitters (rest splitters)})))
(defn recursive-split
  "Splits `code-txt` into a sequence of chunks by applying each pattern in
  `regex-separators` in order. Each pattern is turned into a splitter with
  `build-splitter`, and `split-step` is run until it returns nil; the
  chunks of the last non-nil state are returned. With no separators the
  result is `[code-txt]`."
  [code-txt regex-separators]
  (loop [state {:txt-seq [code-txt]
                :splitters (map build-splitter regex-separators)}]
    (if-some [next-state (split-step state)]
      (recur next-state)
      (:txt-seq state))))
;; REPL scratch area: forms inside `comment` are not evaluated when the
;; namespace is loaded; evaluate them one at a time from an editor/REPL.
(comment
;; Fetches a sample Java source file over HTTP (requires network access).
(def java-code
(slurp "https://raw.githubusercontent.com/Convex-Dev/convex/develop/convex-cli/src/main/java/convex/cli/AccountBalance.java"))
;; Splits the downloaded source using the Java separator list.
(recursive-split java-code java-splitters)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment