Skip to content

Instantly share code, notes, and snippets.

@sampsyo
Last active October 27, 2016 15:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sampsyo/1f810eb538676a2898b7a2a4fb56269a to your computer and use it in GitHub Desktop.
Save sampsyo/1f810eb538676a2898b7a2a4fb56269a to your computer and use it in GitHub Desktop.
GADTs for features
(* General features. This starts out as an "empty" type, which we'll then
extend. We also define feature values and feature vectors. *)
type 'a feat = ..
type 'a value = 'a feat * float
type 'a fvec = 'a value list
(* Domain adaptation. *)
type 'a feat += Adapted : ('a feat * 'b) -> ('a * 'b) feat
let adapt = fun (v : 'a value) (b : 'b) : ('a * 'b) value ->
let f, n = v in
Adapted (f, b), n
let adapt' = fun (v : 'a value) (b : 'b) ->
let f, n = v in
let general = Adapted (f, None) in
let specific = Adapted (f, Some b) in
[(general, n); (specific, n)]
(* String utilities. These are not very interesting. *)
let rec split_from = fun s i ->
let space = String.index_from s i ' ' in
let l = String.sub s 0 i in
let r = String.sub s i space in
l::(split_from r space)
let split = fun s -> split_from s 0
let rec prefix_count = fun (l : 'a list) (x : 'a) : (int * 'a list) ->
match l with
| [] -> 0, []
| hd::tl ->
if hd = x then
let count, rest = prefix_count tl x in
count + 1, rest
else
0, l
(* Like Unix `uniq -c`. *)
let rec uniqc = fun (l : 'a list) : ('a * int) list ->
match l with
| [] -> []
| hd::tl ->
let count, rest = prefix_count l hd in
(hd, count)::(uniqc rest)
(* Featurize a document. *)
type doc = string
type 'a feat += Word : string -> doc feat
type 'a feat += Length : doc feat
let featurize = fun (d : doc) : doc fvec ->
let tokens = split d in
let sltokens =
List.sort String.compare (List.map String.lowercase_ascii tokens) in
let length = Length, float_of_int (List.length sltokens) in
let words = List.map
(fun (w, n) -> Word w, (float_of_int n))
(uniqc sltokens) in
length :: words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment