Last active
October 27, 2016 15:16
-
-
Save sampsyo/1f810eb538676a2898b7a2a4fb56269a to your computer and use it in GitHub Desktop.
GADTs for features
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* General features. This starts out as an "empty" type, which we'll then | |
extend. We also define feature values and feature vectors. *) | |
type 'a feat = .. | |
type 'a value = 'a feat * float | |
type 'a fvec = 'a value list | |
(* Domain adaptation. *) | |
type 'a feat += Adapted : ('a feat * 'b) -> ('a * 'b) feat | |
let adapt = fun (v : 'a value) (b : 'b) : ('a * 'b) value -> | |
let f, n = v in | |
Adapted (f, b), n | |
let adapt' = fun (v : 'a value) (b : 'b) -> | |
let f, n = v in | |
let general = Adapted (f, None) in | |
let specific = Adapted (f, Some b) in | |
[(general, n); (specific, n)] | |
(* String utilities. These are not very interesting. *) | |
let rec split_from = fun s i -> | |
let space = String.index_from s i ' ' in | |
let l = String.sub s 0 i in | |
let r = String.sub s i space in | |
l::(split_from r space) | |
let split = fun s -> split_from s 0 | |
let rec prefix_count = fun (l : 'a list) (x : 'a) : (int * 'a list) -> | |
match l with | |
| [] -> 0, [] | |
| hd::tl -> | |
if hd = x then | |
let count, rest = prefix_count tl x in | |
count + 1, rest | |
else | |
0, l | |
(* Like Unix `uniq -c`. *) | |
let rec uniqc = fun (l : 'a list) : ('a * int) list -> | |
match l with | |
| [] -> [] | |
| hd::tl -> | |
let count, rest = prefix_count l hd in | |
(hd, count)::(uniqc rest) | |
(* Featurize a document. *) | |
type doc = string | |
type 'a feat += Word : string -> doc feat | |
type 'a feat += Length : doc feat | |
let featurize = fun (d : doc) : doc fvec -> | |
let tokens = split d in | |
let sltokens = | |
List.sort String.compare (List.map String.lowercase_ascii tokens) in | |
let length = Length, float_of_int (List.length sltokens) in | |
let words = List.map | |
(fun (w, n) -> Word w, (float_of_int n)) | |
(uniqc sltokens) in | |
length :: words |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment