Created
November 23, 2012 04:23
-
-
Save kuenishi/4133995 to your computer and use it in GitHub Desktop.
importer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.o | |
*~ | |
.omakedb* | |
O*.omc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* see http://mjambon.com/json-wheel-doc/ for Json doc *) | |
(* see http://mmzeeman.home.xs4all.nl/ocaml/expat-doc/Expat.html for Xml processing *) | |
(* "../enwiki-20121101-pages-meta-current1.xml-p000000010p000010000.bz2" *) | |
(* Command-line arguments. The input file name (second argument) is read
   before the bucket name (first argument); a missing argument makes the
   array access fail at program start. *)
let input_filename = Array.get Sys.argv 2
let bucket_name = Array.get Sys.argv 1

(* Number of bytes requested from the bzip2 stream per read. *)
let buflen = 64 * 1024

(* Stack of JSON values under construction: the element handlers push one
   entry per opened element inside a page and pop it when the element ends. *)
let s = Stack.create ()

(* True while the parser is between a page start tag and its end tag. *)
let in_page = ref false

(* Number of pages processed so far. *)
let count = ref 0

(* Cached Riak connection; None until a connection is first requested. *)
let riakc = ref None
(* [string_of_json json] serialises [json] to a string through
   Json_io.string_of_json, called with allow_nan:true, compact:false and
   recursive:true. *)
let string_of_json json =
  Json_io.string_of_json ~allow_nan:true ~compact:false ~recursive:true json

(* [pj json] prints the serialised form of [json] on stdout, followed by a
   newline. It goes through [string_of_json] so that both functions always
   use the same serialisation options. *)
let pj json = print_endline (string_of_json json)
(* [read_all_bytes psr bzp pos] reads the bzip2 stream [bzp] in chunks of at
   most [buflen] bytes, feeds every chunk to the Expat parser [psr], and
   returns [pos] plus the total number of decompressed bytes read.

   End of input is signalled by [End_of_file] coming out of [Bz2.read]. Only
   that call is guarded: an [End_of_file] raised while parsing (for example
   from inside one of the parser callbacks) now propagates to the caller
   instead of being mistaken for the end of the input.

   The loop is a proper tail call and reuses a single read buffer; each chunk
   is copied out with [String.sub] before it is handed to the parser. *)
let read_all_bytes psr bzp pos =
  let buf = String.create buflen in
  let rec loop total =
    let chunk_len =
      try Some (Bz2.read bzp buf 0 buflen) with End_of_file -> None
    in
    match chunk_len with
    | None -> total
    | Some n ->
      Expat.parse psr (String.sub buf 0 n);
      loop (total + n)
  in
  loop pos
(* Start-element callback for the XML parser.

   A "page" start tag switches page mode on. While page mode is on (which
   includes the "page" tag itself) every start tag pushes a fresh empty JSON
   object on the stack [s]. Start tags seen outside a page are ignored.
   The attribute list [attrs] is deliberately not used. *)
let elem_handler tag attrs =
  if tag = "page" then in_page := true;
  if !in_page then Stack.push (Json_type.Object []) s
(* Raised by [get_title] when a page value is not a JSON object or carries no
   string-valued "title" field. *)
exception Sucks

(* [riak_client ()] returns the connection cached in [riakc]. When none is
   cached yet it connects to 127.0.0.1 port 8087, stores the new connection
   in [riakc] and returns it. *)
let riak_client () =
  match !riakc with
  | Some conn -> conn
  | None ->
    let conn = Riak.riak_connect_with_defaults "127.0.0.1" 8087 in
    riakc := Some conn;
    conn
(* [close_riak_client ()] disconnects the cached Riak connection, if any.

   The cache is cleared before disconnecting, so a later [riak_client ()]
   opens a fresh connection instead of handing back the closed one, and a
   second call to this function is a no-op. *)
let close_riak_client () =
  match !riakc with
  | None -> ()
  | Some conn ->
    riakc := None;
    Riak.riak_disconnect conn
(* [get_title json_obj] returns the string stored under the first "title"
   field of [json_obj] whose value is a JSON string.
   Raises [Sucks] when [json_obj] is not an object or has no such field. *)
let get_title json_obj =
  let is_string_title = function
    | ("title", Json_type.String _) -> true
    | _ -> false
  in
  match json_obj with
  | Json_type.Object pairs ->
    begin
      match List.filter is_string_title pairs with
      | (_, Json_type.String title) :: _ -> title
      | _ -> raise Sucks
    end
  | _ -> raise Sucks
(* [process_page p] stores the page [p] in Riak: the page title is the key,
   the serialised JSON of [p] is the value, and the bucket is [bucket_name].
   The value returned by the put call is discarded.
   Raises [Sucks] (from [get_title]) when [p] has no usable title; in that
   case no connection is requested. *)
let process_page p =
  let key = get_title p in
  let conn = riak_client () in
  ignore (Riak.riak_put conn bucket_name (Some key) (string_of_json p) [])
(* [not_empty pair] is false only for a "__text" field whose string value is
   empty once trimmed; every other field is kept. *)
let not_empty = function
  | ("__text", Json_type.String text) -> String.trim text <> ""
  | _ -> true
(* [unwrap json] simplifies an object built by the handlers, after dropping
   the fields rejected by [not_empty]:
   - no field left: the result is Null;
   - a single "__text" field left: the result is that field value;
   - otherwise: an object holding the remaining fields.
   Values that are not objects are returned unchanged. *)
let unwrap json =
  match json with
  | Json_type.Object pairs ->
    let kept = List.filter not_empty pairs in
    begin
      match kept with
      | [] -> Json_type.Null
      | [ ("__text", text) ] -> text
      | _ -> Json_type.Object kept
    end
  | other -> other
(* End-element callback for the XML parser.

   On a "page" end tag: page mode is switched off, the finished page is
   popped, unwrapped and handed to [process_page], the page counter is
   incremented, and a dot is printed every 100 pages as a progress mark.

   On any other end tag inside a page: the finished element is popped and
   unwrapped, then the enclosing value is popped. If the enclosing value is
   an object and the finished element is not Null, the element is added to
   the front of the enclosing fields under its tag name. The enclosing value
   (updated or not) is pushed back.

   End tags seen outside a page are ignored. *)
let elem_ehandler tag =
  if tag = "page" then begin
    in_page := false;
    process_page (unwrap (Stack.pop s));
    incr count;
    if !count mod 100 = 0 then begin
      print_string ".";
      flush_all ()
    end
  end else if !in_page then begin
    let child = unwrap (Stack.pop s) in
    let enclosing = Stack.pop s in
    let merged =
      match enclosing with
      | Json_type.Object fields when child <> Json_type.Null ->
        Json_type.Object ((tag, child) :: fields)
      | unchanged -> unchanged
    in
    Stack.push merged s
  end
(* [append_json_string json_str str] returns a JSON string: the text of
   [json_str] followed by [str] when [json_str] is itself a JSON string,
   and just [str] for any other kind of value. *)
let append_json_string json_str str =
  let combined =
    match json_str with
    | Json_type.String existing -> existing ^ str
    | _ -> str
  in
  Json_type.String combined
(* Character-data callback for the XML parser. Does nothing outside a page.

   Inside a page the value on top of the stack is replaced as follows:
   - an object whose first field is "__text": [txt] is appended to that
     field through [append_json_string];
   - any other object: a new "__text" field holding [txt] is added in front;
   - a value that is not an object: pushed back unchanged. *)
let data_handler txt =
  if !in_page then begin
    let updated =
      match Stack.pop s with
      | Json_type.Object (("__text", earlier) :: rest) ->
        Json_type.Object (("__text", append_json_string earlier txt) :: rest)
      | Json_type.Object fields ->
        Json_type.Object (("__text", Json_type.String txt) :: fields)
      | other -> other
    in
    Stack.push updated s
  end
(* Program entry point: decompresses [input_filename], parses it as XML and
   stores every page through the element handlers.

   Fixes over the previous version:
   - the page count is printed on every run; it used to be parsed as part of
     the exception handler and was therefore printed only after a failure;
   - backtrace recording is enabled and the exception itself is printed, so
     a failure report is no longer empty (a useful backtrace additionally
     needs a build with debugging information);
   - the input and the Riak connection are released on the failure path too;
   - the parser is told when the document is complete.

   As before, a failure is reported on stdout and does not change the exit
   status. *)
let () =
  print_endline input_filename;
  Printexc.record_backtrace true;
  let fp = open_in input_filename in
  (* http://camlbz2.forge.ocamlcore.org/api/Bz2.html *)
  let bzp = Bz2.open_in fp in
  let psr = Expat.parser_create None in
  Expat.set_character_data_handler psr data_handler;
  Expat.set_start_element_handler psr elem_handler;
  Expat.set_end_element_handler psr elem_ehandler;
  let report exn =
    (* Fetch the backtrace first: later calls could overwrite it. *)
    let backtrace = Printexc.get_backtrace () in
    print_endline (Printexc.to_string exn);
    print_endline backtrace
  in
  begin
    try
      let size = read_all_bytes psr bzp 0 in
      Expat.final psr;
      print_int size;
      print_endline " bytes read!"
    with exn -> report exn
  end;
  (* Cleanup runs whether or not the import succeeded. *)
  begin
    try
      Bz2.close_in bzp;
      close_in fp;
      close_riak_client ()
    with exn -> report exn
  end;
  Printf.printf "%d pages!\n" !count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build description for the importer program.
.PHONY: all install clean test

# .SUBDIRS:

# Resolve the libraries below through ocamlfind.
USE_OCAMLFIND = true

# ocamlfind packages the program is built against.
OCAMLPACKS[] =
    bz2
    expat
    json-wheel
    riak

# Stop early with an explanation when ocamlfind is unavailable.
if $(not $(OCAMLFIND_EXISTS))
    eprintln(This project requires ocamlfind\, but it was not found.)
    eprintln(You need to install ocamlfind and run "omake --configure".)
    exit 1

# OCAMLINCLUDES +=
# NATIVE_ENABLED = $(OCAMLOPT_EXISTS)
# BYTE_ENABLED = $(not $(OCAMLOPT_EXISTS))
# OCAMLFLAGS +=
# OCAMLCFLAGS +=
# OCAMLOPTFLAGS +=
# OCAML_LINK_FLAGS +=
# OCAML_BYTE_LINK_FLAGS +=
# OCAML_NATIVE_LINK_FLAGS +=

# Source modules (without extension) that make up the program.
FILES[] =
    importer

PROGRAM = importer

# OCAML_LIBS +=
# OCAML_CLIBS +=
# OCAML_OTHER_LIBS +=
# OCAML_LIB_FLAGS +=

.DEFAULT: $(OCamlProgram $(PROGRAM), $(FILES))

# NOTE(review): the importer reads a bucket name and an input file from its
# command line, and this target runs it without arguments - confirm the
# arguments the test run is meant to use.
test: $(PROGRAM)
    ./$(PROGRAM)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open build/C | |
open build/OCaml | |
open build/LaTeX | |
# | |
# The command-line variables are defined *after* the | |
# standard configuration has been loaded. | |
# | |
DefineCommandVars() | |
# | |
# Include the OMakefile in this directory. | |
# | |
.SUBDIRS: . |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment