Skip to content

Instantly share code, notes, and snippets.

@kuenishi
Created November 23, 2012 04:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kuenishi/4133995 to your computer and use it in GitHub Desktop.
Save kuenishi/4133995 to your computer and use it in GitHub Desktop.
importer
*.o
*~
.omakedb*
O*.omc
(* see http://mjambon.com/json-wheel-doc/ for Json doc *)
(* see http://mmzeeman.home.xs4all.nl/ocaml/expat-doc/Expat.html for Xml processing *)
(* Example input file (a bzip2-compressed Wikipedia XML dump):
   "../enwiki-20121101-pages-meta-current1.xml-p000000010p000010000.bz2" *)
(* Command line: <program> <bucket> <input.xml.bz2>.
   Fail with a usage message instead of an "index out of bounds"
   exception when an argument is missing. *)
let () =
  if Array.length Sys.argv < 3 then begin
    prerr_endline
      ("usage: " ^ Sys.executable_name ^ " <bucket> <input.xml.bz2>");
    exit 2
  end

(* Path of the bzip2-compressed XML file to import (second argument). *)
let input_filename = Sys.argv.(2)

(* Name of the Riak bucket the pages are written to (first argument). *)
let bucket_name = Sys.argv.(1)

(* Maximum number of bytes requested from the bz2 stream per read. *)
let buflen = 65536

(* Stack of JSON values under construction; the element handlers push one
   entry per open XML element inside a <page>. *)
let s = Stack.create ()

(* True while the parser is between a <page> start tag and its end tag. *)
let in_page = ref false

(* Number of pages processed so far. *)
let count = ref 0

(* Riak connection, created on first use by riak_client. *)
let riakc = ref None
(* Debugging helper: render [json] as non-compact JSON text and print it,
   followed by a newline, on standard output. *)
let pj json =
  let text =
    Json_io.string_of_json ~recursive:true ~compact:false ~allow_nan:true json
  in
  print_endline text
(* Serialize [json] to a string using the same Json_io settings as [pj]
   (non-compact output, NaN allowed, recursive). *)
let string_of_json json =
  Json_io.string_of_json
    ~recursive:true
    ~compact:false
    ~allow_nan:true
    json
(* Feed the whole bz2 stream [bzp] to the Expat parser [psr], at most
   [buflen] bytes at a time.

   Returns [pos] plus the total number of decompressed bytes read.

   Only the End_of_file raised by Bz2.read itself ends the loop. Any
   exception raised by Expat.parse or by the handlers it invokes
   (End_of_file included) propagates to the caller instead of being
   mistaken for the normal end of the input.

   The loop is tail-recursive: no exception handler is left installed
   around the recursive call, so stack use does not grow with input size. *)
let read_all_bytes psr bzp pos =
  (* One buffer is reused for every read; String.sub copies the filled
     prefix before it is handed to the parser. *)
  let buf = String.create buflen in
  let rec loop total =
    let chunk =
      try Some (Bz2.read bzp buf 0 buflen)
      with End_of_file -> None
    in
    match chunk with
    | None -> total
    | Some bytes_read ->
      Expat.parse psr (String.sub buf 0 bytes_read);
      loop (total + bytes_read)
  in
  loop pos
(* Start-element handler registered with Expat.
   A "page" tag switches [in_page] on. While inside a page (the "page" tag
   itself included) every start tag pushes a fresh empty JSON object on the
   stack [s]. XML attributes are deliberately not preserved. Tags outside
   a page are ignored. *)
let elem_handler tag _attrs =
  if tag = "page" then in_page := true;
  if !in_page then Stack.push (Json_type.Object []) s
(* Raised by get_title when its argument is not a JSON object or has no
   string-valued "title" field. *)
exception Sucks
(* Return the shared Riak connection, opening it on first use.
   The connection goes to 127.0.0.1:8087 and is cached in [riakc] so later
   calls reuse it. *)
let riak_client () =
  match !riakc with
  | Some conn -> conn
  | None ->
    let conn = Riak.riak_connect_with_defaults "127.0.0.1" 8087 in
    riakc := Some conn;
    conn
(* Disconnect the cached Riak connection, if one was opened.
   The cache is cleared before disconnecting, so a later riak_client ()
   opens a fresh connection instead of returning the closed handle, even
   when the disconnect itself raises. Does nothing when no connection
   exists. *)
let close_riak_client () =
  match !riakc with
  | None -> ()
  | Some c ->
    riakc := None;
    Riak.riak_disconnect c
(* Return the string stored under the first ("title", String _) field of
   the JSON object [json_obj].
   Raises Sucks when [json_obj] is not an object or has no such field. *)
let get_title json_obj =
  let fields =
    match json_obj with
    | Json_type.Object pairs -> pairs
    | _ -> raise Sucks
  in
  let rec first_title = function
    | ("title", Json_type.String title) :: _ -> title
    | _ :: rest -> first_title rest
    | [] -> raise Sucks
  in
  first_title fields
(* Store the page [p] in Riak: the key is the page title, the value is the
   JSON text of [p], and the bucket is [bucket_name]. No put options are
   passed and the result of riak_put is discarded.
   The title is extracted before the connection is obtained, so a page
   without a title raises Sucks without touching Riak. *)
let process_page p =
  let key = Some (get_title p) in
  let conn = riak_client () in
  ignore (Riak.riak_put conn bucket_name key (string_of_json p) [])
(* Field filter used by [unwrap]: false only for a "__text" field whose
   string value is empty after String.trim; true for every other field. *)
let not_empty (key, value) =
  match key, value with
  | "__text", Json_type.String text -> String.trim text <> ""
  | _ -> true
(* Simplify a JSON value built by the element handlers.
   For an object, whitespace-only "__text" fields are dropped first; then
     - no field left            -> Null
     - a single "__text" field  -> the value of that field
     - anything else            -> an object holding the remaining fields.
   Non-object values are returned unchanged. *)
let unwrap json =
  match json with
  | Json_type.Object pairs ->
    let kept = List.filter not_empty pairs in
    (match kept with
     | [] -> Json_type.Null
     | [ ("__text", text) ] -> text
     | _ -> Json_type.Object kept)
  | other -> other
(* End-element handler registered with Expat.

   On "</page>": leave page mode, pop the finished page from [s], unwrap it
   and hand it to process_page, then bump [count]; a "." is printed and
   all output flushed after every 100th page.

   On any other end tag inside a page: pop the finished child and its
   parent. A non-Null child is prepended to a parent object as a
   (tag, child) field; otherwise the parent is pushed back unchanged.

   End tags outside a page are ignored. *)
let elem_ehandler tag =
  if tag = "page" then begin
    in_page := false;
    process_page (unwrap (Stack.pop s));
    incr count;
    if !count mod 100 = 0 then begin
      print_string ".";
      flush_all ()
    end
  end else if !in_page then begin
    let child = unwrap (Stack.pop s) in
    let parent = Stack.pop s in
    let merged =
      match parent, child with
      | _, Json_type.Null -> parent
      | Json_type.Object fields, _ -> Json_type.Object ((tag, child) :: fields)
      | _, _ -> parent
    in
    Stack.push merged s
  end
(* Build a JSON string from [json_str] extended with [str].
   When [json_str] is already a JSON string, [str] is appended to its
   contents; for any other JSON value the result is just [str]. *)
let append_json_string json_str str =
  let combined =
    match json_str with
    | Json_type.String txt -> txt ^ str
    | _ -> str
  in
  Json_type.String combined
(* Character-data handler registered with Expat.
   Outside a page the text is ignored. Inside a page the text is attached
   to the object on top of [s]:
     - if its first field is "__text", [txt] is appended to that field;
     - otherwise a new ("__text", String txt) field is prepended.
   A top-of-stack value that is not an object is left as it is. *)
let data_handler txt =
  if !in_page then begin
    let top = Stack.pop s in
    let updated =
      match top with
      | Json_type.Object (("__text", prev) :: rest) ->
        Json_type.Object (("__text", append_json_string prev txt) :: rest)
      | Json_type.Object fields ->
        Json_type.Object (("__text", Json_type.String txt) :: fields)
      | other -> other
    in
    Stack.push updated s
  end
(* Program entry point: decompress [input_filename], parse it with Expat
   and let the handlers push every <page> to Riak.

   Fixes over the previous version:
   - the page count is printed on every run; before, "with _ -> a; b" made
     the final printf part of the exception handler, so it ran only on
     failure;
   - a failure is reported with the exception text (and a backtrace when
     the program is built with debug info) instead of an empty line, and
     the process exits with status 1;
   - the bz2 handle, the input channel and the Riak connection are
     released on failure as well as on success;
   - the compressed input is opened in binary mode. *)
let () =
  print_endline input_filename;
  Printexc.record_backtrace true;
  let fp = open_in_bin input_filename in
  let bzp = Bz2.open_in fp in (* http://camlbz2.forge.ocamlcore.org/api/Bz2.html *)
  let psr = Expat.parser_create None in
  Expat.set_character_data_handler psr data_handler;
  Expat.set_start_element_handler psr elem_handler;
  Expat.set_end_element_handler psr elem_ehandler;
  (* Each resource is released independently; a failing close is reported
     on stderr and does not prevent the remaining ones from running. *)
  let release () =
    let attempt what close =
      try close ()
      with e ->
        prerr_endline
          ("cleanup (" ^ what ^ ") failed: " ^ Printexc.to_string e)
    in
    attempt "bz2 stream" (fun () -> Bz2.close_in bzp);
    attempt "input file" (fun () -> close_in fp);
    attempt "riak connection" close_riak_client
  in
  let ok =
    try
      let size = read_all_bytes psr bzp 0 in
      print_int size;
      print_endline " bytes read!";
      true
    with e ->
      (* Grab the backtrace first: later calls may overwrite it. *)
      let backtrace = Printexc.get_backtrace () in
      prerr_endline ("import failed: " ^ Printexc.to_string e);
      prerr_string backtrace;
      false
  in
  release ();
  Printf.printf "%d pages!\n" !count;
  if not ok then exit 1
# Build description for the importer program.
# OMake is indentation-sensitive: array elements, the body of `if` and
# rule commands must be indented under the line that introduces them.

.PHONY: all install clean test

# .SUBDIRS:

# Resolve the libraries below through ocamlfind.
USE_OCAMLFIND = true

OCAMLPACKS[] =
    bz2
    expat
    json-wheel
    riak

if $(not $(OCAMLFIND_EXISTS))
    eprintln(This project requires ocamlfind\, but it was not found.)
    eprintln(You need to install ocamlfind and run "omake --configure".)
    exit 1

# OCAMLINCLUDES +=
# NATIVE_ENABLED = $(OCAMLOPT_EXISTS)
# BYTE_ENABLED = $(not $(OCAMLOPT_EXISTS))
# OCAMLFLAGS +=
# OCAMLCFLAGS +=
# OCAMLOPTFLAGS +=
# OCAML_LINK_FLAGS +=
# OCAML_BYTE_LINK_FLAGS +=
# OCAML_NATIVE_LINK_FLAGS +=

# Source modules (without extension) linked into the program.
FILES[] =
    importer

PROGRAM = importer

# OCAML_LIBS +=
# OCAML_CLIBS +=
# OCAML_OTHER_LIBS +=
# OCAML_LIB_FLAGS +=

.DEFAULT: $(OCamlProgram $(PROGRAM), $(FILES))

# NOTE(review): the importer reads a bucket name and an input file from its
# command line; this rule passes neither - confirm the intended test inputs.
test: $(PROGRAM)
    ./$(PROGRAM)
#
# Project root file for OMake.
# Presumably these open the standard build-rule libraries for C, OCaml
# and LaTeX shipped with OMake - confirm against the installed version.
#
open build/C
open build/OCaml
open build/LaTeX
#
# The command-line variables are defined *after* the
# standard configuration has been loaded.
#
DefineCommandVars()
#
# Include the OMakefile in this directory.
#
.SUBDIRS: .
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment