Skip to content

Instantly share code, notes, and snippets.

@m2ym
Created January 25, 2015 10:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m2ym/a3363f585cd8ea0e33d3 to your computer and use it in GitHub Desktop.
Save m2ym/a3363f585cd8ea0e33d3 to your computer and use it in GitHub Desktop.
Extract webpage metadata in OCaml
open Core.Std
let libguess_determine_encoding =
let open Ctypes in
Foreign.foreign "libguess_determine_encoding" (string @-> int @-> string @-> returning string)
type metadata = {
title : string option;
type_ : string option;
description : string option;
image : string option;
site_name : string option;
favicon : string option;
}
let extract docs =
let open Nethtml in
let title = ref None in
let type_ = ref None in
let description = ref None in
let image = ref None in
let site_name = ref None in
let favicon = ref None in
let rec loop = function
| Element ("title", _, children) ->
if Option.is_none !title then
title := Some (String.strip
(String.concat
(List.filter_map children
(function
| Data data -> Some data
| _ -> None))))
| Element ("link", attrs, _) when
[%p? Some ("shortcut icon" | "icon")] <-- List.Assoc.find attrs "rel";
Some href <-- List.Assoc.find attrs "href" ->
if Option.is_none !favicon then
favicon := Some href
| Element ("meta", attrs, _) when
(List.mem attrs ("itemprop", "title") || List.mem attrs ("property", "og:title"));
Some value <-- List.Assoc.find attrs "content" ->
if Option.is_none !title then
title := Some value
| Element ("meta", attrs, _) when
List.mem attrs ("property", "og:type");
Some value <-- List.Assoc.find attrs "content" ->
type_ := Some value
| Element ("meta", attrs, _) when
(List.mem attrs ("itemprop", "description") || List.mem attrs ("property", "og:description"));
Some value <-- List.Assoc.find attrs "content" ->
description := Some value
| Element ("meta", attrs, _) when
(List.mem attrs ("itemprop", "image") || List.mem attrs ("property", "og:image"));
Some value <-- List.Assoc.find attrs "content" ->
image := Some value
| Element ("meta", attrs, _) when
List.mem attrs ("property", "og:site_name");
Some value <-- List.Assoc.find attrs "content" ->
site_name := Some value
| Element (_, _, children) ->
List.iter children loop
| _ -> ()
in
List.iter docs loop;
{ title = !title;
type_ = !type_;
description = !description;
image = !image;
site_name = !site_name;
favicon = !favicon }
let () =
Lwt_main.run begin Lwt.do_;
(resp, body) <-- Cohttp_lwt_unix.Client.get (Uri.of_string Sys.argv.(1));
body <-- Cohttp_lwt_body.to_string body;
let charset = libguess_determine_encoding body (String.length body) "Japanese" in
let body = Encoding.recode_string ~src:charset ~dst:"UTF-8" body in
let docs = Nethtml.parse_document (Lexing.from_string body) in
let m = extract docs in
Lwt_io.printf "title: %s\n" (Option.value m.title ~default:"");
Lwt_io.printf "type: %s\n" (Option.value m.type_ ~default:"");
Lwt_io.printf "description: %s\n" (Option.value m.description ~default:"");
Lwt_io.printf "image: %s\n" (Option.value m.image ~default:"");
Lwt_io.printf "site_name: %s\n" (Option.value m.site_name ~default:"");
Lwt_io.printf "favicon: %s\n" (Option.value m.favicon ~default:"")
end
$ ocamlfind ocamlc -package core,ctypes.foreign,text,netstring,cohttp.lwt,ppx_monadic -linkpkg -thread -custom -cclib -lguess -o extract extract.ml
$ ./extract "https://github.com/ocaml/ocaml"
title: ocaml/ocaml · GitHub
type: object
description: ocaml - Read-only mirror of INRIA SVN
image: https://avatars3.githubusercontent.com/u/1841483?v=3&amp;s=400
site_name: GitHub
favicon: https://assets-cdn.github.com/favicon.ico
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment