新しい構文

camlp4上の改訂版構文とやらを使ってみた。

(* ocamlc -pp "camlp4r" -c utf.ml *)
module Ucs4 = Int32;

type ucs4 = Ucs4.t;
type utf8_string = Bigarray.Array1.t int Bigarray.int8_unsigned_elt Bigarray.c_layout;
type utf16_string = Bigarray.Array1.t int Bigarray.int16_unsigned_elt Bigarray.c_layout;
type utf32_string = Bigarray.Array1.t int32 Bigarray.int32_elt Bigarray.c_layout;

value utf8_max_length : int = 6;
value utf16_max_length : int = 2;

value set = Bigarray.Array1.set;
value get = Bigarray.Array1.get;
value sub = Bigarray.Array1.sub;
value length : _ -> int = Bigarray.Array1.dim;

value copy source =
  let result = Bigarray.Array1.create (Bigarray.Array1.kind source) (Bigarray.Array1.layout source) (Bigarray.Array1.dim source) in (
    Bigarray.Array1.blit source result; 
    result);

value to_utf8_i ?(invalid_sequence: option exn) (code: int) (dest: utf8_string) (length: ref int) = (
  length.val := if code land 0x7fffff80 = 0 then (Bigarray.Array1.set dest 0 code; 1)
    else if code land lnot (1 lsl (5 + 6) - 1) = 0 then (Bigarray.Array1.set dest 0 (0b11000000 lor code lsr 6); 2)
    else if code land lnot (1 lsl (4 + 12) - 1) = 0 then (Bigarray.Array1.set dest 0 (0b11100000 lor code lsr 12); 3)
    else if code land lnot (1 lsl (3 + 18) - 1) = 0 then (Bigarray.Array1.set dest 0 (0b11110000 lor code lsr 18); 4)
    else if code land lnot (1 lsl (2 + 24) - 1) = 0 then (Bigarray.Array1.set dest 0 (0b11111000 lor code lsr 24); 5)
    else (Bigarray.Array1.set dest 0 (0b11111100 lor code lsr 30); 6);
  tails code dest (pred length.val)
    where rec tails code (dest: utf8_string) i = if i < 0 then () else (
      Bigarray.Array1.set dest i (0b10000000 lor code land (1 lsl 6 - 1));
      tails (code lsr 6) dest (i - 1)));

value to_utf8 ?(invalid_sequence: option exn) (code: ucs4) (dest: utf8_string) (length: ref int) =
  to_utf8_i ?invalid_sequence (Ucs4.to_int code) dest length;

value from_utf8_i ?(invalid_sequence: option exn) (data: utf8_string) (length: ref int) =
  let lead : int = Bigarray.Array1.get data 0 in
  let code = if lead land 0b10000000 = 0 then (length.val := 1; lead)
    else if lead land 0b11100000 = 0b11000000 then (length.val := 2; lead land 0b00011111)
    else if lead land 0b11110000 = 0b11100000 then (length.val := 3; lead land 0b00001111)
    else if lead land 0b11111000 = 0b11110000 then (length.val := 4; lead land 0b00000111)
    else if lead land 0b11111100 = 0b11111000 then (length.val := 5; lead land 0b00000011)
    else if lead land 0b11111110 = 0b11111100 then (length.val := 6; lead land 0b00000001)
    else match invalid_sequence with [ None -> (length.val := 1; lead) | Some exn -> raise exn ] in
  tails invalid_sequence length data 1 code
    where rec tails (invalid_sequence: option exn) (length: ref int) (data: utf8_string) i code = 
      if i >= !length then code else (
        if i >= Bigarray.Array1.dim data then (
          match invalid_sequence with 
          [ None -> (let real_length = length.val in (length.val := i; code lsl (6 * (real_length - i))))
          | Some exn -> raise exn ]
        ) else (
          let tail = 
            let item = Bigarray.Array1.get data i in
            if item land 0b10000000 = 0 then item else (
              match invalid_sequence with 
              [ None -> item 
              | Some exn -> raise exn ]) in
          tails invalid_sequence length data (i + 1) (code lsl 6 lor (tail land (2 lsl 6 - 1)))));

value from_utf8 ?(invalid_sequence: option exn) (data: utf8_string) (length: ref int) =
  Ucs4.of_int (from_utf8_i ?invalid_sequence data length);

value utf8_sequence ?(invalid_sequence: option exn) (lead: int) =
  if lead land 0b10000000 = 0 then 1
  else if lead land 0b11100000 = 0b11000000 then 2
  else if lead land 0b11110000 = 0b11100000 then 3
  else if lead land 0b11111000 = 0b11110000 then 4
  else if lead land 0b11111100 = 0b11111000 then 5
  else if lead land 0b11111110 = 0b11111100 then 6
  else match invalid_sequence with [ None -> 1 | Some exn -> raise exn ];

value to_utf16_i ?(invalid_sequence: option exn) (code: int) (dest: utf16_string) (length: ref int) =
  length.val := if code >= 0 && code <= 0xd7ff || code >= 0xe000 && code <= 0xffff then (
    Bigarray.Array1.set dest 0 code; 1
  ) else if code >= 0xd800 && code <= 0xdfff then (
    match invalid_sequence with
    [ None -> (Bigarray.Array1.set dest 0 code; 1)
    | Some exn -> raise exn ]
  ) else (
    let c2 = code - 0x10000 in (
      if c2 >= 1 lsl 20 then (
        match invalid_sequence with
        [ None -> ()
        | Some exn -> raise exn ];
      ) else ();
      Bigarray.Array1.set dest 0 (0xd800 lor (c2 lsr 10));
      Bigarray.Array1.set dest 1 (0xdc00 lor (c2 land (1 lsl 10 - 1)));
      2));

value to_utf16 (code: ucs4) (dest: utf16_string) (length: ref int) =
  to_utf16_i (Ucs4.to_int code) dest length;

value from_utf16_i ?(invalid_sequence: option exn) (data: utf16_string) (length: ref int) =
  let lead : int = Bigarray.Array1.get data 0 in
  if lead >= 0 && lead <= 0xd7ff || lead >= 0xe000 && lead <= 0xffff then (
    length.val := 1; lead
  ) else if lead >= 0xd800 && lead <= 0xdbff then (
    if Bigarray.Array1.dim data <= 1 then (
      match invalid_sequence with 
      [ None -> (length.val := 1; lead)
      | Some exn -> raise exn ]
    ) else (
      length.val := 2;
      let tail = Bigarray.Array1.get data 1 in
      if tail < 0xdc00 || tail > 0xdfff then (
        match invalid_sequence with
        [ None -> ()
        | Some exn -> raise exn ]
      ) else ();
      lead land (1 lsl 10 - 1) lsl 10 lor tail land (1 lsl 10 - 1)
    )
  ) else (
    match invalid_sequence with
    [ None -> (length.val := 1; lead)
    | Some exn -> raise exn ]
  );

value from_utf16 ?(invalid_sequence: option exn) (data: utf16_string) (length: ref int) =
  Ucs4.of_int (from_utf16_i ?invalid_sequence data length);

value utf8_sequence ?(invalid_sequence: option exn) (lead: int) =
  if lead >= 0 && lead <= 0xd7ff || lead >= 0xe000 && lead <= 0xffff then 1
  else if lead >= 0xd800 && lead <= 0xdbff then 2
  else match invalid_sequence with [ None -> 1 | Some exn -> raise exn ];

value utf8_of_string (data: string) = 
  let length = String.length data in
  let result = Bigarray.Array1.create Bigarray.int8_unsigned Bigarray.c_layout length in (
    for i = 0 to length - 1 do {
      Bigarray.Array1.set result i (int_of_char data.[i]) };
    result);

value string_of_utf8 (data: utf8_string) =
  let length = Bigarray.Array1.dim data in
  let result = String.create length in (
    for i = 0 to length - 1 do {
      String.set result i (char_of_int (Bigarray.Array1.get data i)) };
    result);

value utf8_of_array (data: array int) = Bigarray.Array1.of_array Bigarray.int8_unsigned Bigarray.c_layout data;

value utf8_of_utf16 ?(invalid_sequence: option exn) (data: utf16_string) =
  let data_length = Bigarray.Array1.dim data in
  let result = Bigarray.Array1.create Bigarray.int8_unsigned Bigarray.c_layout (3 * data_length) in
  make invalid_sequence data_length data 0 result 0 (ref 0)
    where rec make (invalid_sequence: option exn) length (data: utf16_string) i (result: utf8_string) j length_buf = 
      if i >= length then Bigarray.Array1.sub result 0 j else (
        let code : int = from_utf16_i ?invalid_sequence (Bigarray.Array1.sub data i (length - i)) length_buf in
        let data_offset = !length_buf in (
          to_utf8_i ?invalid_sequence code (Bigarray.Array1.sub result j 3) length_buf;
          let result_offset = !length_buf in
          make invalid_sequence length data (i + data_offset) result (j + result_offset) length_buf));

value utf8_of_utf32 ?(invalid_sequence: option exn) (data: utf32_string) =
  let data_length = Bigarray.Array1.dim data in
  let result = Bigarray.Array1.create Bigarray.int8_unsigned Bigarray.c_layout (6 * data_length) in
  make invalid_sequence data_length data 0 result 0 (ref 0)
    where rec make (invalid_sequence: option exn) length (data: utf32_string) i (result: utf8_string) j length_buf = 
      if i >= length then Bigarray.Array1.sub result 0 j else (
        let code : int = Ucs4.to_int (Bigarray.Array1.get data i) in
          to_utf8_i ?invalid_sequence code (Bigarray.Array1.sub result j 6) length_buf;
          let result_offset = !length_buf in
          make invalid_sequence length data (i + 1) result (j + result_offset) length_buf);

value utf16_of_array (data: array int) = Bigarray.Array1.of_array Bigarray.int16_unsigned Bigarray.c_layout data;

value utf16_of_utf8 ?(invalid_sequence: option exn) (data: utf8_string) =
  let data_length = Bigarray.Array1.dim data in
  let result = Bigarray.Array1.create Bigarray.int16_unsigned Bigarray.c_layout data_length in
  make invalid_sequence data_length data 0 result 0 (ref 0)
    where rec make (invalid_sequence: option exn) length (data: utf8_string) i (result: utf16_string) j length_buf = 
      if i >= length then Bigarray.Array1.sub result 0 j else (
        let code : int = from_utf8_i ?invalid_sequence (Bigarray.Array1.sub data i (length - i)) length_buf in
        let data_offset = !length_buf in (
          to_utf16_i ?invalid_sequence code (Bigarray.Array1.sub result j (length - i)) length_buf;
          let result_offset = !length_buf in
          make invalid_sequence length data (i + data_offset) result (j + result_offset) length_buf));

value utf16_of_utf32 ?(invalid_sequence: option exn) (data: utf32_string) =
  let data_length = Bigarray.Array1.dim data in
  let result = Bigarray.Array1.create Bigarray.int16_unsigned Bigarray.c_layout (2 * data_length) in
  make invalid_sequence data_length data 0 result 0 (ref 0)
    where rec make (invalid_sequence: option exn) length (data: utf32_string) i (result: utf16_string) j length_buf = 
      if i >= length then Bigarray.Array1.sub result 0 j else (
        let code : int = Ucs4.to_int (Bigarray.Array1.get data i) in (
          to_utf16_i ?invalid_sequence code (Bigarray.Array1.sub result j 2) length_buf;
          let result_offset = !length_buf in
          make invalid_sequence length data (i + 1) result (j + result_offset) length_buf));

value utf32_of_array (data: array int32) = Bigarray.Array1.of_array Bigarray.int32 Bigarray.c_layout data;

value utf32_of_utf8 ?(invalid_sequence: option exn) (data: utf8_string) =
  let data_length = Bigarray.Array1.dim data in
  let result = Bigarray.Array1.create Bigarray.int32 Bigarray.c_layout data_length in
  make invalid_sequence data_length data 0 result 0 (ref 0)
    where rec make (invalid_sequence: option exn) length (data: utf8_string) i (result: utf32_string) j length_buf = 
      if i >= length then Bigarray.Array1.sub result 0 j else (
        let code : int = from_utf8_i ?invalid_sequence (Bigarray.Array1.sub data i (length - i)) length_buf in
        let data_offset = !length_buf in (
          Bigarray.Array1.set result j (Ucs4.of_int code);
          make invalid_sequence length data (i + data_offset) result (j + 1) length_buf));

value utf32_of_utf16 ?(invalid_sequence: option exn) (data: utf16_string) =
  let data_length = Bigarray.Array1.dim data in
  let result = Bigarray.Array1.create Bigarray.int32 Bigarray.c_layout data_length in
  make invalid_sequence data_length data 0 result 0 (ref 0)
    where rec make (invalid_sequence: option exn) length (data: utf16_string) i (result: utf32_string) j length_buf = 
      if i >= length then Bigarray.Array1.sub result 0 j else (
        let code : int = from_utf16_i ?invalid_sequence (Bigarray.Array1.sub data i (length - i)) length_buf in
        let data_offset = !length_buf in (
          Bigarray.Array1.set result j (Ucs4.of_int code);
          make invalid_sequence length data (i + data_offset) result (j + 1) length_buf));

……コンパイル遅くなった。
whereは大変良いです。
let...inのスコープが1文しか無くなったのが、構文からそうあるべきとはいえどうインデントしていいかもうわからないです。
全体的にはすっきりして良い……はずなんですが、もうちょっと、ここでこうインデントする、という意図が込められた、改訂版改訂版構文が必要な気がっ。
(ここで案を書き始めると止まらないので削除)