bstr/lib/bstr.ml
2025-02-11 19:29:15 +01:00

652 lines
19 KiB
OCaml

type t = (char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t
module Bytes = struct
include Bytes
external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get"
external unsafe_set_uint8 : bytes -> int -> int -> unit = "%bytes_unsafe_set"
external unsafe_get_int32_ne : bytes -> int -> int32 = "%caml_bytes_get32u"
external unsafe_set_int32_ne : bytes -> int -> int32 -> unit
= "%caml_bytes_set32u"
end
external swap16 : int -> int = "%bswap16"
external swap32 : int32 -> int32 = "%bswap_int32"
external swap64 : int64 -> int64 = "%bswap_int64"
external get_uint8 : t -> int -> int = "%caml_ba_ref_1"
external set_uint8 : t -> int -> int -> unit = "%caml_ba_set_1"
external get_uint16_ne : t -> int -> int = "%caml_bigstring_get16"
external set_int16_ne : t -> int -> int -> unit = "%caml_bigstring_set16"
external get_int32_ne : t -> int -> int32 = "%caml_bigstring_get32"
external set_int32_ne : t -> int -> int32 -> unit = "%caml_bigstring_set32"
external set_int64_ne : t -> int -> int64 -> unit = "%caml_bigstring_set64"
external unsafe_get_uint8 : t -> int -> int = "%caml_ba_unsafe_ref_1"
external unsafe_set_uint8 : t -> int -> int -> unit = "%caml_ba_unsafe_set_1"
external unsafe_get_uint16_ne : t -> int -> int = "%caml_bigstring_get16u"
external unsafe_set_uint16_ne : t -> int -> int -> unit
= "%caml_bigstring_set16u"
external unsafe_get_int32_ne : t -> int -> int32 = "%caml_bigstring_get32u"
external unsafe_set_int32_ne : t -> int -> int32 -> unit
= "%caml_bigstring_set32u"
external unsafe_get_int64_ne : t -> (int[@untagged]) -> (int64[@unboxed])
= "bstr_bytecode_get64u" "bstr_native_get64u"
[@@noalloc]
external _unsafe_set_int64_ne : t -> int -> int64 -> unit
= "%caml_bigstring_set64u"
external unsafe_memcmp :
t
-> (int[@untagged])
-> t
-> (int[@untagged])
-> (int[@untagged])
-> (int[@untagged]) = "bstr_bytecode_memcmp" "bstr_native_memcmp"
[@@noalloc]
external unsafe_memcpy :
t -> (int[@untagged]) -> t -> (int[@untagged]) -> (int[@untagged]) -> unit
= "bstr_bytecode_memcpy" "bstr_native_memcpy"
[@@noalloc]
external unsafe_memmove :
t -> (int[@untagged]) -> t -> (int[@untagged]) -> (int[@untagged]) -> unit
= "bstr_bytecode_memmove" "bstr_native_memmove"
[@@noalloc]
external unsafe_memchr :
t
-> (int[@untagged])
-> (int[@untagged])
-> (int[@untagged])
-> (int[@untagged]) = "bstr_bytecode_memchr" "bstr_native_memchr"
[@@noalloc]
external unsafe_memset :
t
-> (int[@untagged])
-> (int[@untagged])
-> (int[@untagged])
-> (int[@untagged]) = "bstr_bytecode_memchr" "bstr_native_memchr"
[@@noalloc]
let memcmp src ~src_off dst ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > Bigarray.Array1.dim src - len
|| dst_off < 0
|| dst_off > Bigarray.Array1.dim dst - len
then invalid_arg "Bstr.memcmp";
unsafe_memcmp src src_off dst dst_off len
let memcpy src ~src_off dst ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > Bigarray.Array1.dim src - len
|| dst_off < 0
|| dst_off > Bigarray.Array1.dim dst - len
then invalid_arg "Bstr.memcpy";
unsafe_memcpy src src_off dst dst_off len
let memmove src ~src_off dst ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > Bigarray.Array1.dim src - len
|| dst_off < 0
|| dst_off > Bigarray.Array1.dim dst - len
then invalid_arg "Bstr.memmove";
unsafe_memmove src src_off dst dst_off len
let memchr src ~off ~len value =
if len < 0 || off < 0 || off > Bigarray.Array1.dim src - len then
invalid_arg "Bstr.memchr";
unsafe_memchr src off len (Char.code value)
let memset src ~off ~len value =
if len < 0 || off < 0 || off > Bigarray.Array1.dim src - len then
invalid_arg "Bstr.memset";
ignore (unsafe_memset src off len (Char.code value))
let empty = Bigarray.Array1.create Bigarray.char Bigarray.c_layout 0
let create len = Bigarray.Array1.create Bigarray.char Bigarray.c_layout len
external length : t -> int = "%caml_ba_dim_1"
external get : t -> int -> char = "%caml_ba_ref_1"
external unsafe_get : t -> int -> char = "%caml_ba_unsafe_ref_1"
external set : t -> int -> char -> unit = "%caml_ba_set_1"
external unsafe_set : t -> int -> char -> unit = "%caml_ba_unsafe_set_1"
let unsafe_fill bstr ~off ~len v =
let nv = Nativeint.of_int v in
let vv = Nativeint.(logor (shift_left nv 8) nv) in
let vvvv = Nativeint.(logor (shift_left vv 16) vv) in
let vvvv = Nativeint.to_int32 vvvv in
let len0 = len land 3 in
let len1 = len lsr 2 in
for i = 0 to len1 - 1 do
let i = i * 4 in
unsafe_set_int32_ne bstr (off + i) vvvv
done;
for i = 0 to len0 - 1 do
let i = (len1 * 4) + i in
unsafe_set_uint8 bstr (off + i) v
done
let fill bstr ~off ~len chr =
if len < 0 || off < 0 || off > length bstr - len then invalid_arg "Bstr.fill";
unsafe_fill bstr ~off ~len (Char.code chr)
let make len chr =
let bstr = create len in
unsafe_fill bstr ~off:0 ~len (Char.code chr);
(* [Obj.magic] instead of [Char.code]? *)
bstr
let init len fn =
let bstr = create len in
for i = 0 to len - 1 do
unsafe_set bstr i (fn i)
done;
bstr
let copy src =
let len = length src in
let bstr = create len in
unsafe_memcpy src 0 bstr 0 len;
bstr
let chop ?(rev = false) bstr =
if length bstr == 0 then None
else if not rev then Some (unsafe_get bstr 0)
else Some (unsafe_get bstr (length bstr - 1))
let get_int64_ne bstr idx =
if idx < 0 || idx > length bstr - 8 then invalid_arg "Bstr.get_int64_ne";
unsafe_get_int64_ne bstr idx
let get_int8 bstr i =
(get_uint8 bstr i lsl (Sys.int_size - 8)) asr (Sys.int_size - 8)
let get_uint16_le bstr i =
if Sys.big_endian then swap16 (get_uint16_ne bstr i) else get_uint16_ne bstr i
let get_uint16_be bstr i =
if not Sys.big_endian then swap16 (get_uint16_ne bstr i)
else get_uint16_ne bstr i
let _unsafe_get_uint16_le bstr i =
(* TODO(dinosaure): for unicode. *)
if Sys.big_endian then swap16 (unsafe_get_uint16_ne bstr i)
else unsafe_get_uint16_ne bstr i
let _unsafe_get_uint16_be bstr i =
(* TODO(dinosaure): for unicode. *)
if not Sys.big_endian then swap16 (unsafe_get_uint16_ne bstr i)
else unsafe_get_uint16_ne bstr i
let get_int16_ne bstr i =
(get_uint16_ne bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
let get_int16_le bstr i =
(get_uint16_le bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
let get_int16_be bstr i =
(get_uint16_be bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
let get_int32_le bstr i =
if Sys.big_endian then swap32 (get_int32_ne bstr i) else get_int32_ne bstr i
let get_int32_be bstr i =
if not Sys.big_endian then swap32 (get_int32_ne bstr i)
else get_int32_ne bstr i
let get_int64_le bstr i =
if Sys.big_endian then swap64 (get_int64_ne bstr i) else get_int64_ne bstr i
let get_int64_be bstr i =
if not Sys.big_endian then swap64 (get_int64_ne bstr i)
else get_int64_ne bstr i
let _unsafe_set_uint16_le bstr i x =
(* TODO(dinosaure): for unicode. *)
if Sys.big_endian then unsafe_set_uint16_ne bstr i (swap16 x)
else unsafe_set_uint16_ne bstr i x
let _unsafe_set_uint16_be bstr i x =
(* TODO(dinosaure): for unicode. *)
if Sys.big_endian then unsafe_set_uint16_ne bstr i x
else unsafe_set_uint16_ne bstr i (swap16 x)
let set_int16_le bstr i x =
if Sys.big_endian then set_int16_ne bstr i (swap16 x)
else set_int16_ne bstr i x
let set_int16_be bstr i x =
if not Sys.big_endian then set_int16_ne bstr i (swap16 x)
else set_int16_ne bstr i x
let set_int32_le bstr i x =
if Sys.big_endian then set_int32_ne bstr i (swap32 x)
else set_int32_ne bstr i x
let set_int32_be bstr i x =
if not Sys.big_endian then set_int32_ne bstr i (swap32 x)
else set_int32_ne bstr i x
let set_int64_le bstr i x =
if Sys.big_endian then set_int64_ne bstr i (swap64 x)
else set_int64_ne bstr i x
let set_int64_be bstr i x =
if not Sys.big_endian then set_int64_ne bstr i (swap64 x)
else set_int64_ne bstr i x
let set_int8 = set_uint8
let set_uint16_ne = set_int16_ne
let set_uint16_be = set_int16_be
let set_uint16_le = set_int16_le
external unsafe_sub : t -> (int[@untagged]) -> (int[@untagged]) -> t
= "bstr_bytecode_unsafe_sub" "bstr_native_unsafe_sub"
let sub bstr ~off ~len =
if off < 0 || len < 0 || off > length bstr - len then invalid_arg "Bstr.sub";
unsafe_sub bstr off len
let unsafe_blit src ~src_off dst ~dst_off ~len =
unsafe_memmove src src_off dst dst_off len
let blit src ~src_off dst ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > length src - len
|| dst_off < 0
|| dst_off > length dst - len
then invalid_arg "Bstr.blit";
unsafe_blit src ~src_off dst ~dst_off ~len
let unsafe_blit_to_bytes bstr ~src_off dst ~dst_off ~len =
let len0 = len land 3 in
let len1 = len lsr 2 in
for i = 0 to len1 - 1 do
let i = i * 4 in
let v = unsafe_get_int32_ne bstr (src_off + i) in
Bytes.unsafe_set_int32_ne dst (dst_off + i) v
done;
for i = 0 to len0 - 1 do
let i = (len1 * 4) + i in
let v = unsafe_get_uint8 bstr (src_off + i) in
Bytes.unsafe_set_uint8 dst (dst_off + i) v
done
let unsafe_blit_from_bytes src ~src_off bstr ~dst_off ~len =
let len0 = len land 3 in
let len1 = len lsr 2 in
for i = 0 to len1 - 1 do
let i = i * 4 in
let v = Bytes.unsafe_get_int32_ne src (src_off + i) in
unsafe_set_int32_ne bstr (dst_off + i) v
done;
for i = 0 to len0 - 1 do
let i = (len1 * 4) + i in
let v = Bytes.unsafe_get_uint8 src (src_off + i) in
unsafe_set_uint8 bstr (dst_off + i) v
done
let blit_from_bytes src ~src_off bstr ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > Bytes.length src - len
|| dst_off < 0
|| dst_off > length bstr - len
then invalid_arg "Bstr.blit_from_bytes";
unsafe_blit_from_bytes src ~src_off bstr ~dst_off ~len
let blit_from_string src ~src_off bstr ~dst_off ~len =
blit_from_bytes (Bytes.unsafe_of_string src) ~src_off bstr ~dst_off ~len
let blit_to_bytes bstr ~src_off dst ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > length bstr - len
|| dst_off < 0
|| dst_off > Bytes.length dst - len
then invalid_arg "Bstr.blit_to_bytes";
unsafe_blit_to_bytes bstr ~src_off dst ~dst_off ~len
let of_string str =
let len = String.length str in
let bstr = create len in
unsafe_blit_from_bytes
(Bytes.unsafe_of_string str)
~src_off:0 bstr ~dst_off:0 ~len;
bstr
let string ?(off = 0) ?len str =
let len =
match len with Some len -> len | None -> String.length str - off
in
if off < 0 || len < 0 || off > String.length str - len then
invalid_arg "Bstr.string";
let bstr = create len in
unsafe_blit_from_bytes
(Bytes.unsafe_of_string str)
~src_off:off bstr ~dst_off:0 ~len;
bstr
let unsafe_sub_string bstr src_off len =
let buf = Bytes.create len in
unsafe_blit_to_bytes bstr ~src_off buf ~dst_off:0 ~len;
Bytes.unsafe_to_string buf
let sub_string bstr ~off ~len =
if len < 0 || off < 0 || off > length bstr - len then
invalid_arg "Bstr.sub_string";
unsafe_sub_string bstr off len
let to_string bstr = unsafe_sub_string bstr 0 (length bstr)
let is_empty bstr = length bstr == 0
let is_prefix ~affix bstr =
let len_affix = String.length affix in
let len_bstr = length bstr in
if len_affix > len_bstr then false
else
let max_idx_affix = len_affix - 1 in
let rec go idx =
if idx > max_idx_affix then true
else if String.unsafe_get affix idx != unsafe_get bstr idx then false
else go (idx + 1)
in
go 0
let starts_with ~prefix bstr =
let len_prefix = length prefix in
let len_bstr = length bstr in
if len_prefix > len_bstr then false
else
let max_idx_prefix = len_prefix - 1 in
let rec go idx =
if idx > max_idx_prefix then true
else if unsafe_get prefix idx != unsafe_get bstr idx then false
else go (idx + 1)
in
go 0
let is_infix ~affix bstr =
let len_affix = String.length affix in
let len_bstr = length bstr in
if len_affix > len_bstr then false
else
let max_idx_affix = len_affix - 1 in
let max_idx_bstr = len_bstr - len_affix in
let rec go idx k =
if idx > max_idx_bstr then false
else if k > max_idx_affix then true
else if k > 0 then
if affix.[k] == bstr.{idx + k} then go idx (succ k) else go (succ idx) 0
else if affix.[0] = bstr.{idx} then go idx 1
else go (idx + 1) 0
in
go 0 0
let is_suffix ~affix bstr =
let max_idx_affix = String.length affix - 1 in
let max_idx_bstr = length bstr - 1 in
if max_idx_affix > max_idx_bstr then false
else
let rec go idx =
if idx > max_idx_affix then true
else if affix.[max_idx_affix - idx] != bstr.{max_idx_bstr - idx} then
false
else go (idx + 1)
in
go 0
let ends_with ~suffix bstr =
let max_idx_suffix = length suffix - 1 in
let max_idx_bstr = length bstr - 1 in
if max_idx_suffix > max_idx_bstr then false
else
let rec go idx =
if idx > max_idx_suffix then true
else if
unsafe_get suffix (max_idx_suffix - idx)
!= unsafe_get bstr (max_idx_bstr - idx)
then false
else go (idx + 1)
in
go 0
exception Break
let for_all sat bstr =
try
for idx = 0 to length bstr - 1 do
if sat (unsafe_get bstr idx) == false then raise_notrace Break
done;
true
with Break -> false
let exists sat bstr =
try
for idx = 0 to length bstr - 1 do
if sat (unsafe_get bstr idx) then raise_notrace Break
done;
false
with Break -> true
let equal a b =
if length a == length b then
try
let len = length a in
let len0 = len land 3 in
let len1 = len lsr 2 in
for i = 0 to len1 - 1 do
let i = i * 4 in
if unsafe_get_int32_ne a i <> unsafe_get_int32_ne b i then
raise_notrace Break
done;
for i = 0 to len0 - 1 do
let i = (len1 * 4) + i in
if unsafe_get_uint8 a i != unsafe_get_uint8 b i then raise_notrace Break
done;
true
with Break -> false
else false
let with_range ?(first = 0) ?(len = max_int) bstr =
if len < 0 then invalid_arg "Bstr.with_range";
if len == 0 then empty
else
let bstr_len = length bstr in
let max_idx = bstr_len - 1 in
let last =
match len with
| len when len = max_int -> max_idx
| len ->
let last = first + len - 1 in
if last > max_idx then max_idx else last
in
let first = if first < 0 then 0 else first in
if first = 0 && last = max_idx then bstr
else sub bstr ~off:first ~len:(last + 1 - first)
let with_index_range ?(first = 0) ?last bstr =
let bstr_len = length bstr in
let max_idx = bstr_len - 1 in
let last =
match last with
| None -> max_idx
| Some last -> if last > max_idx then max_idx else last
in
let first = if first < 0 then 0 else first in
if first > max_idx || last < 0 || first > last then empty
else if first == 0 && last = max_idx then bstr
else sub bstr ~off:first ~len:(last + 1 - first)
let is_white chr = chr == ' ' || chr == '\t'
let trim ?(drop = is_white) bstr =
let len = length bstr in
if len == 0 then bstr
else
let max_idx = len - 1 in
let rec left_pos idx =
if idx > max_idx then len
else if drop bstr.{idx} then left_pos (succ idx)
else idx
in
let rec right_pos idx =
if idx < 0 then 0
else if drop bstr.{idx} then right_pos (pred idx)
else succ idx
in
let left = left_pos 0 in
if left = len then empty
else
let right = right_pos max_idx in
if left == 0 && right == len then bstr
else sub bstr ~off:left ~len:(right - left)
let fspan ?(min = 0) ?(max = max_int) ?(sat = Fun.const true) bstr =
if min < 0 then invalid_arg "Bstr.fspan";
if max < 0 then invalid_arg "Bstr.fspan";
if min > max || max == 0 then (empty, bstr)
else
let len = length bstr in
let max_idx = len - 1 in
let max_idx =
let k = max - 1 in
if k > max_idx then max_idx else k
in
let need_idx = min in
let rec go idx =
if idx <= max_idx && sat bstr.{idx} then go (succ idx)
else if idx < need_idx || idx == 0 then (empty, bstr)
else if idx == len then (bstr, empty)
else (sub bstr ~off:0 ~len:idx, sub bstr ~off:idx ~len:(len - idx))
in
go 0
let rspan ?(min = 0) ?(max = max_int) ?(sat = Fun.const true) bstr =
if min < 0 then invalid_arg "Bstr.rspan";
if max < 0 then invalid_arg "Bstr.rspan";
if min > max || max == 0 then (bstr, empty)
else
let len = length bstr in
let max_idx = len - 1 in
let min_idx =
let k = len - max in
if k < 0 then 0 else k
in
let need_idx = len - min - 1 in
let rec go idx =
if idx >= min_idx && sat bstr.{idx} then go (pred idx)
else if idx > need_idx || idx == max_idx then (bstr, empty)
else if idx < 0 then (empty, bstr)
else
let cut = idx + 1 in
(sub bstr ~off:0 ~len:cut, sub bstr ~off:cut ~len:(len - cut))
in
go max_idx
let span ?(rev = false) ?min ?max ?sat bstr =
match rev with
| true -> rspan ?min ?max ?sat bstr
| false -> fspan ?min ?max ?sat bstr
let take ?(rev = false) ?min ?max ?sat bstr =
let a, b = span ~rev ?min ?max ?sat bstr in
if rev then b else a
let drop ?(rev = false) ?min ?max ?sat bstr =
let a, b = span ~rev ?min ?max ?sat bstr in
if rev then a else b
let shift bstr off =
if off > length bstr then invalid_arg "Bstr.shift";
let len = length bstr - off in
Bigarray.Array1.sub bstr off len
external ptr : t -> (nativeint[@unboxed])
= "bstr_bytecode_ptr" "bstr_native_ptr"
[@@noalloc]
let overlap a b =
let src_a = ptr a in
let src_b = ptr b in
let len_a = Nativeint.of_int (length a) in
let len_b = Nativeint.of_int (length b) in
let len =
let ( + ) = Nativeint.add in
let ( - ) = Nativeint.sub in
Nativeint.max 0n (Nativeint.min (src_a + len_a) (src_b + len_b))
- Nativeint.max src_a src_b
in
let len = Nativeint.to_int len in
if src_a >= src_b && src_a < Nativeint.add src_b len_b then
let offset = Nativeint.(to_int (sub src_a src_b)) in
Some (len, 0, offset)
else if src_b >= src_a && src_b < Nativeint.add src_a len_a then
let offset = Nativeint.(to_int (sub src_b src_a)) in
Some (len, offset, 0)
else None
let split_on_char sep bstr =
let lst = ref [] in
let max = ref (length bstr) in
for idx = length bstr - 1 downto 0 do
if unsafe_get bstr idx == sep then begin
lst := sub bstr ~off:(idx + 1) ~len:(!max - idx - 1) :: !lst;
max := idx
end
done;
sub bstr ~off:0 ~len:!max :: !lst
let to_seq bstr =
let rec go idx () =
if idx == length bstr then Seq.Nil
else
let chr = unsafe_get bstr idx in
Seq.Cons (chr, go (idx + 1))
in
go 0
let to_seqi bstr =
let rec go idx () =
if idx == length bstr then Seq.Nil
else
let chr = unsafe_get bstr idx in
Seq.Cons ((idx, chr), go (idx + 1))
in
go 0
let of_seq seq =
let n = ref 0 in
let buf = ref (make 0x7ff '\000') in
let resize () =
let new_len = Int.min (2 * length !buf) Sys.max_string_length in
(* TODO(dinosaure): should we keep this limit? *)
if length !buf == new_len then failwith "Bstr.of_seq: cannot grow bigstring";
let new_buf = make new_len '\000' in
Bigarray.Array1.blit !buf (sub new_buf ~off:0 ~len:(length !buf));
buf := new_buf
in
let fn chr =
if !n == length !buf then resize ();
unsafe_set !buf !n chr;
incr n
in
Seq.iter fn seq; sub !buf ~off:0 ~len:!n