First commit

This commit is contained in:
Calascibetta Romain 2024-11-07 20:11:22 +01:00
commit d12c8d79b8
10 changed files with 559 additions and 0 deletions

6
.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
_build/
*~
*.install
.merlin
_opam
.envrc

13
.ocamlformat Normal file
View file

@ -0,0 +1,13 @@
version=0.26.2
exp-grouping=preserve
break-infix=wrap-or-vertical
break-collection-expressions=wrap
break-sequences=false
break-infix-before-func=false
dock-collection-brackets=true
break-separators=before
field-space=tight
if-then-else=compact
break-sequences=false
sequence-blank-line=compact
exp-grouping=preserve

30
README.md Normal file
View file

@ -0,0 +1,30 @@
# Cachet, a simple cache system for `mmap`
Cachet is a small library that provides a simple cache system for page-by-page
read access on a block device. The cache system requires a map function, which
can correspond to [Unix.map_file].
Here's a simple example using `Unix.map_file`:
```ocaml
let shared = true
let empty = Bigarray.Array1.create Bigarray.char Bigarray.c_layout 0
let map fd ~pos len =
let stat = Unix.fstat fd in
let len = Int.min len (stat.Unix.st_size - pos) in
if pos < stat.Unix.st_size
then let barr = Unix.map_file fd ~pos:(Int64.of_int pos)
Bigarray.char Bigarray.c_layout shared [| len |] in
Bigarray.array1_of_genarray barr
else empty
external getpagesize : unit -> int = "unix_getpagesize" [@noalloc]
let () =
let fd = Unix.openfile "disk.img" Unix.[ O_RDONLY ] 0o644 in
let finally () = Unix.close fd in
Fun.protect ~finally @@ fun () ->
let cache = Cachet.make ~pagesize:(getpagesize ()) ~map fd in
let seq = Cachet.get_seq cache 0 in
...
```

2
dune-project Normal file
View file

@ -0,0 +1,2 @@
(lang dune 3.5)
(using mode_specific_stubs 0.1)

290
lib/cachet.ml Normal file
View file

@ -0,0 +1,290 @@
type bigstring =
(char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t
external swap16 : int -> int = "%bswap16"
external swap32 : int32 -> int32 = "%bswap_int32"
external swap64 : int64 -> int64 = "%bswap_int64"
module Bstr = struct
type t = bigstring
let of_bigstring x = x
let length = Bigarray.Array1.dim
external get : t -> int -> char = "%caml_ba_ref_1"
external get_uint8 : t -> int -> int = "%caml_ba_ref_1"
external get_uint16_ne : t -> int -> int = "%caml_bigstring_get16"
external get_int32_ne : t -> int -> int32 = "%caml_bigstring_get32"
external get_int64_ne : t -> int -> int64 = "%caml_bigstring_get64"
let get_int8 bstr i =
(get_uint8 bstr i lsl (Sys.int_size - 8)) asr (Sys.int_size - 8)
let get_uint16_le bstr i =
if Sys.big_endian then swap16 (get_uint16_ne bstr i)
else get_uint16_ne bstr i
let get_uint16_be bstr i =
if not Sys.big_endian then swap16 (get_uint16_ne bstr i)
else get_uint16_ne bstr i
let get_int16_ne bstr i =
(get_uint16_ne bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
let get_int16_le bstr i =
(get_uint16_le bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
let get_int16_be bstr i =
(get_uint16_be bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
let get_int32_le bstr i =
if Sys.big_endian then swap32 (get_int32_ne bstr i) else get_int32_ne bstr i
let get_int32_be bstr i =
if not Sys.big_endian then swap32 (get_int32_ne bstr i)
else get_int32_ne bstr i
let get_int64_le bstr i =
if Sys.big_endian then swap64 (get_int64_ne bstr i) else get_int64_ne bstr i
let get_int64_be bstr i =
if not Sys.big_endian then swap64 (get_int64_ne bstr i)
else get_int64_ne bstr i
let sub t ~off ~len = Bigarray.Array1.sub t off len
let blit_to_bytes bstr ~src_off dst ~dst_off ~len =
if
len < 0
|| src_off < 0
|| src_off > length bstr - len
|| dst_off < 0
|| dst_off > Bytes.length dst - len
then invalid_arg "Cachet.Bstr.blit_to_bytes";
let len0 = len land 3 in
let len1 = len lsr 2 in
for i = 0 to len1 - 1 do
let i = i * 4 in
let v = get_int32_ne bstr (src_off + i) in
Bytes.set_int32_ne dst (dst_off + i) v
done;
for i = 0 to len0 - 1 do
let i = (len1 * 4) + i in
let v = get_uint8 bstr (src_off + i) in
Bytes.set_uint8 dst (dst_off + i) v
done
let sub_string bstr ~off ~len =
let buf = Bytes.create len in
blit_to_bytes bstr ~src_off:off buf ~dst_off:0 ~len;
Bytes.unsafe_to_string buf
let to_string bstr = sub_string bstr ~off:0 ~len:(length bstr)
let is_empty bstr = length bstr == 0
end
external hash : (int32[@unboxed]) -> int -> (int32[@unboxed])
= "cachet_hash_mix_intnat" "caml_hash_mix_intnat"
[@@noalloc]
let hash h d = Int32.to_int (hash h d)
let failwithf fmt = Format.ksprintf (fun str -> failwith str) fmt
type slice = { offset: int; length: int; payload: bigstring }
let pp_slice ppf { offset; length; _ } =
Format.fprintf ppf "{ @[<hov>offset= %x;@ length= %d;@] }" offset length
(* Counter Trailing Zero *)
let unsafe_ctz n =
let t = ref 1 in
let r = ref 0 in
while n land !t = 0 do
t := !t lsl 1;
incr r
done;
!r
let bstr_of_slice ?(logical_address = 0) { offset; length; payload } =
if logical_address < 0 then invalid_arg "Cachet.bstr_of_slice";
if logical_address == 0 || logical_address == offset then payload
else if logical_address > offset + length then
invalid_arg "Cachet.bstr_of_slice"
else
let pagesize = unsafe_ctz offset in
let off = logical_address land ((pagesize lsl 1) - 1) in
let len = length - off in
Bstr.sub payload ~off ~len
type metrics = { mutable cache_hit: int; mutable cache_miss: int }
let metrics () = { cache_hit= 0; cache_miss= 0 }
type 'fd t = {
arr: slice option array
; fd: 'fd
; map: 'fd map
; pagesize: int
; cachesize: int
; metrics: metrics
}
and 'fd map = 'fd -> pos:int -> int -> bigstring
let fd { fd; _ } = fd
let copy t =
{
arr= Array.make (1 lsl t.cachesize) None
; fd= t.fd
; map= t.map
; pagesize= t.pagesize
; cachesize= t.cachesize
; metrics= metrics ()
}
(* XXX(dinosaure): power of two. *)
let pot x = x land (x - 1) == 0 && x != 0
let make ?(cachesize = 1 lsl 10) ?(pagesize = 1 lsl 12) ~map fd =
if pot cachesize = false || pot pagesize = false then
invalid_arg "Chat.make: cachesize or pagesize must be a power of two";
let arr = Array.make cachesize None in
let pagesize = unsafe_ctz pagesize in
let cachesize = unsafe_ctz cachesize in
let metrics = metrics () in
{ arr; fd; map; pagesize; cachesize; metrics }
let load t logical_address =
let page = logical_address lsr t.pagesize in
let payload = t.map t.fd ~pos:(page lsl t.pagesize) (1 lsl t.pagesize) in
let length = Bigarray.Array1.dim payload in
let slice = { offset= page lsl t.pagesize; length; payload } in
let hash = hash 0l slice.offset land ((1 lsl t.cachesize) - 1) in
t.arr.(hash) <- Some slice;
slice
let none : slice option = None
let cache_miss t = t.metrics.cache_miss
let cache_hit t = t.metrics.cache_hit
let load t ?(len = 1) logical_address =
if len > 1 lsl t.pagesize then
invalid_arg "Cachet.load: you can not load more than a page";
if logical_address < 0 then
invalid_arg "Cachet.load: a logical address must be positive";
let page = logical_address lsr t.pagesize in
let hash = hash 0l (page lsl t.pagesize) land ((1 lsl t.cachesize) - 1) in
let offset = logical_address land ((t.pagesize lsl 1) - 1) in
match t.arr.(hash) with
| Some slice as value when slice.offset == page lsl t.pagesize ->
t.metrics.cache_hit <- t.metrics.cache_hit + 1;
if slice.length - offset >= len then value else none
| Some _ | None ->
t.metrics.cache_miss <- t.metrics.cache_miss + 1;
let slice = load t logical_address in
if slice.length - offset >= len then Some slice else none
let invalidate t ~off:logical_address ~len =
if logical_address < 0 || len < 0 then
invalid_arg
"Cachet.invalidate: the logical address and/or the number of bytes to \
invalid must be positives";
let start_page = logical_address lsr t.pagesize in
let end_page = (logical_address + len) lsr t.pagesize in
let mask = (1 lsl t.cachesize) - 1 in
for i = start_page to end_page - 1 do
t.arr.(hash 0l (i lsl t.pagesize) land mask) <- None
done
let is_aligned x = x land ((1 lsl 2) - 1) == 0
let get_uint8 t logical_address =
match load t ~len:1 logical_address with
| Some { payload; _ } ->
let offset = logical_address land ((1 lsl t.pagesize) - 1) in
Bstr.get_uint8 payload offset
| None -> failwithf "Cachet.get_uint8"
let get_int8 t logical_address =
(get_uint8 t logical_address lsl (Sys.int_size - 8)) asr (Sys.int_size - 8)
let blit_to_bytes t ~src_off:logical_address buf ~dst_off ~len =
if len < 0 || dst_off < 0 || dst_off > Bytes.length buf - len then
invalid_arg "Cachet.blit_to_bytes";
let off = logical_address land ((1 lsl t.pagesize) - 1) in
if is_aligned off && (1 lsl t.pagesize) - off >= len then begin
match load t ~len logical_address with
| None -> failwithf "Cachet.blit_to_bytes"
| Some slice ->
Bstr.blit_to_bytes slice.payload ~src_off:off buf ~dst_off:0 ~len
end
else
for i = 0 to len - 1 do
let v = get_uint8 t (logical_address + i) in
Bytes.set_uint8 buf (dst_off + i) v
done
let get_string t ~len logical_address =
let buf = Bytes.create len in
blit_to_bytes t ~src_off:logical_address buf ~dst_off:0 ~len;
Bytes.unsafe_to_string buf
let rec get_seq t logical_address () =
match load t logical_address with
| Some { offset; payload; length; _ } ->
let off = logical_address land ((1 lsl t.pagesize) - 1) in
let len = length - off in
let buf = Bytes.create len in
Bstr.blit_to_bytes payload ~src_off:off buf ~dst_off:0 ~len;
let str = Bytes.unsafe_to_string buf in
let next = get_seq t (offset + (1 lsl t.pagesize)) in
Seq.Cons (str, next)
| None -> Seq.Nil
let next t slice = load t (slice.offset + (1 lsl t.pagesize))
let naive_iter_with_len t len ~fn logical_address =
for i = 0 to len - 1 do
fn (get_uint8 t (logical_address + i))
done
let iter_with_len t len ~fn logical_address =
if len > 1 lsl t.pagesize then naive_iter_with_len t len ~fn logical_address
else begin
match load t logical_address with
| Some { offset; payload; length } ->
let off = logical_address land ((1 lsl t.pagesize) - 1) in
let max = Int.min (length - off) len in
for i = 0 to max - 1 do
fn (Bstr.get_uint8 payload (off + i))
done;
if max < len then begin
let logical_address = offset + (1 lsl t.pagesize) in
match load t logical_address with
| Some { payload; length; _ } ->
if len - max > length then failwith "Chat.iter_with_len";
for i = 0 to len - max - 1 do
fn (Bstr.get_uint8 payload i)
done
| None -> failwith "Chat.iter_with_len"
end
| None -> failwith "Chat.iter_with_len"
end
let iter t ?len ~fn logical_address =
match len with
| Some len -> iter_with_len t len ~fn logical_address
| None ->
let rec go logical_address =
match load t logical_address with
| Some { offset; payload; length } ->
let off = logical_address land ((1 lsl t.pagesize) - 1) in
let len = length - off in
for i = 0 to len - 1 do
fn (Bstr.get_uint8 payload (off + i))
done;
go (offset + (1 lsl t.pagesize))
| None -> ()
in
go logical_address

158
lib/cachet.mli Normal file
View file

@ -0,0 +1,158 @@
type bigstring =
(char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t
module Bstr : sig
(** A read-only bigstring. *)
type t = private bigstring
val of_bigstring : bigstring -> t
val length : t -> int
val get : t -> int -> char
val get_int8 : t -> int -> int
val get_uint8 : t -> int -> int
val get_int16_ne : t -> int -> int
val get_int16_le : t -> int -> int
val get_int16_be : t -> int -> int
val get_int32_ne : t -> int -> int32
val get_int32_le : t -> int -> int32
val get_int32_be : t -> int -> int32
val get_int64_ne : t -> int -> int64
val get_int64_le : t -> int -> int64
val get_int64_be : t -> int -> int64
val sub : t -> off:int -> len:int -> t
val sub_string : t -> off:int -> len:int -> string
val to_string : t -> string
val blit_to_bytes :
t -> src_off:int -> bytes -> dst_off:int -> len:int -> unit
val is_empty : t -> bool
(*
val is_prefix : affix:string -> t -> bool
val is_infix : affix:string -> t -> bool
val is_suffix : affix:string -> t -> bool
val for_all : (char -> bool) -> t -> bool
val exists : (char -> bool) -> t -> bool
val equal : t -> t -> bool
val compare : t -> t -> int
val with_range : ?first:int -> ?len:int -> t -> t
val with_index_range : ?first:int -> ?last:int -> t -> t
val trim : ?drop:(char -> bool) -> t -> t
val span : ?rev:bool -> ?min:int -> ?max:int -> ?sat:(char -> bool) -> t -> t * t
val take : ?rev:bool -> ?min:int -> ?max:int -> ?sat:(char -> bool) -> t -> t
val drop : ?rev:bool -> ?min:int -> ?max:int -> ?sat:(char -> bool) -> t -> t
val cut : ?rev:bool -> sep:string -> t -> (t * t) option
val cuts : ?rev:bool -> ?empty:bool -> sep:string -> t -> t list
*)
end
type slice = private { offset: int; length: int; payload: Bstr.t }
(** A slice is an aligned segment of bytes (according to the [pagesize]
specified by the cache, see {!val:make}) with its absolute position into the
underlying {i block-device} and size. *)
val pp_slice : Format.formatter -> slice -> unit
val bstr_of_slice : ?logical_address:int -> slice -> Bstr.t
type 'fd map = 'fd -> pos:int -> int -> bigstring
(** A value [map : 'fd map] when applied [map fd ~pos len] reads a
{!type:bigstring} at [pos]. [map] must return as much data as is available,
though never more than [len] bytes. [map] never fails. Instead, an empty
[bigstring] must be returned if e.g. the position is out of range.
Depending on how the cache is configured (see {!val:make}), [map] never
read more than [pagesize] bytes. *)
(** {2: Note about schedulers and [Cachet].}
[Cachet] assumes that {!type:map} is {b atomic}, in other words: {!type:map}
is a unit of work that is indivisible and guaranteed to be executed as a
single, coherent, and uninterrupted operation.
In this way, the [map] function is considered as a "direct" computation that
does {b not} interact with a scheduler. However, reading a page can take
time. It may therefore be necessary to add a cooperation point after
{!val:load} or the user-friendly functions.
These functions can read one or more pages. {!val:load} reads one page at
most. *)
type 'fd t
val fd : 'fd t -> 'fd
val cache_hit : 'fd t -> int
(** [cache_hit t] is the number of times a load hit the cache. *)
val cache_miss : 'fd t -> int
(** [cache_miss t] is the number of times a load didn't hit the cache. *)
val copy : 'fd t -> 'fd t
(** [copy t] creates a new, empty cache using the same [map] function. *)
val make : ?cachesize:int -> ?pagesize:int -> map:'fd map -> 'fd -> 'fd t
(** [make ~cachesize ~pagesize ~map fd] creates a new, empty cache using [map]
and [fd] for reading [pagesize] bytes. The size of the cache is [cachesize].
@raise Invalid_argument if either [cachesize] or [pagesize] is not a power
of two. *)
val load : 'fd t -> ?len:int -> int -> slice option
(** [load t ~len logical_address] loads a page at the given [logical_address]
and returns a {!type:slice}. [len] (defaults to [1]) is the expected
minimum number of bytes returned.
If the slice does not contains, at least, [len] bytes, [load] returns [None].
[load t ~len:0 logical_address] always returns an empty slice. *)
val invalidate : 'fd t -> off:int -> len:int -> unit
(** [invalidate t ~off ~len] invalidates the cache on [len] bytes from [off]. *)
(** {2 User friendly functions.} *)
(** {3 Binary decoding of integers.}
The functions in this section binary decode integers from byte sequences.
All following functions raise [Invalid_argument] if the space needed at
index [i] to decode the integer is not available.
Little-endian (resp. big-endian) encoding means that least (resp. most)
significant bytes are stored first. Big-endian is also known as network byte
order. Native-endian encoding is either little-endian or big-endian
depending on {!Sys.big_endian}.
32-bit and 64-bit integers are represented by the [int] type, which has more
bits than the binary encoding. Functions that decode signed (resp. unsigned)
8-bit or 16-bit integers represented by [int] values sign-extend (resp.
zero-extend) their result. *)
val get_int8 : 'fd t -> int -> int
val get_uint8 : 'fd t -> int -> int
(*
val get_uint16_ne : 'fd t -> int -> int
val get_uint16_le : 'fd t -> int -> int
val get_uint16_be : 'fd t -> int -> int
val get_int16_ne : 'fd t -> int -> int
val get_int16_le : 'fd t -> int -> int
val get_int16_be : 'fd t -> int -> int
val get_int32_ne : 'fd t -> int -> int32
val get_int32_le : 'fd t -> int -> int32
val get_int32_be : 'fd t -> int -> int32
val get_int64_ne : 'fd t -> int -> int64
val get_int64_le : 'fd t -> int -> int64
val get_int64_be : 'fd t -> int -> int64
*)
val get_string : 'fd t -> len:int -> int -> string
val get_seq : 'fd t -> int -> string Seq.t
val next : 'fd t -> slice -> slice option
val iter : 'fd t -> ?len:int -> fn:(int -> unit) -> int -> unit
val blit_to_bytes :
'fd t -> src_off:int -> bytes -> dst_off:int -> len:int -> unit
(*
val blit_to_bigstring : 'fd t -> src_off:int -> bigstring -> dst_off:int -> len:int -> unit
*)

8
lib/dune Normal file
View file

@ -0,0 +1,8 @@
(library
(name cachet)
(modes native)
; (foreign_stubs
; (language c)
; (mode byte)
; (names hash))
(modules cachet))

8
lib/hash.c Normal file
View file

@ -0,0 +1,8 @@
#include "caml/mlvalues.h"
#include "caml/hash.h"
#include "caml/alloc.h"
CAMLprim value
cachet_hash_mix_intnat(value h, value d) {
return caml_copy_int32(caml_hash_mix_intnat(Int32_val (h), Int_val (d)));
}

3
test/dune Normal file
View file

@ -0,0 +1,3 @@
(test
(name test)
(libraries cachet bigstringaf alcotest))

41
test/test.ml Normal file
View file

@ -0,0 +1,41 @@
let random ?g len =
let bstr = Bigstringaf.create len in
for i = 0 to len - 1 do
let chr =
match g with
| Some g -> Char.unsafe_chr (Random.State.bits g land 0xff)
| None -> Char.unsafe_chr (Random.bits () land 0xff)
in
Bigstringaf.set bstr i chr
done;
bstr
let make ?cachesize ?pagesize ?g len =
let bstr = random ?g len in
let map () ~pos len =
if pos < 0 || len < 0 || pos > Bigarray.Array1.dim bstr then
Printf.ksprintf invalid_arg "map ~pos:%d %d" pos len;
let len' = Int.min (Bigarray.Array1.dim bstr - pos) len in
Bigarray.Array1.sub bstr pos len'
in
(Cachet.make ?cachesize ?pagesize ~map (), bstr)
let test01 =
Alcotest.test_case "test01" `Quick @@ fun () ->
let t, oracle = make ~cachesize:0x100 ~pagesize:0x100 0xe000 in
let a = Cachet.get_uint8 t 0xdead in
let b = Char.code (Bigstringaf.get oracle 0xdead) in
Alcotest.(check int) "0xdead" a b;
let a = Cachet.get_string t 0xdead ~len:10 in
let b = Bigstringaf.substring oracle ~off:0xdead ~len:10 in
Alcotest.(check string) "0xdead" a b;
let a = Cachet.get_string t 0xdea0 ~len:10 in
let b = Bigstringaf.substring oracle ~off:0xdea0 ~len:10 in
Alcotest.(check string) "0xdea0" a b;
let a = Cachet.get_seq t 0 in
let b = Bigstringaf.to_string oracle in
let a = List.of_seq a in
let a = String.concat "" a in
Alcotest.(check string) "all" a b
let () = Alcotest.run "chat" [ ("simple", [ test01 ]) ]