From d12c8d79b8e5b4fe95b791253abdf868a05eab68 Mon Sep 17 00:00:00 2001 From: Calascibetta Romain Date: Thu, 7 Nov 2024 20:11:22 +0100 Subject: [PATCH] First commit --- .gitignore | 6 + .ocamlformat | 13 +++ README.md | 30 +++++ dune-project | 2 + lib/cachet.ml | 290 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/cachet.mli | 158 +++++++++++++++++++++++++++ lib/dune | 8 ++ lib/hash.c | 8 ++ test/dune | 3 + test/test.ml | 41 +++++++ 10 files changed, 559 insertions(+) create mode 100644 .gitignore create mode 100644 .ocamlformat create mode 100644 README.md create mode 100644 dune-project create mode 100644 lib/cachet.ml create mode 100644 lib/cachet.mli create mode 100644 lib/dune create mode 100644 lib/hash.c create mode 100644 test/dune create mode 100644 test/test.ml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..62f2453 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +_build/ +*~ +*.install +.merlin +_opam +.envrc diff --git a/.ocamlformat b/.ocamlformat new file mode 100644 index 0000000..3e2548b --- /dev/null +++ b/.ocamlformat @@ -0,0 +1,13 @@ +version=0.26.2 +exp-grouping=preserve +break-infix=wrap-or-vertical +break-collection-expressions=wrap +break-sequences=false +break-infix-before-func=false +dock-collection-brackets=true +break-separators=before +field-space=tight +if-then-else=compact +break-sequences=false +sequence-blank-line=compact +exp-grouping=preserve diff --git a/README.md b/README.md new file mode 100644 index 0000000..77af4c1 --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +# Cachet, a simple cache system for `mmap` + +Cachet is a small library that provides a simple cache system for page-by-page +read access on a block device. The cache system requires a map function, which +can correspond to [Unix.map_file]. + +Here's a simple example using `Unix.map_file`: +```ocaml +let shared = true +let empty = Bigarray.Array1.create Bigarray.char Bigarray.c_layout 0 + +let map fd ~pos len = + let stat = Unix.fstat fd in + let len = Int.min len (stat.Unix.st_size - pos) in + if pos < stat.Unix.st_size + then let barr = Unix.map_file fd ~pos:(Int64.of_int pos) + Bigarray.char Bigarray.c_layout shared [| len |] in + Bigarray.array1_of_genarray barr + else empty + +external getpagesize : unit -> int = "unix_getpagesize" [@noalloc] + +let () = + let fd = Unix.openfile "disk.img" Unix.[ O_RDONLY ] 0o644 in + let finally () = Unix.close fd in + Fun.protect ~finally @@ fun () -> + let cache = Cachet.make ~pagesize:(getpagesize ()) ~map fd in + let seq = Cachet.get_seq cache 0 in + ... +``` diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..eaf11b9 --- /dev/null +++ b/dune-project @@ -0,0 +1,2 @@ +(lang dune 3.5) +(using mode_specific_stubs 0.1) diff --git a/lib/cachet.ml b/lib/cachet.ml new file mode 100644 index 0000000..cc34e8a --- /dev/null +++ b/lib/cachet.ml @@ -0,0 +1,290 @@ +type bigstring = + (char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t + +external swap16 : int -> int = "%bswap16" +external swap32 : int32 -> int32 = "%bswap_int32" +external swap64 : int64 -> int64 = "%bswap_int64" + +module Bstr = struct + type t = bigstring + + let of_bigstring x = x + let length = Bigarray.Array1.dim + + external get : t -> int -> char = "%caml_ba_ref_1" + external get_uint8 : t -> int -> int = "%caml_ba_ref_1" + external get_uint16_ne : t -> int -> int = "%caml_bigstring_get16" + external get_int32_ne : t -> int -> int32 = "%caml_bigstring_get32" + external get_int64_ne : t -> int -> int64 = "%caml_bigstring_get64" + + let get_int8 bstr i = + (get_uint8 bstr i lsl (Sys.int_size - 8)) asr (Sys.int_size - 8) + + let get_uint16_le bstr i = + if Sys.big_endian then swap16 (get_uint16_ne bstr i) + else get_uint16_ne bstr i + + let get_uint16_be bstr i = + if not Sys.big_endian then swap16 (get_uint16_ne bstr i) + else get_uint16_ne bstr i + + let get_int16_ne bstr i = + (get_uint16_ne bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16) + + let get_int16_le bstr i = + (get_uint16_le bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16) + + let get_int16_be bstr i = + (get_uint16_be bstr i lsl (Sys.int_size - 16)) asr (Sys.int_size - 16) + + let get_int32_le bstr i = + if Sys.big_endian then swap32 (get_int32_ne bstr i) else get_int32_ne bstr i + + let get_int32_be bstr i = + if not Sys.big_endian then swap32 (get_int32_ne bstr i) + else get_int32_ne bstr i + + let get_int64_le bstr i = + if Sys.big_endian then swap64 (get_int64_ne bstr i) else get_int64_ne bstr i + + let get_int64_be bstr i = + if not Sys.big_endian then swap64 (get_int64_ne bstr i) + else get_int64_ne bstr i + + let sub t ~off ~len = Bigarray.Array1.sub t off len + + let blit_to_bytes bstr ~src_off dst ~dst_off ~len = + if + len < 0 + || src_off < 0 + || src_off > length bstr - len + || dst_off < 0 + || dst_off > Bytes.length dst - len + then invalid_arg "Cachet.Bstr.blit_to_bytes"; + let len0 = len land 3 in + let len1 = len lsr 2 in + for i = 0 to len1 - 1 do + let i = i * 4 in + let v = get_int32_ne bstr (src_off + i) in + Bytes.set_int32_ne dst (dst_off + i) v + done; + for i = 0 to len0 - 1 do + let i = (len1 * 4) + i in + let v = get_uint8 bstr (src_off + i) in + Bytes.set_uint8 dst (dst_off + i) v + done + + let sub_string bstr ~off ~len = + let buf = Bytes.create len in + blit_to_bytes bstr ~src_off:off buf ~dst_off:0 ~len; + Bytes.unsafe_to_string buf + + let to_string bstr = sub_string bstr ~off:0 ~len:(length bstr) + let is_empty bstr = length bstr == 0 +end + +external hash : (int32[@unboxed]) -> int -> (int32[@unboxed]) + = "cachet_hash_mix_intnat" "caml_hash_mix_intnat" +[@@noalloc] + +let hash h d = Int32.to_int (hash h d) +let failwithf fmt = Format.ksprintf (fun str -> failwith str) fmt + +type slice = { offset: int; length: int; payload: bigstring } + +let pp_slice ppf { offset; length; _ } = + Format.fprintf ppf "{ @[offset= %x;@ length= %d;@] }" offset length + +(* Counter Trailing Zero *) +let unsafe_ctz n = + let t = ref 1 in + let r = ref 0 in + while n land !t = 0 do + t := !t lsl 1; + incr r + done; + !r + +let bstr_of_slice ?(logical_address = 0) { offset; length; payload } = + if logical_address < 0 then invalid_arg "Cachet.bstr_of_slice"; + if logical_address == 0 || logical_address == offset then payload + else if logical_address > offset + length then + invalid_arg "Cachet.bstr_of_slice" + else + let pagesize = unsafe_ctz offset in + let off = logical_address land ((pagesize lsl 1) - 1) in + let len = length - off in + Bstr.sub payload ~off ~len + +type metrics = { mutable cache_hit: int; mutable cache_miss: int } + +let metrics () = { cache_hit= 0; cache_miss= 0 } + +type 'fd t = { + arr: slice option array + ; fd: 'fd + ; map: 'fd map + ; pagesize: int + ; cachesize: int + ; metrics: metrics +} + +and 'fd map = 'fd -> pos:int -> int -> bigstring + +let fd { fd; _ } = fd + +let copy t = + { + arr= Array.make (1 lsl t.cachesize) None + ; fd= t.fd + ; map= t.map + ; pagesize= t.pagesize + ; cachesize= t.cachesize + ; metrics= metrics () + } + +(* XXX(dinosaure): power of two. *) +let pot x = x land (x - 1) == 0 && x != 0 + +let make ?(cachesize = 1 lsl 10) ?(pagesize = 1 lsl 12) ~map fd = + if pot cachesize = false || pot pagesize = false then + invalid_arg "Chat.make: cachesize or pagesize must be a power of two"; + let arr = Array.make cachesize None in + let pagesize = unsafe_ctz pagesize in + let cachesize = unsafe_ctz cachesize in + let metrics = metrics () in + { arr; fd; map; pagesize; cachesize; metrics } + +let load t logical_address = + let page = logical_address lsr t.pagesize in + let payload = t.map t.fd ~pos:(page lsl t.pagesize) (1 lsl t.pagesize) in + let length = Bigarray.Array1.dim payload in + let slice = { offset= page lsl t.pagesize; length; payload } in + let hash = hash 0l slice.offset land ((1 lsl t.cachesize) - 1) in + t.arr.(hash) <- Some slice; + slice + +let none : slice option = None +let cache_miss t = t.metrics.cache_miss +let cache_hit t = t.metrics.cache_hit + +let load t ?(len = 1) logical_address = + if len > 1 lsl t.pagesize then + invalid_arg "Cachet.load: you can not load more than a page"; + if logical_address < 0 then + invalid_arg "Cachet.load: a logical address must be positive"; + let page = logical_address lsr t.pagesize in + let hash = hash 0l (page lsl t.pagesize) land ((1 lsl t.cachesize) - 1) in + let offset = logical_address land ((t.pagesize lsl 1) - 1) in + match t.arr.(hash) with + | Some slice as value when slice.offset == page lsl t.pagesize -> + t.metrics.cache_hit <- t.metrics.cache_hit + 1; + if slice.length - offset >= len then value else none + | Some _ | None -> + t.metrics.cache_miss <- t.metrics.cache_miss + 1; + let slice = load t logical_address in + if slice.length - offset >= len then Some slice else none + +let invalidate t ~off:logical_address ~len = + if logical_address < 0 || len < 0 then + invalid_arg + "Cachet.invalidate: the logical address and/or the number of bytes to \ + invalid must be positives"; + let start_page = logical_address lsr t.pagesize in + let end_page = (logical_address + len) lsr t.pagesize in + let mask = (1 lsl t.cachesize) - 1 in + for i = start_page to end_page - 1 do + t.arr.(hash 0l (i lsl t.pagesize) land mask) <- None + done + +let is_aligned x = x land ((1 lsl 2) - 1) == 0 + +let get_uint8 t logical_address = + match load t ~len:1 logical_address with + | Some { payload; _ } -> + let offset = logical_address land ((1 lsl t.pagesize) - 1) in + Bstr.get_uint8 payload offset + | None -> failwithf "Cachet.get_uint8" + +let get_int8 t logical_address = + (get_uint8 t logical_address lsl (Sys.int_size - 8)) asr (Sys.int_size - 8) + +let blit_to_bytes t ~src_off:logical_address buf ~dst_off ~len = + if len < 0 || dst_off < 0 || dst_off > Bytes.length buf - len then + invalid_arg "Cachet.blit_to_bytes"; + let off = logical_address land ((1 lsl t.pagesize) - 1) in + if is_aligned off && (1 lsl t.pagesize) - off >= len then begin + match load t ~len logical_address with + | None -> failwithf "Cachet.blit_to_bytes" + | Some slice -> + Bstr.blit_to_bytes slice.payload ~src_off:off buf ~dst_off:0 ~len + end + else + for i = 0 to len - 1 do + let v = get_uint8 t (logical_address + i) in + Bytes.set_uint8 buf (dst_off + i) v + done + +let get_string t ~len logical_address = + let buf = Bytes.create len in + blit_to_bytes t ~src_off:logical_address buf ~dst_off:0 ~len; + Bytes.unsafe_to_string buf + +let rec get_seq t logical_address () = + match load t logical_address with + | Some { offset; payload; length; _ } -> + let off = logical_address land ((1 lsl t.pagesize) - 1) in + let len = length - off in + let buf = Bytes.create len in + Bstr.blit_to_bytes payload ~src_off:off buf ~dst_off:0 ~len; + let str = Bytes.unsafe_to_string buf in + let next = get_seq t (offset + (1 lsl t.pagesize)) in + Seq.Cons (str, next) + | None -> Seq.Nil + +let next t slice = load t (slice.offset + (1 lsl t.pagesize)) + +let naive_iter_with_len t len ~fn logical_address = + for i = 0 to len - 1 do + fn (get_uint8 t (logical_address + i)) + done + +let iter_with_len t len ~fn logical_address = + if len > 1 lsl t.pagesize then naive_iter_with_len t len ~fn logical_address + else begin + match load t logical_address with + | Some { offset; payload; length } -> + let off = logical_address land ((1 lsl t.pagesize) - 1) in + let max = Int.min (length - off) len in + for i = 0 to max - 1 do + fn (Bstr.get_uint8 payload (off + i)) + done; + if max < len then begin + let logical_address = offset + (1 lsl t.pagesize) in + match load t logical_address with + | Some { payload; length; _ } -> + if len - max > length then failwith "Chat.iter_with_len"; + for i = 0 to len - max - 1 do + fn (Bstr.get_uint8 payload i) + done + | None -> failwith "Chat.iter_with_len" + end + | None -> failwith "Chat.iter_with_len" + end + +let iter t ?len ~fn logical_address = + match len with + | Some len -> iter_with_len t len ~fn logical_address + | None -> + let rec go logical_address = + match load t logical_address with + | Some { offset; payload; length } -> + let off = logical_address land ((1 lsl t.pagesize) - 1) in + let len = length - off in + for i = 0 to len - 1 do + fn (Bstr.get_uint8 payload (off + i)) + done; + go (offset + (1 lsl t.pagesize)) + | None -> () + in + go logical_address diff --git a/lib/cachet.mli b/lib/cachet.mli new file mode 100644 index 0000000..411bc0d --- /dev/null +++ b/lib/cachet.mli @@ -0,0 +1,158 @@ +type bigstring = + (char, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t + +module Bstr : sig + (** A read-only bigstring. *) + + type t = private bigstring + + val of_bigstring : bigstring -> t + val length : t -> int + val get : t -> int -> char + val get_int8 : t -> int -> int + val get_uint8 : t -> int -> int + val get_int16_ne : t -> int -> int + val get_int16_le : t -> int -> int + val get_int16_be : t -> int -> int + val get_int32_ne : t -> int -> int32 + val get_int32_le : t -> int -> int32 + val get_int32_be : t -> int -> int32 + val get_int64_ne : t -> int -> int64 + val get_int64_le : t -> int -> int64 + val get_int64_be : t -> int -> int64 + val sub : t -> off:int -> len:int -> t + val sub_string : t -> off:int -> len:int -> string + val to_string : t -> string + + val blit_to_bytes : + t -> src_off:int -> bytes -> dst_off:int -> len:int -> unit + + val is_empty : t -> bool + + (* + val is_prefix : affix:string -> t -> bool + val is_infix : affix:string -> t -> bool + val is_suffix : affix:string -> t -> bool + val for_all : (char -> bool) -> t -> bool + val exists : (char -> bool) -> t -> bool + val equal : t -> t -> bool + val compare : t -> t -> int + val with_range : ?first:int -> ?len:int -> t -> t + val with_index_range : ?first:int -> ?last:int -> t -> t + val trim : ?drop:(char -> bool) -> t -> t + val span : ?rev:bool -> ?min:int -> ?max:int -> ?sat:(char -> bool) -> t -> t * t + val take : ?rev:bool -> ?min:int -> ?max:int -> ?sat:(char -> bool) -> t -> t + val drop : ?rev:bool -> ?min:int -> ?max:int -> ?sat:(char -> bool) -> t -> t + val cut : ?rev:bool -> sep:string -> t -> (t * t) option + val cuts : ?rev:bool -> ?empty:bool -> sep:string -> t -> t list + *) +end + +type slice = private { offset: int; length: int; payload: Bstr.t } +(** A slice is an aligned segment of bytes (according to the [pagesize] + specified by the cache, see {!val:make}) with its absolute position into the + underlying {i block-device} and size. *) + +val pp_slice : Format.formatter -> slice -> unit +val bstr_of_slice : ?logical_address:int -> slice -> Bstr.t + +type 'fd map = 'fd -> pos:int -> int -> bigstring +(** A value [map : 'fd map] when applied [map fd ~pos len] reads a + {!type:bigstring} at [pos]. [map] must return as much data as is available, + though never more than [len] bytes. [map] never fails. Instead, an empty + [bigstring] must be returned if e.g. the position is out of range. + Depending on how the cache is configured (see {!val:make}), [map] never + read more than [pagesize] bytes. *) + +(** {2: Note about schedulers and [Cachet].} + + [Cachet] assumes that {!type:map} is {b atomic}, in other words: {!type:map} + is a unit of work that is indivisible and guaranteed to be executed as a + single, coherent, and uninterrupted operation. + + In this way, the [map] function is considered as a "direct" computation that + does {b not} interact with a scheduler. However, reading a page can take + time. It may therefore be necessary to add a cooperation point after + {!val:load} or the user-friendly functions. + + These functions can read one or more pages. {!val:load} reads one page at + most. *) + +type 'fd t + +val fd : 'fd t -> 'fd + +val cache_hit : 'fd t -> int +(** [cache_hit t] is the number of times a load hit the cache. *) + +val cache_miss : 'fd t -> int +(** [cache_miss t] is the number of times a load didn't hit the cache. *) + +val copy : 'fd t -> 'fd t +(** [copy t] creates a new, empty cache using the same [map] function. *) + +val make : ?cachesize:int -> ?pagesize:int -> map:'fd map -> 'fd -> 'fd t +(** [make ~cachesize ~pagesize ~map fd] creates a new, empty cache using [map] + and [fd] for reading [pagesize] bytes. The size of the cache is [cachesize]. + + @raise Invalid_argument if either [cachesize] or [pagesize] is not a power + of two. *) + +val load : 'fd t -> ?len:int -> int -> slice option +(** [load t ~len logical_address] loads a page at the given [logical_address] + and returns a {!type:slice}. [len] (defaults to [1]) is the expected + minimum number of bytes returned. + + If the slice does not contains, at least, [len] bytes, [load] returns [None]. + [load t ~len:0 logical_address] always returns an empty slice. *) + +val invalidate : 'fd t -> off:int -> len:int -> unit +(** [invalidate t ~off ~len] invalidates the cache on [len] bytes from [off]. *) + +(** {2 User friendly functions.} *) + +(** {3 Binary decoding of integers.} + + The functions in this section binary decode integers from byte sequences. + + All following functions raise [Invalid_argument] if the space needed at + index [i] to decode the integer is not available. + + Little-endian (resp. big-endian) encoding means that least (resp. most) + significant bytes are stored first. Big-endian is also known as network byte + order. Native-endian encoding is either little-endian or big-endian + depending on {!Sys.big_endian}. + + 32-bit and 64-bit integers are represented by the [int] type, which has more + bits than the binary encoding. Functions that decode signed (resp. unsigned) + 8-bit or 16-bit integers represented by [int] values sign-extend (resp. + zero-extend) their result. *) + +val get_int8 : 'fd t -> int -> int +val get_uint8 : 'fd t -> int -> int +(* +val get_uint16_ne : 'fd t -> int -> int +val get_uint16_le : 'fd t -> int -> int +val get_uint16_be : 'fd t -> int -> int +val get_int16_ne : 'fd t -> int -> int +val get_int16_le : 'fd t -> int -> int +val get_int16_be : 'fd t -> int -> int +val get_int32_ne : 'fd t -> int -> int32 +val get_int32_le : 'fd t -> int -> int32 +val get_int32_be : 'fd t -> int -> int32 +val get_int64_ne : 'fd t -> int -> int64 +val get_int64_le : 'fd t -> int -> int64 +val get_int64_be : 'fd t -> int -> int64 +*) + +val get_string : 'fd t -> len:int -> int -> string +val get_seq : 'fd t -> int -> string Seq.t +val next : 'fd t -> slice -> slice option +val iter : 'fd t -> ?len:int -> fn:(int -> unit) -> int -> unit + +val blit_to_bytes : + 'fd t -> src_off:int -> bytes -> dst_off:int -> len:int -> unit + +(* +val blit_to_bigstring : 'fd t -> src_off:int -> bigstring -> dst_off:int -> len:int -> unit +*) diff --git a/lib/dune b/lib/dune new file mode 100644 index 0000000..8e62221 --- /dev/null +++ b/lib/dune @@ -0,0 +1,8 @@ +(library + (name cachet) + (modes native) + ; (foreign_stubs + ; (language c) + ; (mode byte) + ; (names hash)) + (modules cachet)) diff --git a/lib/hash.c b/lib/hash.c new file mode 100644 index 0000000..efdc560 --- /dev/null +++ b/lib/hash.c @@ -0,0 +1,8 @@ +#include "caml/mlvalues.h" +#include "caml/hash.h" +#include "caml/alloc.h" + +CAMLprim value +cachet_hash_mix_intnat(value h, value d) { + return caml_copy_int32(caml_hash_mix_intnat(Int32_val (h), Int_val (d))); +} diff --git a/test/dune b/test/dune new file mode 100644 index 0000000..d62c811 --- /dev/null +++ b/test/dune @@ -0,0 +1,3 @@ +(test + (name test) + (libraries cachet bigstringaf alcotest)) diff --git a/test/test.ml b/test/test.ml new file mode 100644 index 0000000..c832f4f --- /dev/null +++ b/test/test.ml @@ -0,0 +1,41 @@ +let random ?g len = + let bstr = Bigstringaf.create len in + for i = 0 to len - 1 do + let chr = + match g with + | Some g -> Char.unsafe_chr (Random.State.bits g land 0xff) + | None -> Char.unsafe_chr (Random.bits () land 0xff) + in + Bigstringaf.set bstr i chr + done; + bstr + +let make ?cachesize ?pagesize ?g len = + let bstr = random ?g len in + let map () ~pos len = + if pos < 0 || len < 0 || pos > Bigarray.Array1.dim bstr then + Printf.ksprintf invalid_arg "map ~pos:%d %d" pos len; + let len' = Int.min (Bigarray.Array1.dim bstr - pos) len in + Bigarray.Array1.sub bstr pos len' + in + (Cachet.make ?cachesize ?pagesize ~map (), bstr) + +let test01 = + Alcotest.test_case "test01" `Quick @@ fun () -> + let t, oracle = make ~cachesize:0x100 ~pagesize:0x100 0xe000 in + let a = Cachet.get_uint8 t 0xdead in + let b = Char.code (Bigstringaf.get oracle 0xdead) in + Alcotest.(check int) "0xdead" a b; + let a = Cachet.get_string t 0xdead ~len:10 in + let b = Bigstringaf.substring oracle ~off:0xdead ~len:10 in + Alcotest.(check string) "0xdead" a b; + let a = Cachet.get_string t 0xdea0 ~len:10 in + let b = Bigstringaf.substring oracle ~off:0xdea0 ~len:10 in + Alcotest.(check string) "0xdea0" a b; + let a = Cachet.get_seq t 0 in + let b = Bigstringaf.to_string oracle in + let a = List.of_seq a in + let a = String.concat "" a in + Alcotest.(check string) "all" a b + +let () = Alcotest.run "chat" [ ("simple", [ test01 ]) ]