(*
 * Copyright (C) 2025 Pizie Dust
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *)

(** Reader for the ZIM file format.
    See https://wiki.openzim.org/wiki/ZIM_file_format *)

(* Result-chaining binding operator used by every [unmarshal] below. *)
let ( let* ) = Result.bind

module Header = struct
  type t =
    { magic_number: int32
          (** Magic number to recognise the file format, must be 72173914
              (0x44D495A) *)
    ; major_version: int
    ; minor_version: int
    ; uuid: Uuidm.t
    ; entry_count: int32
    ; cluster_count: int32
    ; path_ptr_pos: int64
    ; title_ptr_pos: int64
    ; cluster_ptr_pos: int64
    ; mime_list_pos: int64
    ; main_page: int32
    ; layout_page: int32
    ; checksum_pos: int64 }

  (* Byte offsets of each field from the start of the header. Every
     multi-byte field is read little-endian. *)
  let magic_number_offset = 0

  let major_version_offset = 4

  let minor_version_offset = 6

  let uuid_offset = 8

  let entry_count_offset = 24

  let cluster_count_offset = 28

  let path_ptr_pos_offset = 32

  let title_ptr_pos_offset = 40

  let cluster_ptr_pos_offset = 48

  let mime_list_pos_offset = 56

  let main_page_offset = 64

  let layout_page_offset = 68

  let checksum_pos_offset = 72

  (** Number of bytes [unmarshal] reads: the last field starts at offset 72
      and is 8 bytes wide. *)
  let size = checksum_pos_offset + 8

  (* Value the first four bytes must decode to. *)
  let expected_magic_number = 72173914l

  (** [unmarshal buf] decodes the header stored at the start of [buf].

      @return [Error msg] when [buf] holds fewer than [size] bytes, when the
      magic number does not match, or when the UUID bytes are rejected by
      [Uuidm]; [Ok header] otherwise. *)
  let unmarshal buf =
    (* Check the length once up front so that none of the fixed-offset reads
       below can raise [Invalid_argument] on a truncated buffer. *)
    let* () =
      let available = Cstruct.length buf in
      if available < size then
        Error
          (Printf.sprintf
             "Header is truncated: expected at least %d bytes, got %d" size
             available )
      else Ok ()
    in
    let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
    let* () =
      if Int32.equal magic_number expected_magic_number then Ok ()
      else
        Error
          (Printf.sprintf "Magic number is wrong, got '%d'"
             (Int32.to_int magic_number) )
    in
    let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
    let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
    let uuid_bytes = Cstruct.sub buf uuid_offset 16 |> Cstruct.to_string in
    let* uuid =
      Uuidm.of_mixed_endian_binary_string uuid_bytes
      |> Option.to_result ~none:"Header UUID is malformed"
    in
    let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
    let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
    let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
    let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
    let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
    let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
    let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
    let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
    let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
    Ok
      { magic_number
      ; major_version
      ; minor_version
      ; uuid
      ; entry_count
      ; cluster_count
      ; path_ptr_pos
      ; title_ptr_pos
      ; cluster_ptr_pos
      ; mime_list_pos
      ; main_page
      ; layout_page
      ; checksum_pos }
end

module Directory = struct
  (** Fields shared by content and redirect entries. *)
  type entry =
    { mime_type: int
    ; parameter_len: int
    ; namespace: char
    ; revision: int32
    ; path: string
    ; title: string }

  type content_entry =
    {entry: entry; cluster_number: int32; blob_number: int32}

  type redirect_entry = {entry: entry; redirect_index: int32}

  type t =
    | Content of content_entry
    | Redirect of redirect_entry
    | LinkTarget
    | DeletedEntry

  (* Byte offsets from the start of a directory entry. *)
  let mime_type_offset = 0

  let parameter_len_offset = 2

  let namespace_offset = 3

  let revision_offset = 4

  let cluster_number_offset = 8

  let blob_number_offset = 12

  (* Start of the path in a content entry: after the 4-byte cluster number
     at offset 8 and the 4-byte blob number at offset 12. *)
  let path_offset = 16

  let redirect_index_offset = 8

  (* Start of the path in a redirect entry: a redirect stores only the
     4-byte redirect index at offset 8 and has no blob number, so its path
     begins four bytes earlier than in a content entry. *)
  let redirect_path_offset = 12

  (** [parse_null_terminated_string buf offset] returns the bytes of [buf]
      from [offset] up to, and excluding, the first zero byte.

      @raise Invalid_argument (from [Cstruct]) when no zero byte exists at or
      after [offset], or when [offset] is outside [buf]. *)
  let parse_null_terminated_string buf offset =
    let rec terminator_from i =
      if Cstruct.get_uint8 buf i = 0 then i else terminator_from (i + 1)
    in
    let stop = terminator_from offset in
    Cstruct.sub buf offset (stop - offset) |> Cstruct.to_string

  (** [unmarshal buf] decodes the directory entry that starts at the
      beginning of [buf].

      The 16-bit mime type selects the entry kind: [0xFFFE] is a link
      target, [0xFFFD] a deleted entry, [0xFFFF] a redirect, and any other
      value a content entry.

      @return [Error msg] when [buf] ends before the entry is complete. *)
  let unmarshal buf =
    let decode () =
      match Cstruct.LE.get_uint16 buf mime_type_offset with
      | 0xFFFE -> LinkTarget
      | 0xFFFD -> DeletedEntry
      | mime_type ->
          let is_redirect = mime_type = 0xFFFF in
          let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
          let namespace = Cstruct.get_char buf namespace_offset in
          let revision = Cstruct.LE.get_uint32 buf revision_offset in
          let path_start =
            if is_redirect then redirect_path_offset else path_offset
          in
          let path = parse_null_terminated_string buf path_start in
          (* The title follows the path and its terminating zero byte. *)
          let title_start = path_start + String.length path + 1 in
          let title = parse_null_terminated_string buf title_start in
          let entry =
            {mime_type; parameter_len; namespace; revision; path; title}
          in
          if is_redirect then
            let redirect_index =
              Cstruct.LE.get_uint32 buf redirect_index_offset
            in
            Redirect {entry; redirect_index}
          else
            let cluster_number =
              Cstruct.LE.get_uint32 buf cluster_number_offset
            in
            let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
            Content {entry; cluster_number; blob_number}
    in
    (* Cstruct signals every out-of-bounds access with [Invalid_argument];
       report it through the result instead of letting it escape. *)
    match decode () with
    | parsed -> Ok parsed
    | exception Invalid_argument reason ->
        Error
          (Printf.sprintf "Directory entry is truncated or malformed: %s"
             reason )
end

type t =
  { header: Header.t
  ; mime_type_list: string list
  ; path_ptr_list: int32 list
  ; title_ptr_list: int32 list
  ; cluster_ptr_list: int32 list }

(** [read_zero_terminated_list buf start_pos] reads consecutive
    zero-terminated strings from [buf], beginning at byte [start_pos].

    Reading stops at the first empty string, at the end of [buf], or at a
    final string that has no terminator (that partial string is dropped).
    A [start_pos] at or beyond the end of [buf] yields [[]].

    @raise Invalid_argument (from [Cstruct]) when [start_pos] is negative. *)
let read_zero_terminated_list buf start_pos =
  let limit = Cstruct.length buf in
  (* Scan in place for the terminator rather than copying the remainder of
     the buffer into a string for every element. *)
  let rec terminator_from i =
    if i >= limit then None
    else if Cstruct.get_uint8 buf i = 0 then Some i
    else terminator_from (i + 1)
  in
  let rec collect acc pos =
    match terminator_from pos with
    | None -> List.rev acc
    | Some stop ->
        if stop = pos then List.rev acc
        else
          let item = Cstruct.to_string ~off:pos ~len:(stop - pos) buf in
          collect (item :: acc) (stop + 1)
  in
  collect [] (Int64.to_int start_pos)

(** [read_ptr_list ~buf ~start_pos ~end_pos ~offset] reads one little-endian
    32-bit value at every position [start_pos], [start_pos + offset], ...
    that is strictly below [end_pos], and returns them in file order.

    [offset] is the distance in bytes between consecutive reads. When it is
    larger than 4 only the first four bytes of each slot are returned.

    @raise Invalid_argument when there is something to read and [offset] is
    not positive (the walk would never terminate), or, from [Cstruct], when
    a read falls outside [buf]. *)
let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
  if start_pos < end_pos && offset <= 0 then
    invalid_arg "read_ptr_list: offset must be positive" ;
  let rec walk acc pos =
    if pos >= end_pos then List.rev acc
    else walk (Cstruct.LE.get_uint32 buf pos :: acc) (pos + offset)
  in
  walk [] start_pos

(** [unmarshal buf] decodes the header, the MIME type list, the path pointer
    list and the title pointer list of the ZIM archive held in [buf].

    Both pointer lists are sized from the header's entry count rather than
    from the position of whatever list happens to follow them.

    [cluster_ptr_list] is not read yet and is always [[]].

    NOTE(review): path pointers occupy 8 bytes each in the file, but
    [path_ptr_list] is an [int32 list], so only the low 32 bits of each
    pointer are kept. Widening the field would change the public type.

    @return [Error msg] when the header is invalid or a list does not fit
    inside [buf]. *)
let unmarshal buf =
  let* header = Header.unmarshal buf in
  let buf_len = Cstruct.length buf in
  (* The entry count is an unsigned 32-bit value stored in an [int32]. *)
  let* entry_count =
    Int32.unsigned_to_int header.Header.entry_count
    |> Option.to_result ~none:"Entry count does not fit in a native int"
  in
  (* Reads [entry_count] slots of [width] bytes starting at [pos], after
     checking that the whole list lies inside [buf]. *)
  let ptr_list ~name ~pos ~width =
    let out_of_range () =
      Error
        (Printf.sprintf
           "%s pointer list at %Ld with %d entries does not fit in a \
            %d-byte buffer"
           name pos entry_count buf_len )
    in
    if
      Int64.compare pos 0L < 0
      || Int64.compare pos (Int64.of_int buf_len) > 0
    then out_of_range ()
    else
      let start_pos = Int64.to_int pos in
      (* Compare with a division so the check cannot overflow. *)
      if entry_count > (buf_len - start_pos) / width then out_of_range ()
      else
        let end_pos = start_pos + (entry_count * width) in
        Ok (read_ptr_list ~buf ~start_pos ~end_pos ~offset:width)
  in
  let* () =
    if Int64.compare header.Header.mime_list_pos 0L < 0 then
      Error "MIME type list position is out of range"
    else Ok ()
  in
  let mime_type_list =
    read_zero_terminated_list buf header.Header.mime_list_pos
  in
  let* path_ptr_list =
    ptr_list ~name:"Path" ~pos:header.Header.path_ptr_pos ~width:8
  in
  let* title_ptr_list =
    ptr_list ~name:"Title" ~pos:header.Header.title_ptr_pos ~width:4
  in
  Ok
    { header
    ; mime_type_list
    ; path_ptr_list
    ; title_ptr_list
    ; cluster_ptr_list= [] }