215 lines
6.7 KiB
OCaml
215 lines
6.7 KiB
OCaml
(*
|
|
* Copyright (C) 2025 Pizie Dust
|
|
*
|
|
* Permission to use, copy, modify, and distribute this software for any
|
|
* purpose with or without fee is hereby granted, provided that the above
|
|
* copyright notice and this permission notice appear in all copies.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
*)
|
|
|
|
(** https://wiki.openzim.org/wiki/ZIM_file_format*)
|
|
|
|
let ( let* ) = Result.bind
|
|
|
|
module Header = struct
|
|
type t = {
|
|
magic_number: int32
|
|
(** Magic number to recognise the file format, must be 72173914
|
|
(0x44D495A) *)
|
|
; major_version: int
|
|
; minor_version: int
|
|
; uuid: Uuidm.t
|
|
; entry_count: int32
|
|
; cluster_count: int32
|
|
; path_ptr_pos: int64
|
|
; title_ptr_pos: int64
|
|
; cluster_ptr_pos: int64
|
|
; mime_list_pos: int64
|
|
; main_page: int32
|
|
; layout_page: int32
|
|
; checksum_pos: int64
|
|
}
|
|
|
|
let magic_number_offset = 0
|
|
let major_version_offset = 4
|
|
let minor_version_offset = 6
|
|
let uuid_offset = 8
|
|
let entry_count_offset = 24
|
|
let cluster_count_offset = 28
|
|
let path_ptr_pos_offset = 32
|
|
let title_ptr_pos_offset = 40
|
|
let cluster_ptr_pos_offset = 48
|
|
let mime_list_pos_offset = 56
|
|
let main_page_offset = 64
|
|
let layout_page_offset = 68
|
|
let checksum_pos_offset = 72
|
|
|
|
let unmarshal buf =
|
|
let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
|
|
let* () =
|
|
match Int32.to_int magic_number with
|
|
| 72173914 -> Ok ()
|
|
| x -> Error (Printf.sprintf "Magic number is wrong', got '%d'" x)
|
|
in
|
|
let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
|
|
let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
|
|
let uuid_bytes = Cstruct.sub buf uuid_offset 16 |> Cstruct.to_string in
|
|
let uuid = Option.get @@ Uuidm.of_mixed_endian_binary_string uuid_bytes in
|
|
let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
|
|
let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
|
|
let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
|
|
let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
|
|
let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
|
|
let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
|
|
let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
|
|
let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
|
|
let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
|
|
Ok
|
|
{
|
|
magic_number
|
|
; major_version
|
|
; minor_version
|
|
; uuid
|
|
; entry_count
|
|
; cluster_count
|
|
; path_ptr_pos
|
|
; title_ptr_pos
|
|
; cluster_ptr_pos
|
|
; mime_list_pos
|
|
; main_page
|
|
; layout_page
|
|
; checksum_pos
|
|
}
|
|
end
|
|
|
|
module Directory = struct
|
|
type entry = {
|
|
mime_type: int;
|
|
parameter_len: int;
|
|
namespace: char;
|
|
revision: int32;
|
|
path: string;
|
|
title: string;
|
|
}
|
|
|
|
type content_entry = {
|
|
entry: entry;
|
|
cluster_number: int32;
|
|
blob_number: int32;
|
|
}
|
|
|
|
type redirect_entry = {
|
|
entry: entry;
|
|
redirect_index: int32;
|
|
}
|
|
|
|
type t =
|
|
| Content of content_entry
|
|
| Redirect of redirect_entry
|
|
| LinkTarget
|
|
| DeletedEntry
|
|
|
|
let mime_type_offset = 0
|
|
let parameter_len_offset = 2
|
|
let namespace_offset = 3
|
|
let revision_offset = 4
|
|
let cluster_number_offset = 8
|
|
let blob_number_offset = 12
|
|
let path_offset = 16
|
|
let redirect_index_offset = 8
|
|
|
|
let parse_null_terminated_string buf offset =
|
|
let rec find_null i =
|
|
if Cstruct.get_uint8 buf i = 0 then i else find_null (i + 1)
|
|
in
|
|
let null_pos = find_null offset in
|
|
Cstruct.to_string (Cstruct.sub buf offset (null_pos - offset))
|
|
|
|
let unmarshal buf =
|
|
let mime_type = Cstruct.LE.get_uint16 buf mime_type_offset in
|
|
if mime_type = 0xFFFE || mime_type = 0xFFFD then
|
|
Ok DeletedEntry
|
|
else
|
|
let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
|
|
let namespace = Cstruct.get_char buf namespace_offset in
|
|
let revision = Cstruct.LE.get_uint32 buf revision_offset in
|
|
let path = parse_null_terminated_string buf path_offset in
|
|
let title_offset = path_offset + String.length path + 1 in
|
|
let title = parse_null_terminated_string buf title_offset in
|
|
let entry = { mime_type; parameter_len; namespace; revision; path; title } in
|
|
if mime_type = 0xFFFF then
|
|
let redirect_index =
|
|
Cstruct.LE.get_uint32 buf redirect_index_offset
|
|
in
|
|
Ok (Redirect { entry; redirect_index })
|
|
else
|
|
let cluster_number =
|
|
Cstruct.LE.get_uint32 buf cluster_number_offset
|
|
in
|
|
let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
|
|
Ok (Content { entry; cluster_number; blob_number })
|
|
end
|
|
|
|
type t = {
|
|
header: Header.t
|
|
; mime_type_list: string list
|
|
; path_ptr_list: int32 list
|
|
; title_ptr_list: int32 list
|
|
; cluster_ptr_list: int32 list
|
|
}
|
|
|
|
let read_zero_terminated_list buf start_pos =
|
|
let rec aux acc pos =
|
|
if pos >= Cstruct.length buf then List.rev acc
|
|
else
|
|
let str = Cstruct.to_string ~off:pos buf in
|
|
match String.index_opt str (Char.chr 0) with
|
|
| Some null_index ->
|
|
let mime_type = String.sub str 0 null_index in
|
|
let next_pos = pos + null_index + 1 in
|
|
if mime_type = "" then List.rev acc
|
|
else aux (mime_type :: acc) next_pos
|
|
| None -> List.rev acc
|
|
in
|
|
aux [] (Int64.to_int start_pos)
|
|
|
|
let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
|
|
let rec read_pointers acc pos =
|
|
if pos >= end_pos then List.rev acc
|
|
else
|
|
let ptr = Cstruct.LE.get_uint32 buf pos in
|
|
read_pointers (ptr :: acc) (pos + offset)
|
|
in
|
|
read_pointers [] start_pos
|
|
|
|
let unmarshal buf =
|
|
let* header = Header.unmarshal buf in
|
|
let mime_list_pos = header.mime_list_pos in
|
|
let mime_type_list = read_zero_terminated_list buf mime_list_pos in
|
|
let path_ptr_list =
|
|
read_ptr_list ~buf
|
|
~start_pos:(Int64.to_int header.path_ptr_pos)
|
|
~end_pos:(Int64.to_int header.title_ptr_pos)
|
|
~offset:8
|
|
in
|
|
let title_ptr_list =
|
|
read_ptr_list ~buf
|
|
~start_pos:(Int64.to_int header.title_ptr_pos)
|
|
~end_pos:(Int64.to_int header.cluster_ptr_pos)
|
|
~offset:4
|
|
in
|
|
Ok
|
|
{
|
|
header
|
|
; mime_type_list
|
|
; path_ptr_list
|
|
; title_ptr_list
|
|
; cluster_ptr_list= []
|
|
}
|