ocaml-zim/lib/zim.ml
2025-01-27 14:49:18 +01:00

215 lines
6.7 KiB
OCaml

(*
* Copyright (C) 2025 Pizie Dust
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)
(** https://wiki.openzim.org/wiki/ZIM_file_format*)
let ( let* ) = Result.bind
module Header = struct
type t = {
magic_number: int32
(** Magic number to recognise the file format, must be 72173914
(0x44D495A) *)
; major_version: int
; minor_version: int
; uuid: Uuidm.t
; entry_count: int32
; cluster_count: int32
; path_ptr_pos: int64
; title_ptr_pos: int64
; cluster_ptr_pos: int64
; mime_list_pos: int64
; main_page: int32
; layout_page: int32
; checksum_pos: int64
}
let magic_number_offset = 0
let major_version_offset = 4
let minor_version_offset = 6
let uuid_offset = 8
let entry_count_offset = 24
let cluster_count_offset = 28
let path_ptr_pos_offset = 32
let title_ptr_pos_offset = 40
let cluster_ptr_pos_offset = 48
let mime_list_pos_offset = 56
let main_page_offset = 64
let layout_page_offset = 68
let checksum_pos_offset = 72
let unmarshal buf =
let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
let* () =
match Int32.to_int magic_number with
| 72173914 -> Ok ()
| x -> Error (Printf.sprintf "Magic number is wrong', got '%d'" x)
in
let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
let uuid_bytes = Cstruct.sub buf uuid_offset 16 |> Cstruct.to_string in
let uuid = Option.get @@ Uuidm.of_mixed_endian_binary_string uuid_bytes in
let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
Ok
{
magic_number
; major_version
; minor_version
; uuid
; entry_count
; cluster_count
; path_ptr_pos
; title_ptr_pos
; cluster_ptr_pos
; mime_list_pos
; main_page
; layout_page
; checksum_pos
}
end
module Directory = struct
type entry = {
mime_type: int;
parameter_len: int;
namespace: char;
revision: int32;
path: string;
title: string;
}
type content_entry = {
entry: entry;
cluster_number: int32;
blob_number: int32;
}
type redirect_entry = {
entry: entry;
redirect_index: int32;
}
type t =
| Content of content_entry
| Redirect of redirect_entry
| LinkTarget
| DeletedEntry
let mime_type_offset = 0
let parameter_len_offset = 2
let namespace_offset = 3
let revision_offset = 4
let cluster_number_offset = 8
let blob_number_offset = 12
let path_offset = 16
let redirect_index_offset = 8
let parse_null_terminated_string buf offset =
let rec find_null i =
if Cstruct.get_uint8 buf i = 0 then i else find_null (i + 1)
in
let null_pos = find_null offset in
Cstruct.to_string (Cstruct.sub buf offset (null_pos - offset))
let unmarshal buf =
let mime_type = Cstruct.LE.get_uint16 buf mime_type_offset in
if mime_type = 0xFFFE || mime_type = 0xFFFD then
Ok DeletedEntry
else
let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
let namespace = Cstruct.get_char buf namespace_offset in
let revision = Cstruct.LE.get_uint32 buf revision_offset in
let path = parse_null_terminated_string buf path_offset in
let title_offset = path_offset + String.length path + 1 in
let title = parse_null_terminated_string buf title_offset in
let entry = { mime_type; parameter_len; namespace; revision; path; title } in
if mime_type = 0xFFFF then
let redirect_index =
Cstruct.LE.get_uint32 buf redirect_index_offset
in
Ok (Redirect { entry; redirect_index })
else
let cluster_number =
Cstruct.LE.get_uint32 buf cluster_number_offset
in
let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
Ok (Content { entry; cluster_number; blob_number })
end
type t = {
header: Header.t
; mime_type_list: string list
; path_ptr_list: int32 list
; title_ptr_list: int32 list
; cluster_ptr_list: int32 list
}
let read_zero_terminated_list buf start_pos =
let rec aux acc pos =
if pos >= Cstruct.length buf then List.rev acc
else
let str = Cstruct.to_string ~off:pos buf in
match String.index_opt str (Char.chr 0) with
| Some null_index ->
let mime_type = String.sub str 0 null_index in
let next_pos = pos + null_index + 1 in
if mime_type = "" then List.rev acc
else aux (mime_type :: acc) next_pos
| None -> List.rev acc
in
aux [] (Int64.to_int start_pos)
let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
let rec read_pointers acc pos =
if pos >= end_pos then List.rev acc
else
let ptr = Cstruct.LE.get_uint32 buf pos in
read_pointers (ptr :: acc) (pos + offset)
in
read_pointers [] start_pos
let unmarshal buf =
let* header = Header.unmarshal buf in
let mime_list_pos = header.mime_list_pos in
let mime_type_list = read_zero_terminated_list buf mime_list_pos in
let path_ptr_list =
read_ptr_list ~buf
~start_pos:(Int64.to_int header.path_ptr_pos)
~end_pos:(Int64.to_int header.title_ptr_pos)
~offset:8
in
let title_ptr_list =
read_ptr_list ~buf
~start_pos:(Int64.to_int header.title_ptr_pos)
~end_pos:(Int64.to_int header.cluster_ptr_pos)
~offset:4
in
Ok
{
header
; mime_type_list
; path_ptr_list
; title_ptr_list
; cluster_ptr_list= []
}