commit f4de52a85e5a8f7b20a8a869f685fd32175aa21e
Author: Pizie Dust
Date:   Mon Jan 27 14:49:18 2025 +0100

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..19b613e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.vscode
+_build
+_opam
\ No newline at end of file
diff --git a/.ocamlformat b/.ocamlformat
new file mode 100644
index 0000000..9db5b95
--- /dev/null
+++ b/.ocamlformat
@@ -0,0 +1,13 @@
+version=0.27.0
+exp-grouping=preserve
+break-infix=wrap-or-vertical
+break-collection-expressions=wrap
+break-sequences=false
+break-infix-before-func=false
+dock-collection-brackets=true
+break-separators=before
+field-space=tight
+if-then-else=compact
+break-sequences=false
+sequence-blank-line=compact
+exp-grouping=preserve
diff --git a/bin/dune b/bin/dune
new file mode 100644
index 0000000..962994d
--- /dev/null
+++ b/bin/dune
@@ -0,0 +1,4 @@
+(executable
+ (public_name zim_header_inspect)
+ (name zim_header_inspect)
+ (libraries zim cstruct uuidm cmdliner unix))
diff --git a/bin/zim_header_inspect.ml b/bin/zim_header_inspect.ml
new file mode 100644
index 0000000..678b756
--- /dev/null
+++ b/bin/zim_header_inspect.ml
@@ -0,0 +1,89 @@
+open Cmdliner
+
+(** [print_zim_header h] prints every field of the ZIM header [h] on stdout,
+    one field per line. Integer fields are printed as unsigned values. *)
+let print_zim_header h =
+  Printf.printf "Zim Header:\n";
+  Printf.printf "\tmagic number: %lu\n" h.Zim.Header.magic_number;
+  Printf.printf "\tmajor version: %d\n" h.Zim.Header.major_version;
+  Printf.printf "\tminor version: %d\n" h.Zim.Header.minor_version;
+  Printf.printf "\tuuid: %s\n" (Uuidm.to_string h.Zim.Header.uuid);
+  Printf.printf "\tentry count: %lu\n" h.Zim.Header.entry_count;
+  Printf.printf "\tcluster count: %lu\n" h.Zim.Header.cluster_count;
+  Printf.printf "\tpath pointer position: %Lu\n" h.Zim.Header.path_ptr_pos;
+  Printf.printf "\ttitle pointer position: %Lu\n" h.Zim.Header.title_ptr_pos;
+  Printf.printf "\tcluster pointer position: %Lu\n"
+    h.Zim.Header.cluster_ptr_pos;
+  Printf.printf "\tmime list position: %Lu\n" h.Zim.Header.mime_list_pos;
+  Printf.printf "\tmain page: %lu\n" h.Zim.Header.main_page;
+  Printf.printf "\tlayout page: %lu\n" h.Zim.Header.layout_page;
+  Printf.printf "\tchecksum position: %Lu\n" h.Zim.Header.checksum_pos
+
+(** [print_int32_in_columns count lst] prints the elements of [lst] as
+    unsigned 32-bit integers, [count] per row, followed by a newline. *)
+let print_int32_in_columns count lst =
+  List.iteri
+    (fun i ptr ->
+      if i > 0 && i mod count = 0 then Printf.printf "\n";
+      Printf.printf "%10lu " ptr)
+    lst;
+  Printf.printf "\n"
+
+(** [print_zim zim] prints the header, the MIME type list and the path and
+    title pointer lists of [zim] on stdout. *)
+let print_zim zim =
+  print_zim_header zim.Zim.header;
+  Printf.printf "MIME Type List:\n";
+  List.iter (fun mime -> Printf.printf "\t%s\n" mime) zim.Zim.mime_type_list;
+  Printf.printf "Path Pointer List:\n";
+  print_int32_in_columns 10 zim.Zim.path_ptr_list;
+  Printf.printf "Title Pointer List:\n";
+  print_int32_in_columns 10 zim.Zim.title_ptr_list
+
+(** [really_input fd buf pos len] reads exactly [len] bytes from [fd] into
+    [buf], starting at index [pos].
+    @raise End_of_file if [fd] is exhausted before [len] bytes were read. *)
+let really_input fd buf pos len =
+  let rec loop pos remaining =
+    if remaining > 0 then (
+      let n = Unix.read fd buf pos remaining in
+      if n = 0 then raise End_of_file;
+      loop (pos + n) (remaining - n))
+  in
+  loop pos len
+
+(** [read_file path] returns the whole content of the file at [path]. The
+    file descriptor is closed on every exit path, including exceptions. *)
+let read_file path =
+  let fd = Unix.openfile path Unix.[ O_RDONLY; O_CLOEXEC ] 0 in
+  Fun.protect
+    ~finally:(fun () -> Unix.close fd)
+    (fun () ->
+      (* Use [fstat] on the open descriptor so that the size describes the
+         file that is actually read, even if [path] is replaced meanwhile. *)
+      let size = (Unix.fstat fd).Unix.st_size in
+      let buf = Bytes.create size in
+      really_input fd buf 0 size;
+      buf)
+
+(** [read_zims zims] parses each file named in [zims] and prints its content
+    on stdout; a file that fails to parse is reported on stderr and the
+    remaining files are still processed. *)
+let read_zims zims =
+  List.iter
+    (fun path ->
+      match Zim.unmarshal (Cstruct.of_bytes (read_file path)) with
+      | Ok zim -> print_zim zim
+      | Error err -> Printf.eprintf "%s: %s\n" path err)
+    zims
+
+(** Positional command line arguments: one or more existing files. *)
+let zims = Arg.(non_empty & pos_all file [] & info [] ~docv:"zim files")
+
+let cmd =
+  let doc = "Inspect the headers of one or more zim files." in
+  let info = Cmd.info "zim_header_inspect" ~version:"1.0.0" ~doc in
+  Cmd.v info Term.(const read_zims $ zims)
+
+let main () = exit (Cmd.eval cmd)
+let () = main ()
diff --git a/dune-project b/dune-project
new file mode 100644
index 0000000..5a5718a
--- /dev/null
+++ b/dune-project
@@ -0,0 +1,3 @@
+(lang dune 3.17)
+(name zim)
+(generate_opam_files true)
\ No newline at end of file
diff --git a/lib/dune b/lib/dune
new file mode 100644
index 0000000..9874972
--- /dev/null
+++ b/lib/dune
@@ -0,0 +1,4 @@
+(library
+ (public_name zim)
+ (name zim)
+ (libraries cstruct uuidm))
diff --git a/lib/zim.ml b/lib/zim.ml
new file mode 100644
index 0000000..e766d56
--- /dev/null
+++ b/lib/zim.ml
@@ -0,0 +1,289 @@
+(*
+ * Copyright (C) 2025 Pizie Dust
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *)
+
+(** https://wiki.openzim.org/wiki/ZIM_file_format *)
+
+let ( let* ) = Result.bind
+
+module Header = struct
+  type t = {
+      magic_number: int32
+          (** Magic number to recognise the file format, must be 72173914
+              (0x44D495A) *)
+    ; major_version: int
+    ; minor_version: int
+    ; uuid: Uuidm.t
+    ; entry_count: int32
+    ; cluster_count: int32
+    ; path_ptr_pos: int64
+    ; title_ptr_pos: int64
+    ; cluster_ptr_pos: int64
+    ; mime_list_pos: int64
+    ; main_page: int32
+    ; layout_page: int32
+    ; checksum_pos: int64
+  }
+
+  let magic_number_offset = 0
+  let major_version_offset = 4
+  let minor_version_offset = 6
+  let uuid_offset = 8
+  let entry_count_offset = 24
+  let cluster_count_offset = 28
+  let path_ptr_pos_offset = 32
+  let title_ptr_pos_offset = 40
+  let cluster_ptr_pos_offset = 48
+  let mime_list_pos_offset = 56
+  let main_page_offset = 64
+  let layout_page_offset = 68
+  let checksum_pos_offset = 72
+
+  (** Number of bytes read by {!unmarshal}: the last field, [checksum_pos],
+      is a 64-bit value stored at offset 72. *)
+  let size = checksum_pos_offset + 8
+
+  (** Value the [magic_number] field must hold. *)
+  let expected_magic_number = 72173914l
+
+  (** [unmarshal buf] decodes the little-endian header stored at the start
+      of [buf]. Returns [Error] when [buf] is shorter than {!size}, when the
+      magic number is wrong or when the UUID cannot be decoded. *)
+  let unmarshal buf =
+    let* () =
+      if Cstruct.length buf >= size then Ok ()
+      else
+        Error
+          (Printf.sprintf "Buffer too short for a ZIM header: %d < %d bytes"
+             (Cstruct.length buf) size)
+    in
+    let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
+    let* () =
+      if Int32.equal magic_number expected_magic_number then Ok ()
+      else
+        Error (Printf.sprintf "Magic number is wrong, got '%lu'" magic_number)
+    in
+    let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
+    let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
+    let* uuid =
+      (* [Option.get] would raise here; report a decoding failure instead. *)
+      Cstruct.sub buf uuid_offset 16
+      |> Cstruct.to_string
+      |> Uuidm.of_mixed_endian_binary_string
+      |> Option.to_result ~none:"Invalid UUID in ZIM header"
+    in
+    let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
+    let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
+    let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
+    let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
+    let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
+    let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
+    let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
+    let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
+    let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
+    Ok
+      {
+        magic_number
+      ; major_version
+      ; minor_version
+      ; uuid
+      ; entry_count
+      ; cluster_count
+      ; path_ptr_pos
+      ; title_ptr_pos
+      ; cluster_ptr_pos
+      ; mime_list_pos
+      ; main_page
+      ; layout_page
+      ; checksum_pos
+      }
+end
+
+module Directory = struct
+  type entry = {
+      mime_type: int
+    ; parameter_len: int
+    ; namespace: char
+    ; revision: int32
+    ; path: string
+    ; title: string
+  }
+
+  type content_entry = {
+      entry: entry
+    ; cluster_number: int32
+    ; blob_number: int32
+  }
+
+  type redirect_entry = {entry: entry; redirect_index: int32}
+
+  type t =
+    | Content of content_entry
+    | Redirect of redirect_entry
+    | LinkTarget
+    | DeletedEntry
+
+  let mime_type_offset = 0
+  let parameter_len_offset = 2
+  let namespace_offset = 3
+  let revision_offset = 4
+  let cluster_number_offset = 8
+  let blob_number_offset = 12
+  let path_offset = 16
+  let redirect_index_offset = 8
+
+  (** Offset of the path in a redirect entry: it directly follows the 32-bit
+      redirect index, whereas in a content entry it follows the cluster and
+      blob numbers (see [path_offset]). *)
+  let redirect_path_offset = 12
+
+  (* Reserved values of the MIME type field. *)
+  let redirect_mime_type = 0xFFFF
+  let link_target_mime_type = 0xFFFE
+  let deleted_entry_mime_type = 0xFFFD
+
+  (** [parse_null_terminated_string buf offset] returns the bytes of [buf]
+      from [offset] up to, but excluding, the next zero byte.
+      @raise Invalid_argument if no zero byte is found before the end of
+      [buf]. *)
+  let parse_null_terminated_string buf offset =
+    let rec find_null i =
+      if Cstruct.get_uint8 buf i = 0 then i else find_null (i + 1)
+    in
+    let null_pos = find_null offset in
+    Cstruct.to_string ~off:offset ~len:(null_pos - offset) buf
+
+  (** [unmarshal buf] decodes the directory entry stored at the start of
+      [buf]. Returns [Error] when [buf] ends before the entry does. *)
+  let unmarshal buf =
+    let parse () =
+      let mime_type = Cstruct.LE.get_uint16 buf mime_type_offset in
+      if mime_type = link_target_mime_type then LinkTarget
+      else if mime_type = deleted_entry_mime_type then DeletedEntry
+      else
+        let is_redirect = (mime_type = redirect_mime_type) in
+        let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
+        let namespace = Cstruct.get_char buf namespace_offset in
+        let revision = Cstruct.LE.get_uint32 buf revision_offset in
+        (* The path starts right after the fixed-size fields, whose length
+           depends on the kind of entry. *)
+        let path_start =
+          if is_redirect then redirect_path_offset else path_offset
+        in
+        let path = parse_null_terminated_string buf path_start in
+        let title_start = path_start + String.length path + 1 in
+        let title = parse_null_terminated_string buf title_start in
+        let entry =
+          {mime_type; parameter_len; namespace; revision; path; title}
+        in
+        if is_redirect then
+          let redirect_index =
+            Cstruct.LE.get_uint32 buf redirect_index_offset
+          in
+          Redirect {entry; redirect_index}
+        else
+          let cluster_number =
+            Cstruct.LE.get_uint32 buf cluster_number_offset
+          in
+          let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
+          Content {entry; cluster_number; blob_number}
+    in
+    (* Cstruct accessors raise [Invalid_argument] on out-of-bounds reads. *)
+    match parse () with
+    | entry -> Ok entry
+    | exception Invalid_argument msg ->
+        Error (Printf.sprintf "Truncated directory entry: %s" msg)
+end
+
+type t = {
+    header: Header.t
+  ; mime_type_list: string list
+  ; path_ptr_list: int32 list
+  ; title_ptr_list: int32 list
+  ; cluster_ptr_list: int32 list
+}
+
+(** [read_zero_terminated_list buf start_pos] returns the zero-terminated
+    strings stored in [buf] from [start_pos] onwards. Reading stops at the
+    first empty string or at the end of [buf]; a final string that is not
+    zero-terminated is dropped. A [start_pos] outside of [buf] yields the
+    empty list. *)
+let read_zero_terminated_list buf start_pos =
+  let buf_len = Cstruct.length buf in
+  (* Index of the first zero byte at or after [i], if there is one. The
+     buffer is scanned in place rather than copying its tail to a string for
+     every element, which keeps the whole read linear in the buffer size. *)
+  let rec find_null i =
+    if i >= buf_len then None
+    else if Cstruct.get_uint8 buf i = 0 then Some i
+    else find_null (i + 1)
+  in
+  let rec aux acc pos =
+    match find_null pos with
+    | None -> List.rev acc
+    | Some null_pos ->
+        if null_pos = pos then List.rev acc
+        else
+          let str = Cstruct.to_string ~off:pos ~len:(null_pos - pos) buf in
+          aux (str :: acc) (null_pos + 1)
+  in
+  let start = Int64.to_int start_pos in
+  if start < 0 then [] else aux [] start
+
+(** [read_ptr_list ~buf ~start_pos ~end_pos ~offset] reads one little-endian
+    32-bit value every [offset] bytes of [buf], from [start_pos] up to, but
+    excluding, [end_pos]. Reading also stops where fewer than 4 bytes of
+    [buf] remain, so inconsistent positions yield a shorter list instead of
+    an exception.
+    @raise Invalid_argument if [offset] is not positive. *)
+let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
+  (* A non-positive stride would never reach [end_pos]. *)
+  if offset <= 0 then invalid_arg "Zim.read_ptr_list: offset must be positive";
+  let last_readable = Cstruct.length buf - 4 in
+  let rec read_pointers acc pos =
+    if pos < 0 || pos >= end_pos || pos > last_readable then List.rev acc
+    else read_pointers (Cstruct.LE.get_uint32 buf pos :: acc) (pos + offset)
+  in
+  read_pointers [] start_pos
+
+(** [unmarshal buf] decodes a whole ZIM file held in [buf]: the header, the
+    MIME type list and the path and title pointer lists. [cluster_ptr_list]
+    is not decoded yet and is always empty. *)
+let unmarshal buf =
+  let* header = Header.unmarshal buf in
+  let mime_type_list = read_zero_terminated_list buf header.mime_list_pos in
+  (* NOTE(review): the path pointers are read every 8 bytes but only their
+     low 32 bits are kept; presumably they are 64-bit values, in which case
+     [path_ptr_list] truncates them. Widening it changes the type of [t]. *)
+  let path_ptr_list =
+    read_ptr_list ~buf
+      ~start_pos:(Int64.to_int header.path_ptr_pos)
+      ~end_pos:(Int64.to_int header.title_ptr_pos)
+      ~offset:8
+  in
+  let title_ptr_list =
+    read_ptr_list ~buf
+      ~start_pos:(Int64.to_int header.title_ptr_pos)
+      ~end_pos:(Int64.to_int header.cluster_ptr_pos)
+      ~offset:4
+  in
+  Ok
+    {
+      header
+    ; mime_type_list
+    ; path_ptr_list
+    ; title_ptr_list
+    ; cluster_ptr_list= []
+    }
diff --git a/test/dune b/test/dune
new file mode 100644
index 0000000..3e00844
--- /dev/null
+++ b/test/dune
@@ -0,0 +1,2 @@
+(test
+ (name test_zim))
diff --git a/test/test_zim.ml b/test/test_zim.ml
new file mode 100644
index 0000000..e69de29
diff --git a/wiktionary_sample_file.zim b/wiktionary_sample_file.zim
new file mode 100644
index 0000000..bae1a39
Binary files /dev/null and b/wiktionary_sample_file.zim differ
diff --git a/zim.opam b/zim.opam
new file mode 100644
index 0000000..4c0e05f
--- /dev/null
+++ b/zim.opam
@@ -0,0 +1,33 @@
+# This file is generated by dune, edit dune-project instead
+opam-version: "2.0"
+synopsis: "A short synopsis"
+description: "A longer description"
+maintainer: ["Maintainer Name "]
+authors: ["Author Name "]
+license: "LICENSE"
+tags: ["add topics" "to describe" "your" "project"]
+homepage: "https://github.com/username/reponame"
+doc: "https://url/to/documentation"
+bug-reports: "https://github.com/username/reponame/issues"
+depends: [
+  "dune" {>= "3.17"}
+  "ocaml"
+  "uuidm"
+  "cstruct"
+  "odoc" {with-doc}
+]
+build: [
+  ["dune" "subst"] {dev}
+  [
+    "dune"
+    "build"
+    "-p"
+    name
+    "-j"
+    jobs
+    "@install"
+    "@runtest" {with-test}
+    "@doc" {with-doc}
+  ]
+]
+dev-repo: "git+https://github.com/username/reponame.git"