initial commit

This commit is contained in:
Pizie Dust 2025-01-27 14:49:18 +01:00
commit f4de52a85e
11 changed files with 346 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
.vscode
_build
_opam

13
.ocamlformat Normal file
View file

@ -0,0 +1,13 @@
version=0.27.0
exp-grouping=preserve
break-infix=wrap-or-vertical
break-collection-expressions=wrap
break-sequences=false
break-infix-before-func=false
dock-collection-brackets=true
break-separators=before
field-space=tight
if-then-else=compact
break-sequences=false
sequence-blank-line=compact
exp-grouping=preserve

4
bin/dune Normal file
View file

@ -0,0 +1,4 @@
(executable
(public_name zim_header_inspect)
(name zim_header_inspect)
(libraries zim cstruct cmdliner unix))

69
bin/zim_header_inspect.ml Normal file
View file

@ -0,0 +1,69 @@
open Cmdliner
let print_zim_header zim =
Printf.printf "Zim Header:\n";
Printf.printf "\tmagic number: %lu\n" zim.Zim.Header.magic_number;
Printf.printf "\tmajor version: %d\n" zim.Zim.Header.major_version;
Printf.printf "\tminor version: %d\n" zim.Zim.Header.minor_version;
Printf.printf "\tuuid: %s\n" (Uuidm.to_string zim.Zim.Header.uuid);
Printf.printf "\tentry count: %lu\n" zim.Zim.Header.entry_count;
Printf.printf "\tcluster count: %lu\n" zim.Zim.Header.cluster_count;
Printf.printf "\tpath pointer position: %Lu\n" zim.Zim.Header.path_ptr_pos;
Printf.printf "\ttitle pointer position: %Lu\n" zim.Zim.Header.title_ptr_pos;
Printf.printf "\tcluster pointer position: %Lu\n"
zim.Zim.Header.cluster_ptr_pos;
Printf.printf "\tmime list position: %Lu\n" zim.Zim.Header.mime_list_pos;
Printf.printf "\tmain page: %lu\n" zim.Zim.Header.main_page;
Printf.printf "\tlayout page: %lu\n" zim.Zim.Header.layout_page;
Printf.printf "\tchecksum position: %Lu\n" zim.Zim.Header.checksum_pos
let print_int32_in_columns count lst =
List.iteri
(fun i zim ->
if i > 0 && i mod count = 0 then Printf.printf "\n";
Printf.printf "%10lu " zim)
lst;
Printf.printf "\n";
()
let print_zim zim =
print_zim_header zim.Zim.header;
Printf.printf "MIME Type List:\n";
List.iter (fun zim -> Printf.printf "\t%s\n" zim) zim.Zim.mime_type_list;
Printf.printf "Path Pointer List:\n";
print_int32_in_columns 10 zim.path_ptr_list;
Printf.printf "Title Pointer List:\n";
print_int32_in_columns 10 zim.title_ptr_list;
()
let really_input fd buf pos len =
let rec loop pos remaining =
if remaining > 0 then (
let len = Unix.read fd buf pos remaining in
if len = 0 then raise End_of_file;
loop (pos + len) (remaining - len))
in
loop pos len
let read_zims zims =
List.iter
(fun zim ->
let fd = Unix.openfile zim Unix.[ O_RDONLY; O_CLOEXEC ] 0 in
let f_stats = Unix.stat zim in
let size = f_stats.st_size in
let buf = Bytes.create size in
let () = really_input fd buf 0 size in
match Zim.unmarshal (Cstruct.of_bytes buf) with
| Ok zim -> print_zim zim
| Error err -> Printf.printf "%s" err)
zims
let zims = Arg.(non_empty & pos_all file [] & info [] ~docv:"zim files")
let cmd =
let doc = "Inspect the headers of one or more zim files." in
let info = Cmd.info "zim_header_inspect" ~version:"1.0.0" ~doc in
Cmd.v info Term.(const read_zims $ zims)
let main () = exit (Cmd.eval cmd)
let () = main ()

3
dune-project Normal file
View file

@ -0,0 +1,3 @@
(lang dune 3.17)
(name zim)
(generate_opam_files true)

4
lib/dune Normal file
View file

@ -0,0 +1,4 @@
(library
(public_name zim)
(name zim)
(libraries cstruct uuidm))

215
lib/zim.ml Normal file
View file

@ -0,0 +1,215 @@
(*
* Copyright (C) 2025 Pizie Dust
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)
(** https://wiki.openzim.org/wiki/ZIM_file_format*)
let ( let* ) = Result.bind
module Header = struct
type t = {
magic_number: int32
(** Magic number to recognise the file format, must be 72173914
(0x44D495A) *)
; major_version: int
; minor_version: int
; uuid: Uuidm.t
; entry_count: int32
; cluster_count: int32
; path_ptr_pos: int64
; title_ptr_pos: int64
; cluster_ptr_pos: int64
; mime_list_pos: int64
; main_page: int32
; layout_page: int32
; checksum_pos: int64
}
let magic_number_offset = 0
let major_version_offset = 4
let minor_version_offset = 6
let uuid_offset = 8
let entry_count_offset = 24
let cluster_count_offset = 28
let path_ptr_pos_offset = 32
let title_ptr_pos_offset = 40
let cluster_ptr_pos_offset = 48
let mime_list_pos_offset = 56
let main_page_offset = 64
let layout_page_offset = 68
let checksum_pos_offset = 72
let unmarshal buf =
let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
let* () =
match Int32.to_int magic_number with
| 72173914 -> Ok ()
| x -> Error (Printf.sprintf "Magic number is wrong', got '%d'" x)
in
let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
let uuid_bytes = Cstruct.sub buf uuid_offset 16 |> Cstruct.to_string in
let uuid = Option.get @@ Uuidm.of_mixed_endian_binary_string uuid_bytes in
let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
Ok
{
magic_number
; major_version
; minor_version
; uuid
; entry_count
; cluster_count
; path_ptr_pos
; title_ptr_pos
; cluster_ptr_pos
; mime_list_pos
; main_page
; layout_page
; checksum_pos
}
end
module Directory = struct
type entry = {
mime_type: int;
parameter_len: int;
namespace: char;
revision: int32;
path: string;
title: string;
}
type content_entry = {
entry: entry;
cluster_number: int32;
blob_number: int32;
}
type redirect_entry = {
entry: entry;
redirect_index: int32;
}
type t =
| Content of content_entry
| Redirect of redirect_entry
| LinkTarget
| DeletedEntry
let mime_type_offset = 0
let parameter_len_offset = 2
let namespace_offset = 3
let revision_offset = 4
let cluster_number_offset = 8
let blob_number_offset = 12
let path_offset = 16
let redirect_index_offset = 8
let parse_null_terminated_string buf offset =
let rec find_null i =
if Cstruct.get_uint8 buf i = 0 then i else find_null (i + 1)
in
let null_pos = find_null offset in
Cstruct.to_string (Cstruct.sub buf offset (null_pos - offset))
let unmarshal buf =
let mime_type = Cstruct.LE.get_uint16 buf mime_type_offset in
if mime_type = 0xFFFE || mime_type = 0xFFFD then
Ok DeletedEntry
else
let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
let namespace = Cstruct.get_char buf namespace_offset in
let revision = Cstruct.LE.get_uint32 buf revision_offset in
let path = parse_null_terminated_string buf path_offset in
let title_offset = path_offset + String.length path + 1 in
let title = parse_null_terminated_string buf title_offset in
let entry = { mime_type; parameter_len; namespace; revision; path; title } in
if mime_type = 0xFFFF then
let redirect_index =
Cstruct.LE.get_uint32 buf redirect_index_offset
in
Ok (Redirect { entry; redirect_index })
else
let cluster_number =
Cstruct.LE.get_uint32 buf cluster_number_offset
in
let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
Ok (Content { entry; cluster_number; blob_number })
end
type t = {
header: Header.t
; mime_type_list: string list
; path_ptr_list: int32 list
; title_ptr_list: int32 list
; cluster_ptr_list: int32 list
}
let read_zero_terminated_list buf start_pos =
let rec aux acc pos =
if pos >= Cstruct.length buf then List.rev acc
else
let str = Cstruct.to_string ~off:pos buf in
match String.index_opt str (Char.chr 0) with
| Some null_index ->
let mime_type = String.sub str 0 null_index in
let next_pos = pos + null_index + 1 in
if mime_type = "" then List.rev acc
else aux (mime_type :: acc) next_pos
| None -> List.rev acc
in
aux [] (Int64.to_int start_pos)
let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
let rec read_pointers acc pos =
if pos >= end_pos then List.rev acc
else
let ptr = Cstruct.LE.get_uint32 buf pos in
read_pointers (ptr :: acc) (pos + offset)
in
read_pointers [] start_pos
let unmarshal buf =
let* header = Header.unmarshal buf in
let mime_list_pos = header.mime_list_pos in
let mime_type_list = read_zero_terminated_list buf mime_list_pos in
let path_ptr_list =
read_ptr_list ~buf
~start_pos:(Int64.to_int header.path_ptr_pos)
~end_pos:(Int64.to_int header.title_ptr_pos)
~offset:8
in
let title_ptr_list =
read_ptr_list ~buf
~start_pos:(Int64.to_int header.title_ptr_pos)
~end_pos:(Int64.to_int header.cluster_ptr_pos)
~offset:4
in
Ok
{
header
; mime_type_list
; path_ptr_list
; title_ptr_list
; cluster_ptr_list= []
}

2
test/dune Normal file
View file

@ -0,0 +1,2 @@
(test
(name test_zim))

0
test/test_zim.ml Normal file
View file

BIN
wiktionary_sample_file.zim Normal file

Binary file not shown.

33
zim.opam Normal file
View file

@ -0,0 +1,33 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
synopsis: "A short synopsis"
description: "A longer description"
maintainer: ["Maintainer Name <maintainer@example.com>"]
authors: ["Author Name <author@example.com>"]
license: "LICENSE"
tags: ["add topics" "to describe" "your" "project"]
homepage: "https://github.com/username/reponame"
doc: "https://url/to/documentation"
bug-reports: "https://github.com/username/reponame/issues"
depends: [
"dune" {>= "3.17"}
"ocaml"
"uuidm"
"cstruct"
"odoc" {with-doc}
]
build: [
["dune" "subst"] {dev}
[
"dune"
"build"
"-p"
name
"-j"
jobs
"@install"
"@runtest" {with-test}
"@doc" {with-doc}
]
]
dev-repo: "git+https://github.com/username/reponame.git"