initial commit
This commit is contained in:
commit
f4de52a85e
11 changed files with 346 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
.vscode
|
||||||
|
_build
|
||||||
|
_opam
|
13
.ocamlformat
Normal file
13
.ocamlformat
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
version=0.27.0
|
||||||
|
exp-grouping=preserve
|
||||||
|
break-infix=wrap-or-vertical
|
||||||
|
break-collection-expressions=wrap
|
||||||
|
break-sequences=false
|
||||||
|
break-infix-before-func=false
|
||||||
|
dock-collection-brackets=true
|
||||||
|
break-separators=before
|
||||||
|
field-space=tight
|
||||||
|
if-then-else=compact
|
||||||
|
break-sequences=false
|
||||||
|
sequence-blank-line=compact
|
||||||
|
exp-grouping=preserve
|
4
bin/dune
Normal file
4
bin/dune
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
; Command-line tool that prints the header and pointer lists of ZIM files.
(executable
 (public_name zim_header_inspect)
 (name zim_header_inspect)
 ; uuidm is listed explicitly because zim_header_inspect.ml calls
 ; Uuidm.to_string itself; relying on it being pulled in transitively through
 ; zim breaks as soon as implicit transitive dependencies are disabled.
 (libraries zim cstruct cmdliner unix uuidm))
|
69
bin/zim_header_inspect.ml
Normal file
69
bin/zim_header_inspect.ml
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
open Cmdliner
|
||||||
|
|
||||||
|
(** [print_zim_header zim] writes every field of the ZIM header [zim] to
    standard output: a title line followed by one tab-indented line per
    field. 32-bit fields are printed as unsigned decimals with [%lu], 64-bit
    fields with [%Lu], and the UUID through [Uuidm.to_string]. *)
let print_zim_header zim =
  (* Destructure once so that each output line below names a single field. *)
  let {
    Zim.Header.magic_number
  ; major_version
  ; minor_version
  ; uuid
  ; entry_count
  ; cluster_count
  ; path_ptr_pos
  ; title_ptr_pos
  ; cluster_ptr_pos
  ; mime_list_pos
  ; main_page
  ; layout_page
  ; checksum_pos
  } =
    zim
  in
  print_string "Zim Header:\n";
  Printf.printf "\tmagic number: %lu\n" magic_number;
  Printf.printf "\tmajor version: %d\n" major_version;
  Printf.printf "\tminor version: %d\n" minor_version;
  Printf.printf "\tuuid: %s\n" (Uuidm.to_string uuid);
  Printf.printf "\tentry count: %lu\n" entry_count;
  Printf.printf "\tcluster count: %lu\n" cluster_count;
  Printf.printf "\tpath pointer position: %Lu\n" path_ptr_pos;
  Printf.printf "\ttitle pointer position: %Lu\n" title_ptr_pos;
  Printf.printf "\tcluster pointer position: %Lu\n" cluster_ptr_pos;
  Printf.printf "\tmime list position: %Lu\n" mime_list_pos;
  Printf.printf "\tmain page: %lu\n" main_page;
  Printf.printf "\tlayout page: %lu\n" layout_page;
  Printf.printf "\tchecksum position: %Lu\n" checksum_pos
|
||||||
|
|
||||||
|
(** [print_int32_in_columns count lst] prints the [int32] values of [lst] to
    standard output as unsigned decimals, each right-aligned in a 10-character
    field followed by a space. A line break is emitted before every element
    whose index is a non-zero multiple of [count], and once more after the
    last element.

    @raise Division_by_zero
      if [count] is [0] and [lst] has at least two elements. *)
let print_int32_in_columns count lst =
  let rec print_from index = function
    | [] -> ()
    | value :: rest ->
      (* Row breaks go before the first cell of every row except the first. *)
      let starts_new_row = index > 0 && index mod count = 0 in
      if starts_new_row then print_string "\n";
      Printf.printf "%10lu " value;
      print_from (index + 1) rest
  in
  print_from 0 lst;
  print_string "\n"
|
||||||
|
|
||||||
|
(** [print_zim zim] prints a parsed ZIM file to standard output: the header
    (via [print_zim_header]), the MIME type list with one tab-indented entry
    per line, then the path and title pointer lists in rows of ten values.
    The cluster pointer list is not printed. *)
let print_zim zim =
  let {
    Zim.header
  ; mime_type_list
  ; path_ptr_list
  ; title_ptr_list
  ; cluster_ptr_list= _
  } =
    zim
  in
  let columns = 10 in
  print_zim_header header;
  print_string "MIME Type List:\n";
  List.iter (fun mime_type -> Printf.printf "\t%s\n" mime_type) mime_type_list;
  print_string "Path Pointer List:\n";
  print_int32_in_columns columns path_ptr_list;
  print_string "Title Pointer List:\n";
  print_int32_in_columns columns title_ptr_list
|
||||||
|
|
||||||
|
(** [really_input fd buf pos len] fills [buf] from index [pos] with exactly
    [len] bytes read from the descriptor [fd], calling [Unix.read] repeatedly
    until the requested amount has arrived. Does nothing when [len <= 0].

    @raise End_of_file if [Unix.read] returns [0] before [len] bytes were read. *)
let really_input fd buf pos len =
  let stop = pos + len in
  let cursor = ref pos in
  while !cursor < stop do
    (* A single read may return fewer bytes than requested, so keep asking
       for whatever is still missing. *)
    match Unix.read fd buf !cursor (stop - !cursor) with
    | 0 -> raise End_of_file
    | received -> cursor := !cursor + received
  done
|
||||||
|
|
||||||
|
(** [read_zims zims] loads each file named in [zims] fully into memory, parses
    it with [Zim.unmarshal] and prints the result with [print_zim]. A file
    that fails to parse is reported on standard error, prefixed with its
    path, and processing continues with the next file.

    Each file descriptor is closed as soon as its contents have been read,
    including when reading fails, so descriptors are not leaked when many
    files are inspected.

    @raise Unix.Unix_error if a file cannot be opened, inspected or read.
    @raise End_of_file if a file shrinks while it is being read. *)
let read_zims zims =
  (* Reads the whole file at [path] and guarantees the descriptor is closed. *)
  let load path =
    let fd = Unix.openfile path Unix.[ O_RDONLY; O_CLOEXEC ] 0 in
    Fun.protect
      ~finally:(fun () -> Unix.close fd)
      (fun () ->
        (* Query the size through the open descriptor rather than the path so
           that the size always describes the file actually being read. *)
        let size = (Unix.fstat fd).Unix.st_size in
        let buf = Bytes.create size in
        really_input fd buf 0 size;
        buf)
  in
  let inspect path =
    match Zim.unmarshal (Cstruct.of_bytes (load path)) with
    | Ok zim -> print_zim zim
    | Error err -> Printf.eprintf "%s: %s\n" path err
  in
  List.iter inspect zims
|
||||||
|
|
||||||
|
(** Positional command-line arguments: a non-empty list of existing file
    paths, documented under the placeholder name given by [~docv]. *)
let zims =
  let open Arg in
  non_empty & pos_all file [] & info [] ~docv:"zim files"

(** The [zim_header_inspect] command: applies [read_zims] to the positional
    file arguments. *)
let cmd =
  let term = Term.(const read_zims $ zims) in
  let info =
    Cmd.info "zim_header_inspect" ~version:"1.0.0"
      ~doc:"Inspect the headers of one or more zim files."
  in
  Cmd.v info term

(** Evaluates [cmd] against the process arguments and exits with the status
    code returned by [Cmd.eval]. *)
let main () = cmd |> Cmd.eval |> exit

let () = main ()
|
3
dune-project
Normal file
3
dune-project
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
(lang dune 3.17)
|
||||||
|
(name zim)
|
||||||
|
(generate_opam_files true)
|
4
lib/dune
Normal file
4
lib/dune
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
(library
|
||||||
|
(public_name zim)
|
||||||
|
(name zim)
|
||||||
|
(libraries cstruct uuidm))
|
215
lib/zim.ml
Normal file
215
lib/zim.ml
Normal file
|
@ -0,0 +1,215 @@
|
||||||
|
(*
|
||||||
|
* Copyright (C) 2025 Pizie Dust
|
||||||
|
*
|
||||||
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
|
* purpose with or without fee is hereby granted, provided that the above
|
||||||
|
* copyright notice and this permission notice appear in all copies.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
*)
|
||||||
|
|
||||||
|
(** Reader for the ZIM file format. The format is described at
    https://wiki.openzim.org/wiki/ZIM_file_format *)
|
||||||
|
|
||||||
|
(* Binding operator for [result]: [let* x = e in body] evaluates [body] with
   [x] bound to the payload when [e] is [Ok _], and otherwise returns the
   [Error _] unchanged (this is [Result.bind]). *)
let ( let* ) = Result.bind
|
||||||
|
|
||||||
|
module Header = struct
  (** Fixed-size header located at the start of the buffer. Every multi-byte
      field is decoded as little-endian. Unsigned 32-bit and 64-bit values are
      stored in [int32] and [int64], so print them with unsigned formats. *)
  type t = {
      magic_number: int32
          (** Magic number to recognise the file format, must be 72173914
              (0x44D495A) *)
    ; major_version: int  (** 16-bit value read at byte offset 4. *)
    ; minor_version: int  (** 16-bit value read at byte offset 6. *)
    ; uuid: Uuidm.t
          (** 16 bytes at offset 8, decoded as a mixed-endian binary UUID. *)
    ; entry_count: int32  (** Unsigned 32-bit value read at offset 24. *)
    ; cluster_count: int32  (** Unsigned 32-bit value read at offset 28. *)
    ; path_ptr_pos: int64  (** Unsigned 64-bit value read at offset 32. *)
    ; title_ptr_pos: int64  (** Unsigned 64-bit value read at offset 40. *)
    ; cluster_ptr_pos: int64  (** Unsigned 64-bit value read at offset 48. *)
    ; mime_list_pos: int64  (** Unsigned 64-bit value read at offset 56. *)
    ; main_page: int32  (** Unsigned 32-bit value read at offset 64. *)
    ; layout_page: int32  (** Unsigned 32-bit value read at offset 68. *)
    ; checksum_pos: int64  (** Unsigned 64-bit value read at offset 72. *)
  }

  let magic_number_offset = 0
  let major_version_offset = 4
  let minor_version_offset = 6
  let uuid_offset = 8
  let entry_count_offset = 24
  let cluster_count_offset = 28
  let path_ptr_pos_offset = 32
  let title_ptr_pos_offset = 40
  let cluster_ptr_pos_offset = 48
  let mime_list_pos_offset = 56
  let main_page_offset = 64
  let layout_page_offset = 68
  let checksum_pos_offset = 72

  (** Length in bytes of the UUID field. *)
  let uuid_length = 16

  (** Number of bytes covered by the header: the last field is the 8-byte
      checksum position at offset 72. *)
  let size = checksum_pos_offset + 8

  (** Expected value of {!t.magic_number}. *)
  let magic = 72173914l

  (** [unmarshal buf] decodes the header stored in the first {!size} bytes of
      [buf].

      @return
        [Error msg] when [buf] is shorter than {!size}, when the magic number
        differs from {!magic}, or when the UUID bytes are rejected by
        [Uuidm]; [Ok header] otherwise. *)
  let unmarshal buf =
    let len = Cstruct.length buf in
    (* Check the length up front: the Cstruct getters below raise
       [Invalid_argument] on out-of-bounds access, which would bypass the
       [result] this function promises to its callers. *)
    let* () =
      if len >= size then Ok ()
      else
        Error
          (Printf.sprintf "Header is truncated: need %d bytes, got %d" size len)
    in
    let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
    (* Compare as [int32] and report with an unsigned format so that values
       with the high bit set are not shown as negative numbers. *)
    let* () =
      if Int32.equal magic_number magic then Ok ()
      else
        Error
          (Printf.sprintf "Magic number is wrong, got '%lu'" magic_number)
    in
    let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
    let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
    let uuid_bytes =
      Cstruct.sub buf uuid_offset uuid_length |> Cstruct.to_string
    in
    let* uuid =
      match Uuidm.of_mixed_endian_binary_string uuid_bytes with
      | Some uuid -> Ok uuid
      | None -> Error "UUID is malformed"
    in
    let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
    let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
    let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
    let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
    let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
    let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
    let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
    let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
    let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
    Ok
      {
        magic_number
      ; major_version
      ; minor_version
      ; uuid
      ; entry_count
      ; cluster_count
      ; path_ptr_pos
      ; title_ptr_pos
      ; cluster_ptr_pos
      ; mime_list_pos
      ; main_page
      ; layout_page
      ; checksum_pos
      }
end
|
||||||
|
|
||||||
|
module Directory = struct
  (** Fields shared by content and redirect directory entries. *)
  type entry = {
      mime_type: int  (** 16-bit little-endian value at offset 0. *)
    ; parameter_len: int  (** Byte at offset 2. *)
    ; namespace: char  (** Byte at offset 3. *)
    ; revision: int32  (** Unsigned 32-bit little-endian value at offset 4. *)
    ; path: string  (** Null-terminated string, stored without the null. *)
    ; title: string
          (** Null-terminated string that immediately follows [path]. *)
  }

  (** Entry that points at a blob inside a cluster. *)
  type content_entry = {
      entry: entry
    ; cluster_number: int32
    ; blob_number: int32
  }

  (** Entry that points at another directory entry. *)
  type redirect_entry = {
      entry: entry
    ; redirect_index: int32
  }

  type t =
    | Content of content_entry
    | Redirect of redirect_entry
    | LinkTarget  (** MIME type field is [0xFFFE]. *)
    | DeletedEntry  (** MIME type field is [0xFFFD]. *)

  let mime_type_offset = 0
  let parameter_len_offset = 2
  let namespace_offset = 3
  let revision_offset = 4
  let cluster_number_offset = 8
  let blob_number_offset = 12

  (** Offset of the path in a content entry, after the 4-byte cluster number
      and the 4-byte blob number. *)
  let path_offset = 16

  let redirect_index_offset = 8

  (** Offset of the path in a redirect entry: only the 4-byte redirect index
      sits between the revision and the path, so the path starts 4 bytes
      earlier than in a content entry. *)
  let redirect_path_offset = 12

  (** [parse_null_terminated_string_opt buf offset] is the string that starts
      at [offset] in [buf] and ends just before the next null byte, or [None]
      when [offset] is negative or no null byte exists at or after [offset]. *)
  let parse_null_terminated_string_opt buf offset =
    let len = Cstruct.length buf in
    let rec find_null i =
      if i >= len then None
      else if Cstruct.get_uint8 buf i = 0 then Some i
      else find_null (i + 1)
    in
    if offset < 0 then None
    else
      match find_null offset with
      | Some null_pos ->
        Some (Cstruct.to_string (Cstruct.sub buf offset (null_pos - offset)))
      | None -> None

  (** Same as {!parse_null_terminated_string_opt}, kept for callers that
      expect a bare string.

      @raise Invalid_argument if no terminating null byte is found. *)
  let parse_null_terminated_string buf offset =
    match parse_null_terminated_string_opt buf offset with
    | Some s -> s
    | None ->
      invalid_arg
        "Zim.Directory.parse_null_terminated_string: missing null terminator"

  (** [unmarshal buf] decodes the directory entry that starts at offset 0 of
      [buf]. The 16-bit MIME type field selects the variant: [0xFFFF] is a
      redirect, [0xFFFE] a link target, [0xFFFD] a deleted entry, and any
      other value a content entry.

      @return
        [Error msg] when [buf] is too short for the fixed-size fields or when
        the path or title is not null-terminated inside [buf]. *)
  let unmarshal buf =
    let len = Cstruct.length buf in
    let require needed =
      if len >= needed then Ok ()
      else
        Error
          (Printf.sprintf "Directory entry is truncated: need %d bytes, got %d"
             needed len)
    in
    let string_at ~what offset =
      match parse_null_terminated_string_opt buf offset with
      | Some s -> Ok s
      | None ->
        Error (Printf.sprintf "Directory entry %s is not null-terminated" what)
    in
    let* () = require (mime_type_offset + 2) in
    match Cstruct.LE.get_uint16 buf mime_type_offset with
    | 0xFFFE -> Ok LinkTarget
    | 0xFFFD -> Ok DeletedEntry
    | mime_type ->
      let is_redirect = mime_type = 0xFFFF in
      (* The fixed-size part of an entry ends where its path begins, and that
         position depends on the entry kind. *)
      let entry_path_offset =
        if is_redirect then redirect_path_offset else path_offset
      in
      let* () = require entry_path_offset in
      let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
      let namespace = Cstruct.get_char buf namespace_offset in
      let revision = Cstruct.LE.get_uint32 buf revision_offset in
      let* path = string_at ~what:"path" entry_path_offset in
      (* The title starts right after the null byte that ends the path. *)
      let title_offset = entry_path_offset + String.length path + 1 in
      let* title = string_at ~what:"title" title_offset in
      let entry =
        { mime_type; parameter_len; namespace; revision; path; title }
      in
      if is_redirect then
        let redirect_index = Cstruct.LE.get_uint32 buf redirect_index_offset in
        Ok (Redirect { entry; redirect_index })
      else
        let cluster_number = Cstruct.LE.get_uint32 buf cluster_number_offset in
        let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
        Ok (Content { entry; cluster_number; blob_number })
end
|
||||||
|
|
||||||
|
(** In-memory view of the parts of a ZIM file decoded by this module. *)
type t = {
    header: Header.t  (** Decoded file header. *)
  ; mime_type_list: string list
        (** Strings read from [header.mime_list_pos], in file order. *)
  ; path_ptr_list: int32 list
        (** 32-bit values read every 8 bytes from [header.path_ptr_pos]. *)
  ; title_ptr_list: int32 list
        (** 32-bit values read every 4 bytes from [header.title_ptr_pos]. *)
  ; cluster_ptr_list: int32 list
        (** Not decoded by [unmarshal] in this file, which always stores the
            empty list here. *)
}
|
||||||
|
|
||||||
|
(** [read_zero_terminated_list buf start_pos] reads consecutive
    null-terminated strings from [buf], starting at byte [start_pos], and
    returns them in file order. Reading stops at the first empty string, at
    the end of [buf], or at a trailing string that has no null terminator
    (that unterminated string is dropped).

    The buffer is scanned in place, so only the bytes of each returned string
    are copied. A [start_pos] that is negative or does not fit in an [int]
    yields the empty list, like any other position past the end of [buf]. *)
let read_zero_terminated_list buf start_pos =
  let len = Cstruct.length buf in
  let rec find_null i =
    if i >= len then None
    else if Cstruct.get_uint8 buf i = 0 then Some i
    else find_null (i + 1)
  in
  let rec collect acc pos =
    match find_null pos with
    | None -> List.rev acc
    | Some null_pos ->
      (* A null byte found right at [pos] is the empty string that marks the
         end of the list. *)
      if null_pos = pos then List.rev acc
      else
        let item = Cstruct.to_string ~off:pos ~len:(null_pos - pos) buf in
        collect (item :: acc) (null_pos + 1)
  in
  match Int64.unsigned_to_int start_pos with
  | Some pos -> collect [] pos
  | None -> []
|
||||||
|
|
||||||
|
(** [read_ptr_list ~buf ~start_pos ~end_pos ~offset] reads one little-endian
    32-bit value from [buf] at [start_pos], [start_pos + offset],
    [start_pos + 2 * offset], and so on, for every position strictly below
    [end_pos], and returns the values in file order. [offset] is the distance
    in bytes between two consecutive reads; exactly 4 bytes are decoded at
    each position whatever the value of [offset].

    Raises whatever [Cstruct.LE.get_uint32] raises when a position lies
    outside [buf]. *)
let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
  let next pos =
    if pos >= end_pos then None
    else Some (Cstruct.LE.get_uint32 buf pos, pos + offset)
  in
  Seq.unfold next start_pos |> List.of_seq
|
||||||
|
|
||||||
|
(** [unmarshal buf] decodes the header, the MIME type list and the path and
    title pointer lists of the ZIM file held in [buf].

    Both pointer lists are sized from [header.entry_count] (8 bytes per path
    pointer, 4 bytes per title pointer) rather than from the position of
    whichever section happens to follow them, and every range is checked
    against the length of [buf] before it is read.

    @return
      [Error msg] when the header is invalid or when a section described by
      the header does not fit inside [buf]. [cluster_ptr_list] is not decoded
      and is always empty. *)
let unmarshal buf =
  let* header = Header.unmarshal buf in
  let buf_len = Cstruct.length buf in
  (* Byte range [start, stop) of a list of [entry_count] pointers of [width]
     bytes starting at [pos], or an error if it overflows the buffer. The
     count is compared by division so the check itself cannot overflow. *)
  let ptr_range ~what ~pos ~width =
    let range =
      match
        ( Int64.unsigned_to_int pos
        , Int32.unsigned_to_int header.Header.entry_count )
      with
      | Some start_pos, Some count ->
        if start_pos <= buf_len && count <= (buf_len - start_pos) / width then
          Some (start_pos, start_pos + (count * width))
        else None
      | None, _ | _, None -> None
    in
    match range with
    | Some range -> Ok range
    | None ->
      Error (Printf.sprintf "%s pointer list does not fit in the file" what)
  in
  let* () =
    match Int64.unsigned_to_int header.Header.mime_list_pos with
    | Some _ -> Ok ()
    | None -> Error "MIME type list position is out of range"
  in
  let mime_type_list =
    read_zero_terminated_list buf header.Header.mime_list_pos
  in
  let* path_start, path_stop =
    ptr_range ~what:"Path" ~pos:header.Header.path_ptr_pos ~width:8
  in
  let* title_start, title_stop =
    ptr_range ~what:"Title" ~pos:header.Header.title_ptr_pos ~width:4
  in
  (* NOTE(review): path pointers are stored 8 bytes apart but only the low
     32 bits of each are kept, because [path_ptr_list] is an [int32 list].
     Confirm against the format specification whether the full 64-bit value
     is needed. *)
  let path_ptr_list =
    read_ptr_list ~buf ~start_pos:path_start ~end_pos:path_stop ~offset:8
  in
  let title_ptr_list =
    read_ptr_list ~buf ~start_pos:title_start ~end_pos:title_stop ~offset:4
  in
  Ok
    {
      header
    ; mime_type_list
    ; path_ptr_list
    ; title_ptr_list
    ; cluster_ptr_list= []
    }
|
2
test/dune
Normal file
2
test/dune
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
(test
|
||||||
|
(name test_zim))
|
0
test/test_zim.ml
Normal file
0
test/test_zim.ml
Normal file
BIN
wiktionary_sample_file.zim
Normal file
BIN
wiktionary_sample_file.zim
Normal file
Binary file not shown.
33
zim.opam
Normal file
33
zim.opam
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
# This file is generated by dune, edit dune-project instead
|
||||||
|
opam-version: "2.0"
|
||||||
|
synopsis: "A short synopsis"
|
||||||
|
description: "A longer description"
|
||||||
|
maintainer: ["Maintainer Name <maintainer@example.com>"]
|
||||||
|
authors: ["Author Name <author@example.com>"]
|
||||||
|
license: "LICENSE"
|
||||||
|
tags: ["add topics" "to describe" "your" "project"]
|
||||||
|
homepage: "https://github.com/username/reponame"
|
||||||
|
doc: "https://url/to/documentation"
|
||||||
|
bug-reports: "https://github.com/username/reponame/issues"
|
||||||
|
depends: [
|
||||||
|
"dune" {>= "3.17"}
|
||||||
|
"ocaml"
|
||||||
|
"uuidm"
|
||||||
|
"cstruct"
|
||||||
|
"odoc" {with-doc}
|
||||||
|
]
|
||||||
|
build: [
|
||||||
|
["dune" "subst"] {dev}
|
||||||
|
[
|
||||||
|
"dune"
|
||||||
|
"build"
|
||||||
|
"-p"
|
||||||
|
name
|
||||||
|
"-j"
|
||||||
|
jobs
|
||||||
|
"@install"
|
||||||
|
"@runtest" {with-test}
|
||||||
|
"@doc" {with-doc}
|
||||||
|
]
|
||||||
|
]
|
||||||
|
dev-repo: "git+https://github.com/username/reponame.git"
|
Loading…
Reference in a new issue