initial commit
This commit is contained in:
commit
f4de52a85e
11 changed files with 346 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
.vscode
|
||||
_build
|
||||
_opam
|
13
.ocamlformat
Normal file
13
.ocamlformat
Normal file
|
@ -0,0 +1,13 @@
|
|||
version=0.27.0
|
||||
exp-grouping=preserve
|
||||
break-infix=wrap-or-vertical
|
||||
break-collection-expressions=wrap
|
||||
break-sequences=false
|
||||
break-infix-before-func=false
|
||||
dock-collection-brackets=true
|
||||
break-separators=before
|
||||
field-space=tight
|
||||
if-then-else=compact
|
||||
break-sequences=false
|
||||
sequence-blank-line=compact
|
||||
exp-grouping=preserve
|
4
bin/dune
Normal file
4
bin/dune
Normal file
|
@ -0,0 +1,4 @@
|
|||
; Command-line tool that prints the header and pointer lists of ZIM files.
; uuidm is listed explicitly because zim_header_inspect.ml calls
; Uuidm.to_string itself; relying on it leaking through the zim library only
; works while implicit transitive dependencies are enabled.
(executable
 (public_name zim_header_inspect)
 (name zim_header_inspect)
 (libraries zim cstruct cmdliner unix uuidm))
|
69
bin/zim_header_inspect.ml
Normal file
69
bin/zim_header_inspect.ml
Normal file
|
@ -0,0 +1,69 @@
|
|||
open Cmdliner
|
||||
|
||||
(** [print_zim_header header] writes every field of a [Zim.Header.t] to
    standard output, one tab-indented line per field. 32-bit fields are
    printed as unsigned ([%lu]) and 64-bit fields as unsigned ([%Lu]). *)
let print_zim_header zim =
  (* Destructure once so each print below names the field directly. *)
  let {
    Zim.Header.magic_number
  ; major_version
  ; minor_version
  ; uuid
  ; entry_count
  ; cluster_count
  ; path_ptr_pos
  ; title_ptr_pos
  ; cluster_ptr_pos
  ; mime_list_pos
  ; main_page
  ; layout_page
  ; checksum_pos
  } =
    zim
  in
  print_string "Zim Header:\n";
  Printf.printf "\tmagic number: %lu\n" magic_number;
  Printf.printf "\tmajor version: %d\n" major_version;
  Printf.printf "\tminor version: %d\n" minor_version;
  Printf.printf "\tuuid: %s\n" (Uuidm.to_string uuid);
  Printf.printf "\tentry count: %lu\n" entry_count;
  Printf.printf "\tcluster count: %lu\n" cluster_count;
  Printf.printf "\tpath pointer position: %Lu\n" path_ptr_pos;
  Printf.printf "\ttitle pointer position: %Lu\n" title_ptr_pos;
  Printf.printf "\tcluster pointer position: %Lu\n" cluster_ptr_pos;
  Printf.printf "\tmime list position: %Lu\n" mime_list_pos;
  Printf.printf "\tmain page: %lu\n" main_page;
  Printf.printf "\tlayout page: %lu\n" layout_page;
  Printf.printf "\tchecksum position: %Lu\n" checksum_pos
|
||||
|
||||
(** [print_int32_in_columns count lst] prints the [int32] values of [lst] to
    standard output as unsigned numbers right-aligned in 10-character cells,
    starting a new row after every [count] values and finishing with a
    newline. *)
let print_int32_in_columns count lst =
  let emit idx value =
    (* A row break goes before every [count]-th element except the first. *)
    let starts_new_row = idx > 0 && idx mod count = 0 in
    if starts_new_row then print_string "\n";
    Printf.printf "%10lu " value
  in
  List.iteri emit lst;
  print_string "\n"
|
||||
|
||||
(** [print_zim zim] dumps a parsed [Zim.t] to standard output: the header,
    then the MIME type list (one per line), then the path and title pointer
    lists laid out ten values per row. The cluster pointer list is not
    printed. *)
let print_zim zim =
  let {
    Zim.header
  ; mime_type_list
  ; path_ptr_list
  ; title_ptr_list
  ; cluster_ptr_list= _
  } =
    zim
  in
  print_zim_header header;
  print_string "MIME Type List:\n";
  List.iter (fun mime -> Printf.printf "\t%s\n" mime) mime_type_list;
  print_string "Path Pointer List:\n";
  print_int32_in_columns 10 path_ptr_list;
  print_string "Title Pointer List:\n";
  print_int32_in_columns 10 title_ptr_list
|
||||
|
||||
(** [really_input fd buf pos len] fills [buf] from index [pos] with exactly
    [len] bytes read from the descriptor [fd], issuing as many [Unix.read]
    calls as needed. Does nothing when [len <= 0].
    @raise End_of_file if the descriptor is exhausted before [len] bytes
    were read. *)
let really_input fd buf pos len =
  let cursor = ref pos in
  let missing = ref len in
  while !missing > 0 do
    match Unix.read fd buf !cursor !missing with
    | 0 -> raise End_of_file
    | got ->
        (* Short reads are normal: advance and ask for the rest. *)
        cursor := !cursor + got;
        missing := !missing - got
  done
|
||||
|
||||
(** [read_zims zims] loads each file named in [zims] fully into memory,
    parses it with [Zim.unmarshal] and prints the result with [print_zim].

    A file that fails to parse is reported on standard error as
    ["<path>: <reason>"] and processing continues with the next file.
    @raise Unix.Unix_error if a file cannot be opened or read.
    @raise End_of_file if a file shrinks while it is being read. *)
let read_zims zims =
  let inspect path =
    let fd = Unix.openfile path Unix.[ O_RDONLY; O_CLOEXEC ] 0 in
    let contents =
      (* Close the descriptor whether or not reading succeeds, so running
         over many files does not leak one descriptor per file. *)
      Fun.protect
        ~finally:(fun () -> Unix.close fd)
        (fun () ->
          (* fstat the descriptor we actually opened rather than stat-ing
             the path again, which could name a different file by now. *)
          let size = (Unix.fstat fd).Unix.st_size in
          let buf = Bytes.create size in
          really_input fd buf 0 size;
          buf)
    in
    match Zim.unmarshal (Cstruct.of_bytes contents) with
    | Ok zim -> print_zim zim
    | Error err -> Printf.eprintf "%s: %s\n" path err
  in
  List.iter inspect zims
|
||||
|
||||
(** Command-line term collecting every positional argument ([pos_all])
    through Cmdliner's [file] converter; [non_empty] makes at least one
    argument mandatory. Shown as ["zim files"] in the generated help. *)
let zims = Arg.(non_empty & pos_all file [] & info [] ~docv:"zim files")
|
||||
|
||||
(** The [zim_header_inspect] command: applies [read_zims] to the positional
    file arguments described by [zims]. *)
let cmd =
  let info =
    Cmd.info "zim_header_inspect" ~version:"1.0.0"
      ~doc:"Inspect the headers of one or more zim files."
  in
  let term = Term.(const read_zims $ zims) in
  Cmd.v info term
|
||||
|
||||
(** Evaluates [cmd] against the process arguments and exits with the status
    code returned by [Cmd.eval]. *)
let main () = Cmd.eval cmd |> exit

(* Program entry point. *)
let () = main ()
|
3
dune-project
Normal file
3
dune-project
Normal file
|
@ -0,0 +1,3 @@
|
|||
(lang dune 3.17)
|
||||
(name zim)
|
||||
(generate_opam_files true)
|
4
lib/dune
Normal file
4
lib/dune
Normal file
|
@ -0,0 +1,4 @@
|
|||
(library
|
||||
(public_name zim)
|
||||
(name zim)
|
||||
(libraries cstruct uuidm))
|
215
lib/zim.ml
Normal file
215
lib/zim.ml
Normal file
|
@ -0,0 +1,215 @@
|
|||
(*
|
||||
* Copyright (C) 2025 Pizie Dust
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*)
|
||||
|
||||
(** Reader for the ZIM archive file format. Format reference:
    https://wiki.openzim.org/wiki/ZIM_file_format *)
|
||||
|
||||
(** Binding operator for [result]: [let* x = r in body] evaluates [body]
    with [x] bound to the payload when [r] is [Ok _], and returns [r]'s
    [Error _] unchanged otherwise. *)
let ( let* ) outcome continue = Result.bind outcome continue
|
||||
|
||||
module Header = struct
  (** The fixed-size header read from the start of a ZIM buffer. Each field
      is decoded little-endian at the byte offset given by the matching
      [*_offset] constant below. *)
  type t = {
      magic_number: int32
        (** Magic number to recognise the file format, must be 72173914
            (0x44D495A) *)
    ; major_version: int  (** Unsigned 16-bit value. *)
    ; minor_version: int  (** Unsigned 16-bit value. *)
    ; uuid: Uuidm.t  (** Decoded from 16 bytes in mixed-endian layout. *)
    ; entry_count: int32
    ; cluster_count: int32
    ; path_ptr_pos: int64
    ; title_ptr_pos: int64
    ; cluster_ptr_pos: int64
    ; mime_list_pos: int64
    ; main_page: int32
    ; layout_page: int32
    ; checksum_pos: int64
  }

  let magic_number_offset = 0
  let major_version_offset = 4
  let minor_version_offset = 6
  let uuid_offset = 8
  let entry_count_offset = 24
  let cluster_count_offset = 28
  let path_ptr_pos_offset = 32
  let title_ptr_pos_offset = 40
  let cluster_ptr_pos_offset = 48
  let mime_list_pos_offset = 56
  let main_page_offset = 64
  let layout_page_offset = 68
  let checksum_pos_offset = 72

  (** The value [magic_number] must hold for the header to be accepted. *)
  let expected_magic_number = 72173914l

  (** Number of bytes [unmarshal] needs: the last field, [checksum_pos],
      starts at [checksum_pos_offset] and is 8 bytes wide. *)
  let size = checksum_pos_offset + 8

  (** [unmarshal buf] decodes the header stored in the first [size] bytes of
      [buf].

      Returns [Error reason] when [buf] is shorter than [size], when the
      magic number differs from [expected_magic_number], or when the UUID
      bytes are rejected by [Uuidm]; returns [Ok header] otherwise. *)
  let unmarshal buf =
    (* Check the length up front so a truncated file yields an [Error]
       instead of an [Invalid_argument] from the first out-of-range read. *)
    let* () =
      let available = Cstruct.length buf in
      if available >= size then Ok ()
      else
        Error
          (Printf.sprintf "Header is truncated: need %d bytes, got %d" size
             available)
    in
    let magic_number = Cstruct.LE.get_uint32 buf magic_number_offset in
    let* () =
      if Int32.equal magic_number expected_magic_number then Ok ()
      else
        (* [%lu] prints the field as the unsigned value stored in the file. *)
        Error (Printf.sprintf "Magic number is wrong, got '%lu'" magic_number)
    in
    let major_version = Cstruct.LE.get_uint16 buf major_version_offset in
    let minor_version = Cstruct.LE.get_uint16 buf minor_version_offset in
    let* uuid =
      Cstruct.sub buf uuid_offset 16
      |> Cstruct.to_string
      |> Uuidm.of_mixed_endian_binary_string
      |> Option.to_result ~none:"UUID field is not a valid 16-byte UUID"
    in
    let entry_count = Cstruct.LE.get_uint32 buf entry_count_offset in
    let cluster_count = Cstruct.LE.get_uint32 buf cluster_count_offset in
    let path_ptr_pos = Cstruct.LE.get_uint64 buf path_ptr_pos_offset in
    let title_ptr_pos = Cstruct.LE.get_uint64 buf title_ptr_pos_offset in
    let cluster_ptr_pos = Cstruct.LE.get_uint64 buf cluster_ptr_pos_offset in
    let mime_list_pos = Cstruct.LE.get_uint64 buf mime_list_pos_offset in
    let main_page = Cstruct.LE.get_uint32 buf main_page_offset in
    let layout_page = Cstruct.LE.get_uint32 buf layout_page_offset in
    let checksum_pos = Cstruct.LE.get_uint64 buf checksum_pos_offset in
    Ok
      {
        magic_number
      ; major_version
      ; minor_version
      ; uuid
      ; entry_count
      ; cluster_count
      ; path_ptr_pos
      ; title_ptr_pos
      ; cluster_ptr_pos
      ; mime_list_pos
      ; main_page
      ; layout_page
      ; checksum_pos
      }
end
|
||||
|
||||
module Directory = struct
  (** Fields common to content and redirect directory entries. *)
  type entry = {
    mime_type: int;
    parameter_len: int;
    namespace: char;
    revision: int32;
    path: string;
    title: string;
  }

  (** An entry whose data lives in blob [blob_number] of cluster
      [cluster_number]. *)
  type content_entry = {
    entry: entry;
    cluster_number: int32;
    blob_number: int32;
  }

  (** An entry that forwards to the entry at [redirect_index]. *)
  type redirect_entry = {
    entry: entry;
    redirect_index: int32;
  }

  type t =
    | Content of content_entry
    | Redirect of redirect_entry
    | LinkTarget
    | DeletedEntry

  let mime_type_offset = 0
  let parameter_len_offset = 2
  let namespace_offset = 3
  let revision_offset = 4
  let cluster_number_offset = 8
  let blob_number_offset = 12
  let path_offset = 16
  let redirect_index_offset = 8

  (** Redirect entries carry a single 4-byte redirect index after the
      revision (no cluster/blob pair), so their path starts 4 bytes earlier
      than in content entries. *)
  let redirect_path_offset = 12

  (** [parse_null_terminated_string buf offset] returns the bytes of [buf]
      from [offset] up to, but not including, the next zero byte.
      @raise Invalid_argument if no zero byte is found before the end of
      [buf] (the out-of-range read is rejected by [Cstruct]). *)
  let parse_null_terminated_string buf offset =
    let rec find_null i =
      if Cstruct.get_uint8 buf i = 0 then i else find_null (i + 1)
    in
    let null_pos = find_null offset in
    Cstruct.to_string (Cstruct.sub buf offset (null_pos - offset))

  (** [unmarshal_exn buf] decodes the directory entry that starts at the
      beginning of [buf]. The 16-bit MIME type field selects the variant:
      [0xFFFD] is a deleted entry, [0xFFFE] a link target, [0xFFFF] a
      redirect, anything else a content entry.
      @raise Invalid_argument if the entry runs past the end of [buf]. *)
  let unmarshal_exn buf =
    match Cstruct.LE.get_uint16 buf mime_type_offset with
    | 0xFFFD -> DeletedEntry
    | 0xFFFE -> LinkTarget
    | mime_type ->
        let is_redirect = mime_type = 0xFFFF in
        let parameter_len = Cstruct.get_uint8 buf parameter_len_offset in
        let namespace = Cstruct.get_char buf namespace_offset in
        let revision = Cstruct.LE.get_uint32 buf revision_offset in
        (* The two layouts diverge after the revision field, so the strings
           start at a variant-specific offset. *)
        let path_start =
          if is_redirect then redirect_path_offset else path_offset
        in
        let path = parse_null_terminated_string buf path_start in
        (* The title follows the path's terminating zero byte. *)
        let title_start = path_start + String.length path + 1 in
        let title = parse_null_terminated_string buf title_start in
        let entry =
          { mime_type; parameter_len; namespace; revision; path; title }
        in
        if is_redirect then
          let redirect_index =
            Cstruct.LE.get_uint32 buf redirect_index_offset
          in
          Redirect { entry; redirect_index }
        else
          let cluster_number =
            Cstruct.LE.get_uint32 buf cluster_number_offset
          in
          let blob_number = Cstruct.LE.get_uint32 buf blob_number_offset in
          Content { entry; cluster_number; blob_number }

  (** [unmarshal buf] is [unmarshal_exn buf] with truncated input reported
      as [Error reason] instead of an exception. *)
  let unmarshal buf =
    match unmarshal_exn buf with
    | parsed -> Ok parsed
    | exception Invalid_argument reason ->
        Error (Printf.sprintf "Directory entry is truncated: %s" reason)
end
|
||||
|
||||
(** The parts of a ZIM archive that [unmarshal] decodes. *)
type t = {
    header: Header.t  (** The decoded file header. *)
  ; mime_type_list: string list
        (** Strings read from the list located at [header.mime_list_pos]. *)
  ; path_ptr_list: int32 list
        (** 32-bit values read from the list located at
            [header.path_ptr_pos]. *)
  ; title_ptr_list: int32 list
        (** 32-bit values read from the list located at
            [header.title_ptr_pos]. *)
  ; cluster_ptr_list: int32 list
        (** Reserved for the cluster pointer list. *)
}
|
||||
|
||||
(** [read_zero_terminated_list buf start_pos] reads consecutive
    zero-terminated strings from [buf], beginning at byte [start_pos], and
    returns them in file order. Reading stops at the first empty string or
    at the end of [buf]; a trailing string with no terminating zero byte is
    dropped. A [start_pos] at or beyond the end of [buf] yields [[]].

    The buffer is scanned in place and only the bytes of each string are
    copied, so the cost is linear in the size of the list rather than in
    the size of the rest of the buffer per element.
    @raise Invalid_argument if [start_pos] converts to a negative index. *)
let read_zero_terminated_list buf start_pos =
  let limit = Cstruct.length buf in
  (* Index of the first zero byte at or after [i], if any. *)
  let rec find_nul i =
    if i >= limit then None
    else if Cstruct.get_uint8 buf i = 0 then Some i
    else find_nul (i + 1)
  in
  let rec collect acc pos =
    match find_nul pos with
    | None -> List.rev acc
    | Some nul ->
        (* A zero byte right at [pos] is the empty string ending the list. *)
        if nul = pos then List.rev acc
        else
          let item = Cstruct.to_string ~off:pos ~len:(nul - pos) buf in
          collect (item :: acc) (nul + 1)
  in
  let first = Int64.to_int start_pos in
  if first < 0 then
    invalid_arg "read_zero_terminated_list: negative start position"
  else collect [] first
|
||||
|
||||
(** [read_ptr_list ~buf ~start_pos ~end_pos ~offset] reads a little-endian
    unsigned 32-bit value from [buf] at [start_pos], [start_pos + offset],
    [start_pos + 2 * offset], ... for as long as the position is below
    [end_pos], and returns the values in reading order. [offset] is the
    distance between consecutive reads, not the width of a value: only 4
    bytes are decoded at each position. Returns [[]] when
    [start_pos >= end_pos].
    @raise Invalid_argument if a read falls outside [buf]. *)
let read_ptr_list ~buf ~start_pos ~end_pos ~offset =
  let collected = ref [] in
  let cursor = ref start_pos in
  while !cursor < end_pos do
    collected := Cstruct.LE.get_uint32 buf !cursor :: !collected;
    cursor := !cursor + offset
  done;
  List.rev !collected
|
||||
|
||||
(** [unmarshal buf] decodes a whole ZIM file held in [buf]: the header, the
    MIME type list, and the path and title pointer lists.

    Both pointer lists are sized from [header.entry_count] (one slot per
    entry, 8 bytes wide for paths and 4 bytes wide for titles) instead of
    assuming that the next table starts immediately after the previous one.

    Returns [Error reason] when the header is rejected or when a list would
    lie outside [buf].

    NOTE(review): [path_ptr_list] keeps only the low 32 bits of each 8-byte
    slot because the record stores [int32] values; widening the field would
    change the public type. [cluster_ptr_list] is not decoded yet and is
    always [[]]. *)
let unmarshal buf =
  let* header = Header.unmarshal buf in
  let* entry_count =
    Int32.unsigned_to_int header.Header.entry_count
    |> Option.to_result ~none:"Entry count does not fit in a native int"
  in
  (* Reads [entry_count] slots of [width] bytes starting at [pos], after
     checking that the whole range lies inside the buffer. *)
  let ptr_list ~name ~pos ~width =
    let start_pos = Int64.to_int pos in
    let end_pos = start_pos + (entry_count * width) in
    if start_pos < 0 || end_pos < start_pos || end_pos > Cstruct.length buf
    then Error (Printf.sprintf "%s pointer list lies outside the file" name)
    else Ok (read_ptr_list ~buf ~start_pos ~end_pos ~offset:width)
  in
  let* mime_type_list =
    match read_zero_terminated_list buf header.Header.mime_list_pos with
    | mime_types -> Ok mime_types
    | exception Invalid_argument _ ->
        Error "MIME type list lies outside the file"
  in
  let* path_ptr_list =
    ptr_list ~name:"Path" ~pos:header.Header.path_ptr_pos ~width:8
  in
  let* title_ptr_list =
    ptr_list ~name:"Title" ~pos:header.Header.title_ptr_pos ~width:4
  in
  Ok
    {
      header
    ; mime_type_list
    ; path_ptr_list
    ; title_ptr_list
    ; cluster_ptr_list= []
    }
|
2
test/dune
Normal file
2
test/dune
Normal file
|
@ -0,0 +1,2 @@
|
|||
(test
|
||||
(name test_zim))
|
0
test/test_zim.ml
Normal file
0
test/test_zim.ml
Normal file
BIN
wiktionary_sample_file.zim
Normal file
BIN
wiktionary_sample_file.zim
Normal file
Binary file not shown.
33
zim.opam
Normal file
33
zim.opam
Normal file
|
@ -0,0 +1,33 @@
|
|||
# This file is generated by dune, edit dune-project instead
|
||||
opam-version: "2.0"
|
||||
synopsis: "A short synopsis"
|
||||
description: "A longer description"
|
||||
maintainer: ["Maintainer Name <maintainer@example.com>"]
|
||||
authors: ["Author Name <author@example.com>"]
|
||||
license: "LICENSE"
|
||||
tags: ["add topics" "to describe" "your" "project"]
|
||||
homepage: "https://github.com/username/reponame"
|
||||
doc: "https://url/to/documentation"
|
||||
bug-reports: "https://github.com/username/reponame/issues"
|
||||
depends: [
|
||||
"dune" {>= "3.17"}
|
||||
"ocaml"
|
||||
"uuidm"
|
||||
"cstruct"
|
||||
"odoc" {with-doc}
|
||||
]
|
||||
build: [
|
||||
["dune" "subst"] {dev}
|
||||
[
|
||||
"dune"
|
||||
"build"
|
||||
"-p"
|
||||
name
|
||||
"-j"
|
||||
jobs
|
||||
"@install"
|
||||
"@runtest" {with-test}
|
||||
"@doc" {with-doc}
|
||||
]
|
||||
]
|
||||
dev-repo: "git+https://github.com/username/reponame.git"
|
Loading…
Reference in a new issue