uniquify urls in respect to sha256 checksums (#21)
this reduces 19139 urls down to 18563 urls
This commit is contained in:
parent
6a0ae2bcab
commit
da4533c2d3
1 changed files with 53 additions and 21 deletions
|
@ -9,6 +9,10 @@ module K = struct
|
||||||
let doc = Arg.info ~doc:"Only check the cache" ["check"] in
|
let doc = Arg.info ~doc:"Only check the cache" ["check"] in
|
||||||
Mirage_runtime.register_arg Arg.(value & flag doc)
|
Mirage_runtime.register_arg Arg.(value & flag doc)
|
||||||
|
|
||||||
|
let skip_download =
|
||||||
|
let doc = Arg.info ~doc:"Skip downloading archives" ["skip-download"] in
|
||||||
|
Mirage_runtime.register_arg Arg.(value & flag doc)
|
||||||
|
|
||||||
let upstream_caches =
|
let upstream_caches =
|
||||||
let doc = Arg.info ~doc:"Upstream caches (e.g. https://opam.ocaml.org/cache)" ["upstream-cache"] in
|
let doc = Arg.info ~doc:"Upstream caches (e.g. https://opam.ocaml.org/cache)" ["upstream-cache"] in
|
||||||
Mirage_runtime.register_arg Arg.(value & opt_all string [] doc)
|
Mirage_runtime.register_arg Arg.(value & opt_all string [] doc)
|
||||||
|
@ -136,6 +140,10 @@ module Make
|
||||||
in
|
in
|
||||||
Lwt_stream.from more
|
Lwt_stream.from more
|
||||||
|
|
||||||
|
let sha256s = Hashtbl.create 13
|
||||||
|
|
||||||
|
let empty () = Hashtbl.clear sha256s
|
||||||
|
|
||||||
let find_urls acc path data =
|
let find_urls acc path data =
|
||||||
if Mirage_kv.Key.basename path = "opam" then
|
if Mirage_kv.Key.basename path = "opam" then
|
||||||
let path = Mirage_kv.Key.to_string path in
|
let path = Mirage_kv.Key.to_string path in
|
||||||
|
@ -156,8 +164,25 @@ module Make
|
||||||
if HM.cardinal csums = 0 then
|
if HM.cardinal csums = 0 then
|
||||||
(add_parse_error path ("no checksums for " ^ url);
|
(add_parse_error path ("no checksums for " ^ url);
|
||||||
acc)
|
acc)
|
||||||
else
|
else begin
|
||||||
|
let url' =
|
||||||
|
match HM.find_opt `SHA256 csums with
|
||||||
|
| None -> url
|
||||||
|
| Some hash ->
|
||||||
|
match Hashtbl.find_opt sha256s hash with
|
||||||
|
| None -> Hashtbl.add sha256s hash url; url
|
||||||
|
| Some url' ->
|
||||||
|
if not (String.equal url url') then
|
||||||
|
Logs.warn (fun m -> m "same hash for url %s and %s" url url');
|
||||||
|
url'
|
||||||
|
in
|
||||||
let mirrors = SSet.of_list mirrors in
|
let mirrors = SSet.of_list mirrors in
|
||||||
|
let url, mirrors =
|
||||||
|
if String.equal url url' then
|
||||||
|
url, mirrors
|
||||||
|
else
|
||||||
|
url', SSet.add url mirrors
|
||||||
|
in
|
||||||
SM.update url (function
|
SM.update url (function
|
||||||
| None -> Some (csums, SSet.union mirrors (upstream csums))
|
| None -> Some (csums, SSet.union mirrors (upstream csums))
|
||||||
| Some (csums', mirrors') ->
|
| Some (csums', mirrors') ->
|
||||||
|
@ -173,7 +198,8 @@ module Make
|
||||||
"mismatching hashes for %s: %s vs %s"
|
"mismatching hashes for %s: %s vs %s"
|
||||||
url (hm_to_s csums') (hm_to_s csums));
|
url (hm_to_s csums') (hm_to_s csums));
|
||||||
None
|
None
|
||||||
end) acc) acc url_csums
|
end) acc
|
||||||
|
end) acc url_csums
|
||||||
else
|
else
|
||||||
acc
|
acc
|
||||||
|
|
||||||
|
@ -271,6 +297,7 @@ module Make
|
||||||
{ md5s = SM.empty ; sha512s = SM.empty ; checked = Some KS.empty ; dev; dev_md5s; dev_sha512s ; dev_swap }
|
{ md5s = SM.empty ; sha512s = SM.empty ; checked = Some KS.empty ; dev; dev_md5s; dev_sha512s ; dev_swap }
|
||||||
|
|
||||||
let add_checked t path =
|
let add_checked t path =
|
||||||
|
Logs.info (fun m -> m "add checked %a" Mirage_kv.Key.pp path);
|
||||||
match t.checked with
|
match t.checked with
|
||||||
| None -> ()
|
| None -> ()
|
||||||
| Some s -> t.checked <- Some (KS.add path s)
|
| Some s -> t.checked <- Some (KS.add path s)
|
||||||
|
@ -620,6 +647,7 @@ module Make
|
||||||
let mtime = Option.value ~default:0 Ptime.(Span.to_int_s (to_span now)) in
|
let mtime = Option.value ~default:0 Ptime.(Span.to_int_s (to_span now)) in
|
||||||
let urls = ref SM.empty in
|
let urls = ref SM.empty in
|
||||||
entries_of_git ~mtime store repo urls >>= fun entries ->
|
entries_of_git ~mtime store repo urls >>= fun entries ->
|
||||||
|
Git.empty ();
|
||||||
let t = Tar.out ~level:Ustar entries in
|
let t = Tar.out ~level:Ustar entries in
|
||||||
let t = Tar_gz.out_gzipped ~level:4 ~mtime:(Int32.of_int mtime) Gz.Unix t in
|
let t = Tar_gz.out_gzipped ~level:4 ~mtime:(Int32.of_int mtime) Gz.Unix t in
|
||||||
let buf = Buffer.create 1024 in
|
let buf = Buffer.create 1024 in
|
||||||
|
@ -944,6 +972,7 @@ stamp: %S
|
||||||
Lwt.return_unit
|
Lwt.return_unit
|
||||||
| false ->
|
| false ->
|
||||||
let rec download url mirrors =
|
let rec download url mirrors =
|
||||||
|
Logs.info (fun m -> m "downloading %s (%u mirrors)" url (SSet.cardinal mirrors));
|
||||||
let retry () =
|
let retry () =
|
||||||
if SSet.is_empty mirrors then begin
|
if SSet.is_empty mirrors then begin
|
||||||
decr remaining_downloads;
|
decr remaining_downloads;
|
||||||
|
@ -957,6 +986,7 @@ stamp: %S
|
||||||
in
|
in
|
||||||
let quux, body_init = Disk.init_write disk csums in
|
let quux, body_init = Disk.init_write disk csums in
|
||||||
add_to_active url (Ptime.v (Pclock.now_d_ps ()));
|
add_to_active url (Ptime.v (Pclock.now_d_ps ()));
|
||||||
|
if not (K.skip_download ()) then
|
||||||
Http_mirage_client.request http_client url (Disk.write_partial disk quux url) body_init >>= function
|
Http_mirage_client.request http_client url (Disk.write_partial disk quux url) body_init >>= function
|
||||||
| Ok (resp, r) ->
|
| Ok (resp, r) ->
|
||||||
begin match r with
|
begin match r with
|
||||||
|
@ -977,6 +1007,8 @@ stamp: %S
|
||||||
| Error me ->
|
| Error me ->
|
||||||
add_failed url (Ptime.v (Pclock.now_d_ps ())) (`Mimic me);
|
add_failed url (Ptime.v (Pclock.now_d_ps ())) (`Mimic me);
|
||||||
retry ()
|
retry ()
|
||||||
|
else
|
||||||
|
retry ()
|
||||||
in
|
in
|
||||||
download url mirrors)
|
download url mirrors)
|
||||||
(SM.bindings urls) >>= fun () ->
|
(SM.bindings urls) >>= fun () ->
|
||||||
|
|
Loading…
Reference in a new issue