From 74ccedb76ab113b89ed655480aff85e9f2b689f7 Mon Sep 17 00:00:00 2001 From: nhz2 Date: Mon, 17 Jul 2023 16:34:00 -0400 Subject: [PATCH 1/2] work --- src/filename-checks.jl | 49 +++++++++++++++++++++++++++++++++--- src/types.jl | 4 ++- src/writer.jl | 3 +-- test/test_filename-checks.jl | 19 ++++++++++++++ 4 files changed, 69 insertions(+), 6 deletions(-) create mode 100644 test/test_filename-checks.jl diff --git a/src/filename-checks.jl b/src/filename-checks.jl index 978b7ad..c6fc0a8 100644 --- a/src/filename-checks.jl +++ b/src/filename-checks.jl @@ -4,6 +4,7 @@ function basic_name_check(name::String)::Nothing @argcheck !isempty(name) @argcheck isvalid(name) @argcheck !startswith(name, "/") + @argcheck !contains(name, "//") @argcheck !contains(name, '\0') @argcheck !contains(name, '\\') @argcheck !contains(name, ':') @@ -28,12 +29,54 @@ function basic_name_check(name::String)::Nothing end function norm_name(name::AbstractString)::String - lowercase(join( + s1 = join( split( name, '/'; keepempty=false ), '/' - )) -end \ No newline at end of file + ) + if endswith(name, '/') + s1*'/' + else + s1 + end +end + + +# name is expected to be already normed and basic checked. +function check_name_used(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Bool + # basic_name_check(name) + # @assert norm_name(name) == name + @argcheck name ∉ used_names_lower + if !endswith(name, '/') + @argcheck name ∉ used_dir_names_lower + end + # also any of the parent directories implied by name must not be files. + pos::Int = 1 + while true + i = findnext('/', name, pos) + isnothing(i) && break + parent_name = @view(name[begin:i-1]) + @show parent_name + @argcheck parent_name ∉ used_names_lower + pos = i+1 + end +end + +# name is expected to be already normed, but might not be checked. +function add_name_used!(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Nothing + # @assert norm_name(name) == name + push!(name, used_names_lower) + # also any of the parent directories implied by name must added to used_dir_names_lower. + pos::Int = 1 + while true + i = findnext('/', name, pos) + isnothing(i) && break + parent_name = name[begin:i-1] + @show parent_name + push!(used_dir_names_lower, parent_name) + pos = i+1 + end +end diff --git a/src/types.jl b/src/types.jl index 5fddec5..73b9005 100644 --- a/src/types.jl +++ b/src/types.jl @@ -96,7 +96,8 @@ mutable struct ZipWriter{S<:IO} <: IO partial_entry::Union{Nothing, PartialEntry} closed::Bool force_zip64::Bool - used_names_lower::Set{String} + used_names::Set{String} + used_dir_names::Set{String} check_names::Bool transcoder::Union{Nothing, NoopStream{S}, DeflateCompressorStream{S}} function ZipWriter(io::IO; @@ -113,6 +114,7 @@ mutable struct ZipWriter{S<:IO} <: IO false, force_zip64, Set{String}(), + Set{String}(), check_names, nothing, ) diff --git a/src/writer.jl b/src/writer.jl index 5f9761f..cbe0161 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -440,8 +440,7 @@ end Return true if `new_name` matches an existing committed entry. -The check is case insensitive, and -insensitive to leading, trailing, and repeated `/`. +The check is insensitive to leading and repeated `/`. """ zip_name_collision(w::ZipWriter, new_name::AbstractString)::Bool = zip_name_collision(w, String(new_name)) function zip_name_collision(w::ZipWriter, new_name::String)::Bool diff --git a/test/test_filename-checks.jl b/test/test_filename-checks.jl new file mode 100644 index 0000000..788689d --- /dev/null +++ b/test/test_filename-checks.jl @@ -0,0 +1,19 @@ +using ZipArchives +using Test +using Random + +@testset "norm_name unit tests" begin + @test ZipArchives.norm_name("") == "" + @test ZipArchives.norm_name("/") == "/" + @test ZipArchives.norm_name("//") == "/" + @test ZipArchives.norm_name("a") == "a" + @test ZipArchives.norm_name("a/") == "a/" + @test ZipArchives.norm_name("a//") == "a/" + @test ZipArchives.norm_name("AaAa") == "AaAa" + @test ZipArchives.norm_name("a/b") == "a/b" + @test ZipArchives.norm_name("a//b") == "a/b" + # make sure norm_name doesn't error with random input + for i in 1:10000 + ZipArchives.norm_name(String(rand(UInt8, rand(1:10)))) + end +end \ No newline at end of file From cb3e3a27ca72d9832d0f076d09a9e549d5371723 Mon Sep 17 00:00:00 2001 From: nhz2 Date: Tue, 18 Jul 2023 10:23:29 -0400 Subject: [PATCH 2/2] remove name normalization checks --- Project.toml | 2 +- src/filename-checks.jl | 69 +++++++++++++++++++----------------- src/types.jl | 15 ++++++-- src/writer.jl | 28 ++++++--------- test/runtests.jl | 1 + test/test_filename-checks.jl | 43 +++++++++++++++------- test/test_show.jl | 8 ++--- test/test_writer.jl | 13 +++---- 8 files changed, 103 insertions(+), 76 deletions(-) diff --git a/Project.toml b/Project.toml index d1e4b9d..e837739 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ZipArchives" uuid = "49080126-0e18-4c2a-b176-c102e4b3760c" authors = ["nhz2 "] -version = "0.4.6" +version = "0.5.0" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" diff --git a/src/filename-checks.jl b/src/filename-checks.jl index c6fc0a8..49d1d77 100644 --- a/src/filename-checks.jl +++ b/src/filename-checks.jl @@ -28,55 +28,58 @@ function basic_name_check(name::String)::Nothing end end -function norm_name(name::AbstractString)::String - s1 = join( - split( - name, - '/'; - keepempty=false - ), - '/' - ) - if endswith(name, '/') - s1*'/' - else - s1 - end -end - - -# name is expected to be already normed and basic checked. -function check_name_used(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Bool +# The zip format seems to allow an empty base file name at the top level. +# Other base file name cannot be empty. +# Any directory name can be just a "/". +# For example "a///b.txt" is the following path ("a/", "/", "/", "b.txt") +# But there is no path for a file ("a/", ""), +# because the path "a/" is an empty directory +# or a place to store metadata about "a/" +# Then on extract on windows the empty "/" turn into "_". +# that is why I check !contains(name, "//") +# Any entry name that ends in a "/" is always interpreted as +# a directory entry. +# used_stripped_dir_names are the path to all directories with the trailing "/" removed +function check_name_used(name::String, used_names::Set{String}, used_stripped_dir_names::Set{String})::Nothing # basic_name_check(name) - # @assert norm_name(name) == name - @argcheck name ∉ used_names_lower + data = codeunits(name) + @argcheck name ∉ used_names if !endswith(name, '/') - @argcheck name ∉ used_dir_names_lower + @argcheck name ∉ used_stripped_dir_names end # also any of the parent directories implied by name must not be files. pos::Int = 1 while true i = findnext('/', name, pos) isnothing(i) && break - parent_name = @view(name[begin:i-1]) - @show parent_name - @argcheck parent_name ∉ used_names_lower + parent_data = @view(data[begin:i-1]) + # need to rstrip because of repeated '/' + if isempty(parent_data) || parent_data[end] != UInt8('/') + # this effectively rstrips all '/' + parent_name = String(parent_data) + # @show parent_name + @argcheck parent_name ∉ used_names + end pos = i+1 end end -# name is expected to be already normed, but might not be checked. -function add_name_used!(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Nothing - # @assert norm_name(name) == name - push!(name, used_names_lower) - # also any of the parent directories implied by name must added to used_dir_names_lower. +function add_name_used!(name::String, used_names::Set{String}, used_stripped_dir_names::Set{String})::Nothing + data = codeunits(name) + push!(used_names, name) + # also any of the parent directories implied by name must added to used_stripped_dir_names. pos::Int = 1 while true i = findnext('/', name, pos) isnothing(i) && break - parent_name = name[begin:i-1] - @show parent_name - push!(used_dir_names_lower, parent_name) + parent_data = @view(data[begin:i-1]) + # need to rstrip because of repeated '/' + if isempty(parent_data) || parent_data[end] != UInt8('/') + # this effectively rstrips all '/' + parent_name = String(parent_data) + # @show parent_name + push!(used_stripped_dir_names, parent_name) + end pos = i+1 end end diff --git a/src/types.jl b/src/types.jl index 73b9005..8be8a7b 100644 --- a/src/types.jl +++ b/src/types.jl @@ -72,8 +72,6 @@ end Base.@kwdef mutable struct PartialEntry name::String - "lowercase normalized name used to check for name collisions" - normed_name::Union{Nothing, String} comment::String = "" external_attrs::UInt32 = UInt32(0o0100644)<<16 # external file attributes: https://unix.stackexchange.com/questions/14705/the-zip-formats-external-file-attribute method::UInt16 = Store # compression method @@ -96,8 +94,19 @@ mutable struct ZipWriter{S<:IO} <: IO partial_entry::Union{Nothing, PartialEntry} closed::Bool force_zip64::Bool + + "If checking names, this contains all committed entry names, unmodified." used_names::Set{String} - used_dir_names::Set{String} + + """ + If checking names, this contains all implicit and explicit directory paths with all trailing '/' + removed. for example, the "/a.txt" entry name creates an explicit directory "/" + Which will be stored here as "" + + This is because in zip land there is no concept of "root", + and directories are allowed to be empty strings. + """ + used_stripped_dir_names::Set{String} check_names::Bool transcoder::Union{Nothing, NoopStream{S}, DeflateCompressorStream{S}} function ZipWriter(io::IO; diff --git a/src/writer.jl b/src/writer.jl index cbe0161..54315b7 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -104,7 +104,9 @@ function zip_append_archive(io::IO; trunc_footer=true, zip_kwargs=(;))::ZipWrite w.entries = entries w.central_dir_buffer = central_dir_buffer if w.check_names - w.used_names_lower = Set{String}(norm_name(String(view(central_dir_buffer, e.name_range))) for e in entries) + for e in entries + add_name_used!(String(view(central_dir_buffer, e.name_range)), w.used_names, w.used_stripped_dir_names) + end end w catch # close io if there is an error parsing entries @@ -182,18 +184,15 @@ function zip_newfile(w::ZipWriter, name::AbstractString; namestr::String = String(name) @argcheck ncodeunits(namestr) ≤ typemax(UInt16) @argcheck ncodeunits(comment) ≤ typemax(UInt16) - normed_name = nothing if w.check_names basic_name_check(namestr) @argcheck !isnothing(external_attrs) || !endswith(namestr, "/") - normed_name = norm_name(namestr) - @argcheck normed_name ∉ w.used_names_lower + check_name_used(namestr, w.used_names, w.used_stripped_dir_names) end @assert !iswritable(w) io = w._io pe = PartialEntry(; name=namestr, - normed_name, comment, w.force_zip64, offset=0, # place holder offset @@ -337,7 +336,7 @@ function zip_commitfile(w::ZipWriter) end entry = append_entry!(w.central_dir_buffer, pe) if w.check_names - push!(w.used_names_lower, something(pe.normed_name)) + add_name_used!(pe.name, w.used_names, w.used_stripped_dir_names) end push!(w.entries, entry) end @@ -394,19 +393,16 @@ function zip_writefile(w::ZipWriter, name::AbstractString, data::AbstractVector{ namestr::String = String(name) @argcheck ncodeunits(namestr) ≤ typemax(UInt16) @argcheck ncodeunits(comment) ≤ typemax(UInt16) - normed_name=nothing if w.check_names basic_name_check(namestr) @argcheck !isnothing(external_attrs) || !endswith(namestr, "/") - normed_name = norm_name(namestr) - @argcheck normed_name ∉ w.used_names_lower + check_name_used(namestr, w.used_names, w.used_stripped_dir_names) end @assert !iswritable(w) io = w._io crc32 = zip_crc32(data) pe = PartialEntry(; name=namestr, - normed_name, comment, offset=0,# place holder offset w.force_zip64, @@ -429,7 +425,7 @@ function zip_writefile(w::ZipWriter, name::AbstractString, data::AbstractVector{ @assert !iswritable(w) entry = append_entry!(w.central_dir_buffer, pe) if w.check_names - push!(w.used_names_lower, something(pe.normed_name)) + add_name_used!(namestr, w.used_names, w.used_stripped_dir_names) end push!(w.entries, entry) nothing @@ -438,18 +434,16 @@ end """ zip_name_collision(w::ZipWriter, new_name::AbstractString)::Bool -Return true if `new_name` matches an existing committed entry. - -The check is insensitive to leading and repeated `/`. +Return true if `new_name` exactly matches an existing committed entry name. """ zip_name_collision(w::ZipWriter, new_name::AbstractString)::Bool = zip_name_collision(w, String(new_name)) function zip_name_collision(w::ZipWriter, new_name::String)::Bool if w.check_names - norm_name(new_name) ∈ w.used_names_lower + new_name ∈ w.used_names else - nname = norm_name(new_name) + data = codeunits(new_name) any(eachindex(w.entries)) do i - nname == norm_name(String(_name_view(w, i))) + data == _name_view(w, i) end end end diff --git a/test/runtests.jl b/test/runtests.jl index f0395e9..31d3a56 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,6 +11,7 @@ Random.seed!(1234) # @test Any[] == detect_ambiguities(Base, Core, ZipArchives) include("test_simple-usage.jl") +include("test_filename-checks.jl") include("test_show.jl") include("test_writer.jl") include("test_reader.jl") diff --git a/test/test_filename-checks.jl b/test/test_filename-checks.jl index 788689d..20e17b4 100644 --- a/test/test_filename-checks.jl +++ b/test/test_filename-checks.jl @@ -2,18 +2,37 @@ using ZipArchives using Test using Random -@testset "norm_name unit tests" begin - @test ZipArchives.norm_name("") == "" - @test ZipArchives.norm_name("/") == "/" - @test ZipArchives.norm_name("//") == "/" - @test ZipArchives.norm_name("a") == "a" - @test ZipArchives.norm_name("a/") == "a/" - @test ZipArchives.norm_name("a//") == "a/" - @test ZipArchives.norm_name("AaAa") == "AaAa" - @test ZipArchives.norm_name("a/b") == "a/b" - @test ZipArchives.norm_name("a//b") == "a/b" - # make sure norm_name doesn't error with random input +@testset "check_name_used unit tests" begin + # make sure check_name_used doesn't error with random input + for i in 1:100000 + name = String(rand(UInt8, rand(1:10))) + ZipArchives.check_name_used(name, Set{String}(), Set{String}()) + end + used_names = Set{String}() + used_dirs = Set{String}() for i in 1:10000 - ZipArchives.norm_name(String(rand(UInt8, rand(1:10)))) + name = String(rand(UInt8, rand(1:10))) + ZipArchives.add_name_used!(name, used_names, used_dirs) + end + for i in 1:1000 + name = String(rand(UInt8, rand(1:10))) + used_names = Set{String}() + used_dirs = Set{String}() + ZipArchives.add_name_used!(name, used_names, used_dirs) + @test_throws ArgumentError ZipArchives.check_name_used(name, used_names, used_dirs) + if !endswith(name, "/") + @test_throws ArgumentError ZipArchives.check_name_used(name*"/", used_names, used_dirs) + end end + + used_names = Set{String}() + used_dirs = Set{String}() + ZipArchives.add_name_used!("", used_names, used_dirs) + @test_throws ArgumentError ZipArchives.check_name_used("/a.txt", used_names, used_dirs) + ZipArchives.check_name_used("a.txt", used_names, used_dirs) + ZipArchives.add_name_used!("a/b/c.txt", used_names, used_dirs) + @test_throws ArgumentError ZipArchives.check_name_used("a/b", used_names, used_dirs) + ZipArchives.check_name_used("a/b/", used_names, used_dirs) + ZipArchives.add_name_used!("a//c.txt", used_names, used_dirs) + ZipArchives.check_name_used("a/c.txt", used_names, used_dirs) end \ No newline at end of file diff --git a/test/test_show.jl b/test/test_show.jl index 5e0819b..092de43 100644 --- a/test/test_show.jl +++ b/test/test_show.jl @@ -21,18 +21,18 @@ using Test io = IOBuffer() ZipWriter(io) do w zip_writefile(w, "test", b"data") - zip_writefile(w, "test/foo", b"data") + zip_writefile(w, "testdir/foo", b"data") end data = take!(io) r = ZipBufferReader(data) @test repr(r) == "ZipArchives.ZipBufferReader($(data))" @test sprint(io->(show(io, MIME"text/plain"(), r))) == """ - 246 byte, 2 entry ZipBufferReader{Vector{UInt8}} + 252 byte, 2 entry ZipBufferReader{Vector{UInt8}} total uncompressed size: 8 bytes "test" - \"test/\"""" + \"testdir/\"""" @test sprint(io->(show(IOContext(io, :displaysize => (3, 80)), MIME"text/plain"(), r))) == """ - 246 byte, 2 entry ZipBufferReader{Vector{UInt8}} + 252 byte, 2 entry ZipBufferReader{Vector{UInt8}} total uncompressed size: 8 bytes ⋮""" end diff --git a/test/test_writer.jl b/test/test_writer.jl index 04a0a3e..e1bb27e 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -56,11 +56,12 @@ ZipWriter(joinpath(tmp, "utf8.zip")) do w @test !zip_name_collision(w, "test2") @test !zip_name_collision(w, SubString("test2", 1:2)) @test !zip_name_collision(w, "test2.txt/") + @test !zip_name_collision(w, "test2.txt") zip_commitfile(w) @test !zip_name_collision(w, "test2") - @test zip_name_collision(w, "test2.txt/") + @test !zip_name_collision(w, "test2.txt/") @test zip_name_collision(w, "test2.txt") - @test zip_name_collision(w, "Test2.txt") + @test !zip_name_collision(w, "Test2.txt") @test zip_name_collision(w, "🐨.txt") end @@ -170,11 +171,11 @@ end io = IOBuffer() w = ZipWriter(io; check_names=false) zip_mkdir(w, "empty_dir") - @test zip_name_collision(w, "empty_dir") + @test !zip_name_collision(w, "empty_dir") @test zip_name_collision(w, "empty_dir/") - @test zip_name_collision(w, "empty_dir//") - @test zip_name_collision(w, "/empty_dir") - @test zip_name_collision(w, "/empty_Dir/") + @test !zip_name_collision(w, "empty_dir//") + @test !zip_name_collision(w, "/empty_dir") + @test !zip_name_collision(w, "/empty_Dir/") @test !zip_name_collision(w, "foo/empty_dir") # Adding symlinks requires check_names=false ZipArchives.zip_symlink(w, "this is a invalid target", "symlink_entry")