From 74ccedb76ab113b89ed655480aff85e9f2b689f7 Mon Sep 17 00:00:00 2001
From: nhz2 <nhz2@cornell.edu>
Date: Mon, 17 Jul 2023 16:34:00 -0400
Subject: [PATCH 1/2] work

---
 src/filename-checks.jl       | 49 +++++++++++++++++++++++++++++++++---
 src/types.jl                 |  4 ++-
 src/writer.jl                |  3 +--
 test/test_filename-checks.jl | 19 ++++++++++++++
 4 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 test/test_filename-checks.jl

diff --git a/src/filename-checks.jl b/src/filename-checks.jl
index 978b7ad..c6fc0a8 100644
--- a/src/filename-checks.jl
+++ b/src/filename-checks.jl
@@ -4,6 +4,7 @@ function basic_name_check(name::String)::Nothing
     @argcheck !isempty(name)
     @argcheck isvalid(name)
     @argcheck !startswith(name, "/")
+    @argcheck !contains(name, "//")
     @argcheck !contains(name, '\0')
     @argcheck !contains(name, '\\')
     @argcheck !contains(name, ':')
@@ -28,12 +29,54 @@ function basic_name_check(name::String)::Nothing
 end
 
 function norm_name(name::AbstractString)::String
-    lowercase(join(
+    s1 = join(
         split(
             name,
             '/';
             keepempty=false
         ),
         '/'
-    ))
-end
\ No newline at end of file
+    )
+    if endswith(name, '/')
+        s1*'/'
+    else
+        s1
+    end
+end
+
+
+# name is expected to be already normed and basic checked.
+function check_name_used(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Bool
+    # basic_name_check(name)
+    # @assert norm_name(name) == name
+    @argcheck name ∉ used_names_lower
+    if !endswith(name, '/')
+        @argcheck name ∉ used_dir_names_lower
+    end
+    # also any of the parent directories implied by name must not be files.
+    pos::Int = 1
+    while true
+        i = findnext('/', name, pos)
+        isnothing(i) && break
+        parent_name = @view(name[begin:i-1])
+        @show parent_name
+        @argcheck parent_name ∉ used_names_lower
+        pos = i+1
+    end
+end
+
+# name is expected to be already normed, but might not be checked.
+function add_name_used!(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Nothing
+    # @assert norm_name(name) == name
+    push!(name, used_names_lower)
+    # also any of the parent directories implied by name must added to used_dir_names_lower.
+    pos::Int = 1
+    while true
+        i = findnext('/', name, pos)
+        isnothing(i) && break
+        parent_name = name[begin:i-1]
+        @show parent_name
+        push!(used_dir_names_lower, parent_name)
+        pos = i+1
+    end
+end
diff --git a/src/types.jl b/src/types.jl
index 5fddec5..73b9005 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -96,7 +96,8 @@ mutable struct ZipWriter{S<:IO} <: IO
     partial_entry::Union{Nothing, PartialEntry}
     closed::Bool
     force_zip64::Bool
-    used_names_lower::Set{String}
+    used_names::Set{String}
+    used_dir_names::Set{String}
     check_names::Bool
     transcoder::Union{Nothing, NoopStream{S}, DeflateCompressorStream{S}}
     function ZipWriter(io::IO;
@@ -113,6 +114,7 @@ mutable struct ZipWriter{S<:IO} <: IO
             false,
             force_zip64,
             Set{String}(),
+            Set{String}(),
             check_names,
             nothing,
         )
diff --git a/src/writer.jl b/src/writer.jl
index 5f9761f..cbe0161 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -440,8 +440,7 @@ end
 
 Return true if `new_name` matches an existing committed entry.
 
-The check is case insensitive, and 
-insensitive to leading, trailing, and repeated `/`.
+The check is insensitive to leading and repeated `/`.
 """
 zip_name_collision(w::ZipWriter, new_name::AbstractString)::Bool = zip_name_collision(w, String(new_name))
 function zip_name_collision(w::ZipWriter, new_name::String)::Bool
diff --git a/test/test_filename-checks.jl b/test/test_filename-checks.jl
new file mode 100644
index 0000000..788689d
--- /dev/null
+++ b/test/test_filename-checks.jl
@@ -0,0 +1,19 @@
+using ZipArchives
+using Test
+using Random
+
+@testset "norm_name unit tests" begin
+    @test ZipArchives.norm_name("") == ""
+    @test ZipArchives.norm_name("/") == "/"
+    @test ZipArchives.norm_name("//") == "/"
+    @test ZipArchives.norm_name("a") == "a"
+    @test ZipArchives.norm_name("a/") == "a/"
+    @test ZipArchives.norm_name("a//") == "a/"
+    @test ZipArchives.norm_name("AaAa") == "AaAa"
+    @test ZipArchives.norm_name("a/b") == "a/b"
+    @test ZipArchives.norm_name("a//b") == "a/b"
+    # make sure norm_name doesn't error with random input
+    for i in 1:10000
+        ZipArchives.norm_name(String(rand(UInt8, rand(1:10))))
+    end
+end
\ No newline at end of file

From cb3e3a27ca72d9832d0f076d09a9e549d5371723 Mon Sep 17 00:00:00 2001
From: nhz2 <nhz2@cornell.edu>
Date: Tue, 18 Jul 2023 10:23:29 -0400
Subject: [PATCH 2/2] remove name normalization checks

---
 Project.toml                 |  2 +-
 src/filename-checks.jl       | 69 +++++++++++++++++++-----------------
 src/types.jl                 | 15 ++++++--
 src/writer.jl                | 28 ++++++---------
 test/runtests.jl             |  1 +
 test/test_filename-checks.jl | 43 +++++++++++++++-------
 test/test_show.jl            |  8 ++---
 test/test_writer.jl          | 13 +++----
 8 files changed, 103 insertions(+), 76 deletions(-)

diff --git a/Project.toml b/Project.toml
index d1e4b9d..e837739 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ZipArchives"
 uuid = "49080126-0e18-4c2a-b176-c102e4b3760c"
 authors = ["nhz2 <nhz2@cornell.edu>"]
-version = "0.4.6"
+version = "0.5.0"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
diff --git a/src/filename-checks.jl b/src/filename-checks.jl
index c6fc0a8..49d1d77 100644
--- a/src/filename-checks.jl
+++ b/src/filename-checks.jl
@@ -28,55 +28,58 @@ function basic_name_check(name::String)::Nothing
     end
 end
 
-function norm_name(name::AbstractString)::String
-    s1 = join(
-        split(
-            name,
-            '/';
-            keepempty=false
-        ),
-        '/'
-    )
-    if endswith(name, '/')
-        s1*'/'
-    else
-        s1
-    end
-end
-
-
-# name is expected to be already normed and basic checked.
-function check_name_used(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Bool
+# The zip format seems to allow an empty base file name at the top level.
+# Other base file name cannot be empty.
+# Any directory name can be just a "/".
+# For example "a///b.txt" is the following path ("a/", "/", "/", "b.txt")
+# But there is no path for a file ("a/", ""), 
+# because the path "a/" is an empty directory 
+# or a place to store metadata about "a/"
+# Then on extract on windows the empty "/" turn into "_".
+# that is why I check !contains(name, "//")
+# Any entry name that ends in a "/" is always interpreted as 
+# a directory entry.
+# used_stripped_dir_names are the path to all directories with the trailing "/" removed
+function check_name_used(name::String, used_names::Set{String}, used_stripped_dir_names::Set{String})::Nothing
     # basic_name_check(name)
-    # @assert norm_name(name) == name
-    @argcheck name ∉ used_names_lower
+    data = codeunits(name)
+    @argcheck name ∉ used_names
     if !endswith(name, '/')
-        @argcheck name ∉ used_dir_names_lower
+        @argcheck name ∉ used_stripped_dir_names
     end
     # also any of the parent directories implied by name must not be files.
     pos::Int = 1
     while true
         i = findnext('/', name, pos)
         isnothing(i) && break
-        parent_name = @view(name[begin:i-1])
-        @show parent_name
-        @argcheck parent_name ∉ used_names_lower
+        parent_data = @view(data[begin:i-1])
+        # need to rstrip because of repeated '/'
+        if isempty(parent_data) || parent_data[end] != UInt8('/')
+            # this effectively rstrips all '/'
+            parent_name = String(parent_data)
+            # @show parent_name
+            @argcheck parent_name ∉ used_names
+        end
         pos = i+1
     end
 end
 
-# name is expected to be already normed, but might not be checked.
-function add_name_used!(name::String, used_names_lower::Set{String}, used_dir_names_lower::Set{String})::Nothing
-    # @assert norm_name(name) == name
-    push!(name, used_names_lower)
-    # also any of the parent directories implied by name must added to used_dir_names_lower.
+function add_name_used!(name::String, used_names::Set{String}, used_stripped_dir_names::Set{String})::Nothing
+    data = codeunits(name)
+    push!(used_names, name)
+    # also any of the parent directories implied by name must added to used_stripped_dir_names.
     pos::Int = 1
     while true
         i = findnext('/', name, pos)
         isnothing(i) && break
-        parent_name = name[begin:i-1]
-        @show parent_name
-        push!(used_dir_names_lower, parent_name)
+        parent_data = @view(data[begin:i-1])
+        # need to rstrip because of repeated '/'
+        if isempty(parent_data) || parent_data[end] != UInt8('/')
+            # this effectively rstrips all '/'
+            parent_name = String(parent_data)
+            # @show parent_name
+            push!(used_stripped_dir_names, parent_name)
+        end     
         pos = i+1
     end
 end
diff --git a/src/types.jl b/src/types.jl
index 73b9005..8be8a7b 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -72,8 +72,6 @@ end
 
 Base.@kwdef mutable struct PartialEntry
     name::String
-    "lowercase normalized name used to check for name collisions"
-    normed_name::Union{Nothing, String}
     comment::String = ""
     external_attrs::UInt32 = UInt32(0o0100644)<<16 # external file attributes: https://unix.stackexchange.com/questions/14705/the-zip-formats-external-file-attribute
     method::UInt16 = Store # compression method
@@ -96,8 +94,19 @@ mutable struct ZipWriter{S<:IO} <: IO
     partial_entry::Union{Nothing, PartialEntry}
     closed::Bool
     force_zip64::Bool
+
+    "If checking names, this contains all committed entry names, unmodified."
     used_names::Set{String}
-    used_dir_names::Set{String}
+    
+    """
+    If checking names, this contains all implicit and explicit directory paths with all trailing '/'
+    removed. for example, the "/a.txt" entry name creates an explicit directory "/"
+    Which will be stored here as ""
+
+    This is because in zip land there is no concept of "root", 
+    and directories are allowed to be empty strings.
+    """
+    used_stripped_dir_names::Set{String}
     check_names::Bool
     transcoder::Union{Nothing, NoopStream{S}, DeflateCompressorStream{S}}
     function ZipWriter(io::IO;
diff --git a/src/writer.jl b/src/writer.jl
index cbe0161..54315b7 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -104,7 +104,9 @@ function zip_append_archive(io::IO; trunc_footer=true, zip_kwargs=(;))::ZipWrite
         w.entries = entries
         w.central_dir_buffer = central_dir_buffer
         if w.check_names
-            w.used_names_lower = Set{String}(norm_name(String(view(central_dir_buffer, e.name_range))) for e in entries)
+            for e in entries
+                add_name_used!(String(view(central_dir_buffer, e.name_range)), w.used_names, w.used_stripped_dir_names)
+            end
         end
         w
     catch # close io if there is an error parsing entries
@@ -182,18 +184,15 @@ function zip_newfile(w::ZipWriter, name::AbstractString;
     namestr::String = String(name)
     @argcheck ncodeunits(namestr) ≤ typemax(UInt16)
     @argcheck ncodeunits(comment) ≤ typemax(UInt16)
-    normed_name = nothing
     if w.check_names
         basic_name_check(namestr)
         @argcheck !isnothing(external_attrs) || !endswith(namestr, "/")
-        normed_name = norm_name(namestr)
-        @argcheck normed_name ∉ w.used_names_lower
+        check_name_used(namestr, w.used_names, w.used_stripped_dir_names)
     end
     @assert !iswritable(w)
     io = w._io
     pe = PartialEntry(;
         name=namestr,
-        normed_name,
         comment,
         w.force_zip64,
         offset=0, # place holder offset
@@ -337,7 +336,7 @@ function zip_commitfile(w::ZipWriter)
         end
         entry = append_entry!(w.central_dir_buffer, pe)
         if w.check_names
-            push!(w.used_names_lower, something(pe.normed_name))
+            add_name_used!(pe.name, w.used_names, w.used_stripped_dir_names)
         end
         push!(w.entries, entry)
     end
@@ -394,19 +393,16 @@ function zip_writefile(w::ZipWriter, name::AbstractString, data::AbstractVector{
     namestr::String = String(name)
     @argcheck ncodeunits(namestr) ≤ typemax(UInt16)
     @argcheck ncodeunits(comment) ≤ typemax(UInt16)
-    normed_name=nothing
     if w.check_names
         basic_name_check(namestr)
         @argcheck !isnothing(external_attrs) || !endswith(namestr, "/")
-        normed_name = norm_name(namestr)
-        @argcheck normed_name ∉ w.used_names_lower
+        check_name_used(namestr, w.used_names, w.used_stripped_dir_names)
     end
     @assert !iswritable(w)
     io = w._io
     crc32 = zip_crc32(data)
     pe = PartialEntry(;
         name=namestr,
-        normed_name,
         comment,
         offset=0,# place holder offset
         w.force_zip64,
@@ -429,7 +425,7 @@ function zip_writefile(w::ZipWriter, name::AbstractString, data::AbstractVector{
     @assert !iswritable(w)
     entry = append_entry!(w.central_dir_buffer, pe)
     if w.check_names
-        push!(w.used_names_lower, something(pe.normed_name))
+        add_name_used!(namestr, w.used_names, w.used_stripped_dir_names)
     end
     push!(w.entries, entry)
     nothing
@@ -438,18 +434,16 @@ end
 """
     zip_name_collision(w::ZipWriter, new_name::AbstractString)::Bool
 
-Return true if `new_name` matches an existing committed entry.
-
-The check is insensitive to leading and repeated `/`.
+Return true if `new_name` exactly matches an existing committed entry name.
 """
 zip_name_collision(w::ZipWriter, new_name::AbstractString)::Bool = zip_name_collision(w, String(new_name))
 function zip_name_collision(w::ZipWriter, new_name::String)::Bool
     if w.check_names
-        norm_name(new_name) ∈ w.used_names_lower
+        new_name ∈ w.used_names
     else
-        nname = norm_name(new_name)
+        data = codeunits(new_name)
         any(eachindex(w.entries)) do i
-            nname == norm_name(String(_name_view(w, i)))
+            data == _name_view(w, i)
         end
     end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index f0395e9..31d3a56 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,6 +11,7 @@ Random.seed!(1234)
 
 # @test Any[] == detect_ambiguities(Base, Core, ZipArchives)
 include("test_simple-usage.jl")
+include("test_filename-checks.jl")
 include("test_show.jl")
 include("test_writer.jl")
 include("test_reader.jl")
diff --git a/test/test_filename-checks.jl b/test/test_filename-checks.jl
index 788689d..20e17b4 100644
--- a/test/test_filename-checks.jl
+++ b/test/test_filename-checks.jl
@@ -2,18 +2,37 @@ using ZipArchives
 using Test
 using Random
 
-@testset "norm_name unit tests" begin
-    @test ZipArchives.norm_name("") == ""
-    @test ZipArchives.norm_name("/") == "/"
-    @test ZipArchives.norm_name("//") == "/"
-    @test ZipArchives.norm_name("a") == "a"
-    @test ZipArchives.norm_name("a/") == "a/"
-    @test ZipArchives.norm_name("a//") == "a/"
-    @test ZipArchives.norm_name("AaAa") == "AaAa"
-    @test ZipArchives.norm_name("a/b") == "a/b"
-    @test ZipArchives.norm_name("a//b") == "a/b"
-    # make sure norm_name doesn't error with random input
+@testset "check_name_used unit tests" begin
+    # make sure check_name_used doesn't error with random input
+    for i in 1:100000
+        name = String(rand(UInt8, rand(1:10)))
+        ZipArchives.check_name_used(name, Set{String}(), Set{String}())
+    end
+    used_names = Set{String}()
+    used_dirs = Set{String}()
     for i in 1:10000
-        ZipArchives.norm_name(String(rand(UInt8, rand(1:10))))
+        name = String(rand(UInt8, rand(1:10)))
+        ZipArchives.add_name_used!(name, used_names, used_dirs)
+    end
+    for i in 1:1000
+        name = String(rand(UInt8, rand(1:10)))
+        used_names = Set{String}()
+        used_dirs = Set{String}()
+        ZipArchives.add_name_used!(name, used_names, used_dirs)
+        @test_throws ArgumentError ZipArchives.check_name_used(name, used_names, used_dirs)
+        if !endswith(name, "/")
+            @test_throws ArgumentError ZipArchives.check_name_used(name*"/", used_names, used_dirs)
+        end
     end
+
+    used_names = Set{String}()
+    used_dirs = Set{String}()
+    ZipArchives.add_name_used!("", used_names, used_dirs)
+    @test_throws ArgumentError ZipArchives.check_name_used("/a.txt", used_names, used_dirs)
+    ZipArchives.check_name_used("a.txt", used_names, used_dirs)
+    ZipArchives.add_name_used!("a/b/c.txt", used_names, used_dirs)
+    @test_throws ArgumentError ZipArchives.check_name_used("a/b", used_names, used_dirs)
+    ZipArchives.check_name_used("a/b/", used_names, used_dirs)
+    ZipArchives.add_name_used!("a//c.txt", used_names, used_dirs)
+    ZipArchives.check_name_used("a/c.txt", used_names, used_dirs)
 end
\ No newline at end of file
diff --git a/test/test_show.jl b/test/test_show.jl
index 5e0819b..092de43 100644
--- a/test/test_show.jl
+++ b/test/test_show.jl
@@ -21,18 +21,18 @@ using Test
     io = IOBuffer()
     ZipWriter(io) do w
         zip_writefile(w, "test", b"data")
-        zip_writefile(w, "test/foo", b"data")
+        zip_writefile(w, "testdir/foo", b"data")
     end
     data = take!(io)
     r = ZipBufferReader(data)
     @test repr(r) == "ZipArchives.ZipBufferReader($(data))"
     @test sprint(io->(show(io, MIME"text/plain"(), r))) == """
-    246 byte, 2 entry ZipBufferReader{Vector{UInt8}}
+    252 byte, 2 entry ZipBufferReader{Vector{UInt8}}
     total uncompressed size: 8 bytes
       "test"
-      \"test/\""""
+      \"testdir/\""""
     @test sprint(io->(show(IOContext(io, :displaysize => (3, 80)), MIME"text/plain"(), r))) == """
-    246 byte, 2 entry ZipBufferReader{Vector{UInt8}}
+    252 byte, 2 entry ZipBufferReader{Vector{UInt8}}
     total uncompressed size: 8 bytes
       ⋮"""
 end
diff --git a/test/test_writer.jl b/test/test_writer.jl
index 04a0a3e..e1bb27e 100644
--- a/test/test_writer.jl
+++ b/test/test_writer.jl
@@ -56,11 +56,12 @@ ZipWriter(joinpath(tmp, "utf8.zip")) do w
     @test !zip_name_collision(w, "test2")
     @test !zip_name_collision(w, SubString("test2", 1:2))
     @test !zip_name_collision(w, "test2.txt/")
+    @test !zip_name_collision(w, "test2.txt")
     zip_commitfile(w)
     @test !zip_name_collision(w, "test2")
-    @test zip_name_collision(w, "test2.txt/")
+    @test !zip_name_collision(w, "test2.txt/")
     @test zip_name_collision(w, "test2.txt")
-    @test zip_name_collision(w, "Test2.txt")
+    @test !zip_name_collision(w, "Test2.txt")
     @test zip_name_collision(w, "🐨.txt")
 end
 
@@ -170,11 +171,11 @@ end
     io = IOBuffer()
     w = ZipWriter(io; check_names=false)
     zip_mkdir(w, "empty_dir")
-    @test zip_name_collision(w, "empty_dir")
+    @test !zip_name_collision(w, "empty_dir")
     @test zip_name_collision(w, "empty_dir/")
-    @test zip_name_collision(w, "empty_dir//")
-    @test zip_name_collision(w, "/empty_dir")
-    @test zip_name_collision(w, "/empty_Dir/")
+    @test !zip_name_collision(w, "empty_dir//")
+    @test !zip_name_collision(w, "/empty_dir")
+    @test !zip_name_collision(w, "/empty_Dir/")
     @test !zip_name_collision(w, "foo/empty_dir")
     # Adding symlinks requires check_names=false
     ZipArchives.zip_symlink(w, "this is a invalid target", "symlink_entry")