diff --git a/Project.toml b/Project.toml index c010b93..68a06a9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,10 @@ name = "PairwiseMappingFormat" uuid = "30ee2946-b067-41c0-bed2-0900b78dbe59" -authors = ["Jakob Nybo Nissen "] version = "0.1.0" +authors = ["Jakob Nybo Nissen "] [deps] -MemViews = "a791c907-b98b-4e44-8f4d-e4c2362c6b2f" +MemoryViews = "a791c907-b98b-4e44-8f4d-e4c2362c6b2f" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" StringViews = "354b36f9-a18e-4713-926e-db85100087ba" XAMAuxData = "e99d641e-1821-45d7-9150-ecb7bf333fe1" @@ -12,7 +12,7 @@ XAMAuxData = "e99d641e-1821-45d7-9150-ecb7bf333fe1" [compat] Aqua = "0.8.7" FormatSpecimens = "1.3" -MemViews = "0.2" +MemoryViews = "0.2" PrecompileTools = "1.2.1" StringViews = "1.3.3" Test = "1.11" @@ -22,10 +22,10 @@ julia = "1.11" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" -MemViews = "a791c907-b98b-4e44-8f4d-e4c2362c6b2f" +MemoryViews = "a791c907-b98b-4e44-8f4d-e4c2362c6b2f" StringViews = "354b36f9-a18e-4713-926e-db85100087ba" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" XAMAuxData = "e99d641e-1821-45d7-9150-ecb7bf333fe1" [targets] -test = ["Test", "FormatSpecimens", "MemViews", "StringViews", "XAMAuxData", "Aqua"] +test = ["Test", "FormatSpecimens", "MemoryViews", "StringViews", "XAMAuxData", "Aqua"] diff --git a/docs/Project.toml b/docs/Project.toml index 69ce7a6..388614e 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,6 +1,6 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -MemViews = "a791c907-b98b-4e44-8f4d-e4c2362c6b2f" +MemoryViews = "a791c907-b98b-4e44-8f4d-e4c2362c6b2f" PairwiseMappingFormat = "30ee2946-b067-41c0-bed2-0900b78dbe59" StringViews = "354b36f9-a18e-4713-926e-db85100087ba" XAMAuxData = "e99d641e-1821-45d7-9150-ecb7bf333fe1" diff --git a/docs/src/index.md b/docs/src/index.md index 79ed10a..b5aa8ea 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -12,7 +12,8 @@ end # PairwiseMappingFormat.jl This package is for reading files of the PAF (Pairwise mApping Format) format, -which is a simple tab-delimited format used by e.g. minimap2 and strobealign. +which is a simple tab-delimited format used by e.g. minimap2 and strobealign +to store records representing pairwise alignment between biological sequences. ## Reader The [`PAFReader`](@ref) type wraps an `IO`, and is an iterator of [`PAFRecord`](@ref) objects: @@ -34,32 +35,28 @@ Similar to the common `open(path) do io`-syntax in Julia, `PAFReader` takes an o `PAFReader` object, and close the underlying io when `f` returns (or errors): ```jldoctest -PAFReader(open(path_to_paf)) do reader - for record in reader - println(record.qlen) - end -end - -# output +julia> PAFReader(open(path_to_paf)) do reader + for record in reader + println(record.qlen) + end + end 301156 299273 288659 - ``` The [`PAFReader`](@ref) constructor takes an optional keyword `copy`, which defaults to `true`. If it is `false`, the record will iterate the _same_ [`PAFRecord`](@ref) object, overwriting it in-place. -This reduces allocations and give a slight speed increase, but may result in bugs -if records, or references of records of previous iterations are stored: +This reduces allocations and gives a slight speed increase, but may result in bugs +if records, or references of records of previous iterations are stored and +unexpectedly overwritten: ```jldoctest -records = PAFReader(collect, open(path_to_paf); copy=false) -println(map(i -> i.qlen, records)) +julia> records = PAFReader(collect, open(path_to_paf); copy=false); -# output +julia> println(map(i -> i.qlen, records)) # NB: All the same record! [288659, 288659, 288659] - ``` At the moments, readers do not support seeking. @@ -69,15 +66,15 @@ The mutable [`PAFRecord`](@ref) object represents a single line in a PAF file. The individual columns of the PAF line is obtained by accessing properties of the records: ```jldoctest record -record = PAFReader(first, open(path_to_paf)) - -println(record.qname) -println(record.qlen) -println(record.mapq) +julia> record = PAFReader(first, open(path_to_paf)); -# output +julia> println(record.qname) query1 + +julia> println(record.qlen) 301156 + +julia> println(record.mapq) 0 ``` @@ -137,34 +134,36 @@ The target name and the strand of an unmapped return is `nothing`. You can check if a record is mapped or unmapped with the function `is_mapped`: ```jldoctest unmapped -unmapped = parse(PAFRecord, "myquery\t5032251\t9\t11\t*\t*\t5\t2\t1\t7\t4\t0") -@assert !is_mapped(unmapped) -println(unmapped.qname) -println(unmapped.qlen) -@assert isnothing(unmapped.tname) -# The strand is given by is_rc, and is nothing for unmapped records -@assert isnothing(unmapped.is_rc) +julia> unmapped = parse(PAFRecord, "myquery\t5032251\t9\t11\t*\t*\t5\t2\t1\t7\t4\t0"); -# output +julia> @assert !is_mapped(unmapped) + +julia> println(unmapped.qname) myquery + +julia> println(unmapped.qlen) 5032251 + +julia> @assert isnothing(unmapped.tname) + +julia> # The strand is given by is_rc, and is nothing for unmapped records + @assert isnothing(unmapped.is_rc) ``` All other properties of unmapped records may contain arbitrary data, and should not be relied on. For this reason, the functions [`target_coverage`](@ref), [`query_coverage`](@ref) and [`aln_identity`](@ref) may give nonsense answers for unmapped records: ```jldoctest unmapped -# Note! These results are arbitrary and not guaranteed to be stable -# across minor releases of this package -println(target_coverage(unmapped)) -println(query_coverage(unmapped)) -println(aln_identity(unmapped)) - -# output +julia> # Note! These results are arbitrary and not guaranteed to be stable + # across minor releases of this package + println(target_coverage(unmapped)) Inf + +julia> println(query_coverage(unmapped)) 1.9871822768776836e-7 -NaN +julia> println(aln_identity(unmapped)) +NaN ``` ## Low-level interface @@ -233,7 +232,7 @@ julia> err.line 1 julia> err.kind -TooFewFields +TooFewFields::Err = 0 ``` The precise value of this `.kind` object for any given error condition is subject diff --git a/src/PairwiseMappingFormat.jl b/src/PairwiseMappingFormat.jl index 0ef272d..02c16c8 100644 --- a/src/PairwiseMappingFormat.jl +++ b/src/PairwiseMappingFormat.jl @@ -28,7 +28,7 @@ using PrecompileTools: @compile_workload, @setup_workload, @recompile_invalidati using StringViews: StringView end -using MemViews: MemView, ImmutableMemView +using MemoryViews: MemoryView, ImmutableMemoryView using XAMAuxData.SAM: Auxiliary @@ -178,7 +178,7 @@ end # Return StringView to not allocate function qname(record::PAFRecord) - StringView(ImmutableMemView(getfield(record, :data))[1:(getfield(record, :qname_len))]) + StringView(ImmutableMemoryView(getfield(record, :data))[1:(getfield(record, :qname_len))]) end # Return StringView to not allocate @@ -186,7 +186,7 @@ function tname(record::PAFRecord)::Union{StringView, Nothing} if is_mapped(record) ql = getfield(record, :qname_len) span = (ql+ 1):(ql + getfield(record, :tname_len)) - StringView(ImmutableMemView(getfield(record, :data))[span]) + StringView(ImmutableMemoryView(getfield(record, :data))[span]) else nothing end @@ -396,7 +396,7 @@ end try_parse(x) -> Union{PAFRecord, ParserException} Try parsing `x` into a `PAFRecord`. `x` may be any type that implements -`MemView`, such as `String` or `Vector{UInt8}`. +`MemoryView`, such as `String` or `Vector{UInt8}`. # Examples ```jldoctest @@ -411,10 +411,10 @@ PairwiseMappingFormat.ParserException See also: [`PAFRecord`](@ref), [`try_next!`](@ref) """ -try_parse(x) = try_parse(ImmutableMemView(x)) +try_parse(x) = try_parse(ImmutableMemoryView(x)) # At least 19 bytes of input are stored in the fixed fields, so # we need at most 19 fewer bytes in the data field of the vector. -try_parse(x::ImmutableMemView{UInt8}) = parse_line!(PAFRecord(length(x) - 19), x) +try_parse(x::ImmutableMemoryView{UInt8}) = parse_line!(PAFRecord(length(x) - 19), x) function Base.parse(::Type{PAFRecord}, s::Union{String, SubString{String}}) y = try_parse(s) @@ -423,7 +423,7 @@ end function parse_line!( record::PAFRecord, - mem::ImmutableMemView{UInt8}, + mem::ImmutableMemoryView{UInt8}, )::Union{PAFRecord, ParserException} if lastindex(mem) > typemax(Int32) error("PairwiseMappingFormat.jl can't handle lines longer than 2147483647 bytes") @@ -477,7 +477,7 @@ function parse_line!( data = getfield(record, :data) length(data) == filled || resize!(data, filled) doff = 1 - dataview = MemView(data) + dataview = MemoryView(data) unsafe_copyto!(dataview, mem[qname]) doff += length(qname) unsafe_copyto!(dataview[doff:end], mem[tname]) @@ -509,7 +509,7 @@ end # i is the byte index of the strand plus two function finish_unmapped!( record::PAFRecord, - mem::ImmutableMemView{UInt8}, + mem::ImmutableMemoryView{UInt8}, qname::UnitRange{Int}, qlen::Int32, i::Int @@ -531,7 +531,7 @@ function finish_unmapped!( # Copy data filled = length(qname) + length(aux) data = getfield(record, :data) - dataview = MemView(data) + dataview = MemoryView(data) length(data) == filled || resize!(data, filled) unsafe_copyto!(dataview, mem[qname]) unsafe_copyto!(dataview[length(qname)+1:end], mem[aux]) @@ -556,7 +556,7 @@ end # Just get the index of the next \t function parse_str( - v::ImmutableMemView{UInt8}, + v::ImmutableMemoryView{UInt8}, from::Int, )::Union{Tuple{UnitRange{Int}, Int}, ParserException} i = findnext(==(UInt8('\t')), v, from) @@ -569,7 +569,7 @@ end # TODO: 40% of runtime is spent in this function. Microoptimize it function parse_int( - v::ImmutableMemView{UInt8}, + v::ImmutableMemoryView{UInt8}, from::Int, allow_zero::Bool, at_end::Bool=false, @@ -820,7 +820,7 @@ function try_next!(reader::PAFReader)::Union{Nothing, PAFRecord, ParserException n_scanned = 0 newline = findfirst( ==(0x0a), - ImmutableMemView(reader.mem)[(reader.index + n_scanned):(reader.filled % Int)], + ImmutableMemoryView(reader.mem)[(reader.index + n_scanned):(reader.filled % Int)], ) n_bytes_read = 0 # Keep reading until we find a newline or reach EOF @@ -832,7 +832,7 @@ function try_next!(reader::PAFReader)::Union{Nothing, PAFRecord, ParserException iszero(n_bytes_read) && break newline = findfirst( ==(0x0a), - ImmutableMemView(reader.mem)[(reader.index + n_scanned):(reader.filled % Int)], + ImmutableMemoryView(reader.mem)[(reader.index + n_scanned):(reader.filled % Int)], ) end # If a newline was found, the current value of newline was relative to where @@ -852,13 +852,13 @@ function try_next!(reader::PAFReader)::Union{Nothing, PAFRecord, ParserException newline - (1 + has_windows_newline) end end - parse_mem = ImmutableMemView(mem)[reader.index:last_index] + parse_mem = ImmutableMemoryView(mem)[reader.index:last_index] # If there is no memory to parse (and we have already verified that # there are no more bytes in the IO), we end if isempty(parse_mem) line = reader.line col = 0 - @inbounds for b in ImmutableMemView(mem)[reader.index:reader.filled] + @inbounds for b in ImmutableMemoryView(mem)[reader.index:reader.filled] col += 1 line += b == UInt8('\n') col *= b != UInt8('\n') diff --git a/sticker.svg b/sticker.svg index 1b2ea24..c2c0b89 100644 --- a/sticker.svg +++ b/sticker.svg @@ -1,26 +1,169 @@ - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/runtests.jl b/test/runtests.jl index 51b6b1e..848a043 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using PairwiseMappingFormat using PairwiseMappingFormat: Errors, try_parse using Test using FormatSpecimens -using MemViews +using MemoryViews using Aqua using XAMAuxData.SAM: AuxTag @@ -171,7 +171,7 @@ end # that the aux data has the right span and byte content. function test_aux_content(record_data::String, target_aux::String) rec = parse(PAFRecord, record_data) - @test collect(MemView(aux_data(rec))) == codeunits(target_aux) + @test collect(MemoryView(aux_data(rec))) == codeunits(target_aux) end @testset "Auxiliary data" begin