diff --git a/src/irc/tags/simd.rs b/src/irc/tags/simd.rs
index bedcd55..7fec92d 100644
--- a/src/irc/tags/simd.rs
+++ b/src/irc/tags/simd.rs
@@ -140,7 +140,9 @@ fn find_first(data: &[u8], byte: u8) -> Option<usize> {
         let chunk_3 = V::load_aligned(data, offset + V::SIZE * 3).eq(byte);
 
         // early exit if we know that all chunks will have a zero mask
-        if V::SUPPORTS_MOVEMASK_WILL_HAVE_NON_ZERO && !chunk_0.movemask_will_have_non_zero() {
+        if V::SUPPORTS_MOVEMASK_WILL_HAVE_NON_ZERO
+            && !(chunk_0 | chunk_1 | chunk_2 | chunk_3).movemask_will_have_non_zero()
+        {
             offset += V::SIZE;
             continue;
         }
diff --git a/src/irc/wide/aarch64/neon.rs b/src/irc/wide/aarch64/neon.rs
index e1508f3..51c6c87 100644
--- a/src/irc/wide/aarch64/neon.rs
+++ b/src/irc/wide/aarch64/neon.rs
@@ -31,8 +31,8 @@
 // To obtain the position of the charater, divide its trailing zeros by 4.
 
 use core::arch::aarch64::{
-    uint8x16_t, vceqq_u8, vget_lane_u64, vld1q_u8, vreinterpret_u64_u8, vreinterpretq_u16_u8,
-    vshrn_n_u16,
+    uint8x16_t, vceqq_u8, vget_lane_u64, vgetq_lane_u64, vld1q_u8, vorrq_u8, vpmaxq_u8,
+    vreinterpret_u64_u8, vreinterpretq_u16_u8, vreinterpretq_u64_u8, vshrn_n_u16,
 };
 
 // NOTE: neon has no alignment requirements for loads,
@@ -104,6 +104,25 @@ impl Vector {
             Mask(matches)
         }
     }
+
+    pub const SUPPORTS_MOVEMASK_WILL_HAVE_NON_ZERO: bool = true;
+
+    #[inline(always)]
+    pub fn movemask_will_have_non_zero(self) -> bool {
+        unsafe {
+            let low = vreinterpretq_u64_u8(vpmaxq_u8(self.0, self.0));
+            vgetq_lane_u64(low, 0) != 0
+        }
+    }
+}
+
+impl std::ops::BitOr for Vector {
+    type Output = Self;
+
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self {
+        Self(unsafe { vorrq_u8(self.0, rhs.0) })
+    }
 }
 
 #[derive(Clone, Copy)]