wip: movemask_will_have_non_zero

jprochazk · Jun 24, 2024 · a49590a · a49590a
1 parent 089529b
commit a49590a
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 3 deletions.
diff --git a/src/irc/tags/simd.rs b/src/irc/tags/simd.rs
@@ -140,7 +140,9 @@ fn find_first(data: &[u8], byte: u8) -> Option<usize> {
     let chunk_3 = V::load_aligned(data, offset + V::SIZE * 3).eq(byte);
 
     // early exit if we know that all chunks will have a zero mask
-    if V::SUPPORTS_MOVEMASK_WILL_HAVE_NON_ZERO && !chunk_0.movemask_will_have_non_zero() {
+    if V::SUPPORTS_MOVEMASK_WILL_HAVE_NON_ZERO
+      && !(chunk_0 | chunk_1 | chunk_2 | chunk_3).movemask_will_have_non_zero()
+    {
       offset += V::SIZE;
       continue;
     }

diff --git a/src/irc/wide/aarch64/neon.rs b/src/irc/wide/aarch64/neon.rs
@@ -31,8 +31,8 @@
 // To obtain the position of the character, divide its trailing zeros by 4.
 
 use core::arch::aarch64::{
-  uint8x16_t, vceqq_u8, vget_lane_u64, vld1q_u8, vreinterpret_u64_u8, vreinterpretq_u16_u8,
-  vshrn_n_u16,
+  uint8x16_t, vceqq_u8, vget_lane_u64, vgetq_lane_u64, vld1q_u8, vorrq_u8, vpmaxq_u8,
+  vreinterpret_u64_u8, vreinterpretq_u16_u8, vreinterpretq_u64_u8, vshrn_n_u16,
 };
 
 // NOTE: neon has no alignment requirements for loads,
@@ -104,6 +104,25 @@ impl Vector {
       Mask(matches)
     }
   }
+
+  pub const SUPPORTS_MOVEMASK_WILL_HAVE_NON_ZERO: bool = true; // this backend provides the cheap "any byte set?" check defined below
+
+  #[inline(always)]
+  pub fn movemask_will_have_non_zero(self) -> bool { // returns true if any byte of `self.0` is non-zero
+    unsafe { // NOTE(review): no SAFETY note here; presumably sound because NEON is always present on this target — confirm
+      let low = vreinterpretq_u64_u8(vpmaxq_u8(self.0, self.0)); // pairwise max of the vector with itself: the low 8 result bytes are max(b[2i], b[2i+1]), covering all 16 input bytes
+      vgetq_lane_u64(low, 0) != 0 // only lane 0 is read; it is non-zero exactly when some input byte was non-zero
+    }
+  }
+}
+
+impl std::ops::BitOr for Vector { // enables `a | b` on vectors; presumably what lets `find_first` merge its chunk comparisons — confirm against the type returned by `eq`
+  type Output = Self;
+
+  #[inline(always)]
+  fn bitor(self, rhs: Self) -> Self {
+    Self(unsafe { vorrq_u8(self.0, rhs.0) }) // bitwise OR of the two 128-bit registers, wrapped back into `Vector`
+  }
 }
 
 #[derive(Clone, Copy)]