// NOTE(review): this excerpt is the interior of a function whose signature is
// not visible here. Every statement below appears twice; if both copies really
// share one scope, `arr2` and `j` are redeclared and this will not compile --
// confirm whether the duplication is an extraction/merge artifact.
//
// This load is safe because needles.size() >= 16
// (_mm_loadu_si128 is the unaligned 128-bit load, so it reads 16 bytes
// starting at needles.data() with no alignment requirement. The size
// precondition itself is not checked in the visible code -- TODO confirm it is
// enforced by the caller.)
auto arr2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(needles.data()));
// This load is safe because needles.size() >= 16
auto arr2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(needles.data()));
// _mm_cmpestri with mode 0 (unsigned bytes, "equal any", least-significant
// index): arr2 is the 16-byte needle set, arr1 is the haystack block with the
// explicit length given by the 4th argument; the intrinsic yields the index of
// the first byte of arr1 that equals any byte of arr2, or 16 if there is none.
// NOTE(review): the return value is discarded in this excerpt -- presumably
// the original assigns it to a variable on a line that is not shown.
// NOTE(review): `haystack.size() - blockStartIdx` is narrowed to int; assumes
// blockStartIdx <= haystack.size() -- verify against the caller. `arr1` is
// defined outside this excerpt.
_mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
// nextAlignedIndex is defined outside this excerpt; presumably it returns the
// offset from needles.data() to the next 16-byte-aligned address -- TODO
// confirm.
size_t j = nextAlignedIndex(needles.data());
_mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
size_t j = nextAlignedIndex(needles.data());