#include <arm_neon.h>

/* This code is almost the same as the SSE implementation; please refer to
 * utf8-range-sse.inc for a detailed explanation.
 * The only difference is the range adjustment step, which is more
 * straightforward in the NEON code.
 */

/* NEON UTF-8 range validation, 16 bytes per iteration.
 *
 * Every input byte is assigned a "range index" derived from the lead bytes
 * that sit one, two and three positions before it, and is then compared with
 * the [min, max] interval stored for that index. Whatever the vector loop does
 * not consume is passed to utf8_range_ValidateUTF8Naive().
 *
 * data_original    Start of the caller's buffer. Used only to detect "nothing
 *                  consumed yet" and to compute the offset added to the result
 *                  in position mode.
 * data             First byte to be examined by the vector loop.
 * end              One past the last byte; the loop runs while at least 16
 *                  bytes remain before it.
 * return_position  Non-zero: stop at the first 16-byte block containing an
 *                  out-of-range byte and return (data - data_original) plus
 *                  the naive validator's result for the remainder.
 *                  Zero: accumulate violations across all blocks; return 0 if
 *                  any were seen, otherwise the naive validator's result for
 *                  the tail.
 */
static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
    const char* data_original, const char* data, const char* end,
    int return_position) {
  /* Keyed by the high nibble of a byte: 0xC and 0xD give 1, 0xE gives 2,
   * 0xF gives 3, everything else 0 (continuation bytes announced by a lead
   * byte). */
  const uint8x16_t lead_len_lut = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
  };
  /* Keyed by the high nibble of a byte: 0xC..0xF start at range index 8,
   * all other bytes start at 0. */
  const uint8x16_t lead_range_lut = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
  };
  /* Lower and upper bound per range index:
   *   0      00..7F
   *   1,2,3  80..BF
   *   4      A0..BF   (reached as 2 + 2, byte following E0)
   *   5      80..9F   (reached as 2 + 3, byte following ED)
   *   6      90..BF   (reached as 3 + 3, byte following F0)
   *   7      80..8F   (reached as 3 + 4, byte following F4)
   *   8      C2..F4
   *   9..15  min FF / max 00, which no byte can satisfy
   */
  const uint8x16_t range_lo_lut = {
      0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
      0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  };
  const uint8x16_t range_hi_lut = {
      0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
      0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  };
  /* Range adjustment, stored interleaved so that vld2q_u8 splits it into two
   * 16-entry halves. Read the initializer column-wise: the left column holds
   * entries 0..15 (previous byte E0..EF), the right column entries 16..31
   * (previous byte F0..FF).
   */
  // clang-format off
  const uint8_t range_adjust_interleaved[] = {
    /* index -> 0~15  16~31 <- index */
    /*  E0 -> */ 2,     3, /* <- F0  */
                 0,     0,
                 0,     0,
                 0,     0,
                 0,     4, /* <- F4  */
                 0,     0,
                 0,     0,
                 0,     0,
                 0,     0,
                 0,     0,
                 0,     0,
                 0,     0,
                 0,     0,
    /*  ED -> */ 3,     0,
                 0,     0,
                 0,     0,
  };
  // clang-format on
  const uint8x16x2_t range_adjust_lut = vld2q_u8(range_adjust_interleaved);

  const uint8x16_t one = vdupq_n_u8(1);
  const uint8x16_t two = vdupq_n_u8(2);
  const uint8x16_t e0_bias = vdupq_n_u8(0xE0);

  /* State carried from one block to the next; zero before the first block. */
  uint8x16_t prev_block = vdupq_n_u8(0);
  uint8x16_t prev_lead_len = vdupq_n_u8(0);
  uint8x16_t violations = vdupq_n_u8(0);

  for (; end - data >= 16; data += 16) {
    const uint8x16_t block = vld1q_u8((const uint8_t*)data);
    const uint8x16_t hi_nibble = vshrq_n_u8(block, 4);
    const uint8x16_t lead_len = vqtbl1q_u8(lead_len_lut, hi_nibble);

    /* Lead lengths as seen 1, 2 and 3 bytes later, pulling the trailing bytes
     * of the previous block into the low lanes. The saturating subtractions
     * leave a non-zero value only where a continuation byte is still due. */
    const uint8x16_t due_after_1 = vextq_u8(prev_lead_len, lead_len, 15);
    const uint8x16_t due_after_2 =
        vqsubq_u8(vextq_u8(prev_lead_len, lead_len, 14), one);
    const uint8x16_t due_after_3 =
        vqsubq_u8(vextq_u8(prev_lead_len, lead_len, 13), two);

    uint8x16_t range = vqtbl1q_u8(lead_range_lut, hi_nibble);
    range = vorrq_u8(range, due_after_1);
    range = vorrq_u8(range, due_after_2);
    range = vorrq_u8(range, due_after_3);

    /* Previous byte minus 0xE0 (wrapping) indexes the 32-entry adjustment
     * table; bytes below 0xE0 wrap to an index past the table and so
     * contribute 0. */
    const uint8x16_t adjust_index =
        vsubq_u8(vextq_u8(prev_block, block, 15), e0_bias);
    range = vaddq_u8(range, vqtbl2q_u8(range_adjust_lut, adjust_index));

    const uint8x16_t lower = vqtbl1q_u8(range_lo_lut, range);
    const uint8x16_t upper = vqtbl1q_u8(range_hi_lut, range);
    const uint8x16_t out_of_range =
        vorrq_u8(vcltq_u8(block, lower), vcgtq_u8(block, upper));

    if (return_position) {
      /* Position mode cares only about the current block: leave the loop with
       * `data` still pointing at the first block that failed. */
      violations = out_of_range;
      if (vmaxvq_u32(vreinterpretq_u32_u8(violations))) {
        break;
      }
    } else {
      violations = vorrq_u8(violations, out_of_range);
    }

    prev_block = block;
    prev_lead_len = lead_len;
  }

  /* Position mode with no block accepted: the scalar path handles everything
   * from the start of the buffer. */
  if (return_position && data == data_original) {
    return utf8_range_ValidateUTF8Naive(data, end, return_position);
  }

  /* Pass the final 32-bit lane (bytes 12..15) of the last accepted block to
   * utf8_range_CodepointSkipBackwards() and rewind by what it returns.
   * NOTE(review): presumably this moves `data` back to the start of a code
   * point that crosses the block boundary - helper is defined elsewhere,
   * confirm there. */
  const int32_t last_lane =
      vgetq_lane_s32(vreinterpretq_s32_u8(prev_block), 3);
  data -= utf8_range_CodepointSkipBackwards(last_lane);

  if (!return_position) {
    if (vmaxvq_u32(vreinterpretq_u32_u8(violations))) {
      return 0;
    }
    return utf8_range_ValidateUTF8Naive(data, end, return_position);
  }
  return (data - data_original) +
         utf8_range_ValidateUTF8Naive(data, end, return_position);
}