diff --git a/src/internal/bytealg/compare_loong64.s b/src/internal/bytealg/compare_loong64.s
index df72a1122b..99c8cda775 100644
--- a/src/internal/bytealg/compare_loong64.s
+++ b/src/internal/bytealg/compare_loong64.s
@@ -28,58 +28,136 @@ TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
 // R7 length of b
 // R4 points to the start of a
 // R6 points to the start of b
-// R13 points to the return value (-1/0/1)
+// for regabi the return value (-1/0/1) in R4
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
-	BEQ	R4, R6, samebytes // same start of a and b
+	BEQ	R4, R6, cmp_len // same start of a and b, then compare lengths
 	SGTU	R5, R7, R9
-	BNE	R0, R9, r2_lt_r1
+	BNE	R9, b_lt_a
 	MOVV	R5, R14
 	JMP	entry
-r2_lt_r1:
-	MOVV	R7, R14 // R14 is min(R4, R5)
+b_lt_a:
+	MOVV	R7, R14 // R14 is min(R5, R7)
 entry:
-	ADDV	R4, R14, R12 // R6 start of a, R14 end of a
-	BEQ	R4, R12, samebytes // length is 0
+	ADDV	R4, R14, R12 // R4 start of a, R12 end of a
+	BEQ	R4, R12, cmp_len // minlength is 0
 
-	SRLV	$4, R14 // R14 is number of chunks
-	BEQ	R0, R14, byte_loop
+tail:
+	MOVV	$2, R15
+	BLT	R14, R15, cmp1 // min < 2
+	SLLV	$1, R15
+	BLT	R14, R15, cmp2 // min < 4
+	SLLV	$1, R15
+	BLT	R14, R15, cmp4 // min < 8
+	SLLV	$1, R15
+	BLT	R14, R15, cmp8 // min < 16
+	SLLV	$1, R15
+	BLT	R14, R15, cmp16 // min < 32
 
-	// make sure both a and b are aligned.
-	OR	R4, R6, R15
-	AND	$7, R15
-	BNE	R0, R15, byte_loop
-
-	PCALIGN	$16
-chunk16_loop:
-	BEQ	R0, R14, byte_loop
+// When min >= 32 bytes, enter the cmp32_loop loop processing:
+// take out 4 8-bytes from a and b in turn for comparison.
+cmp32_loop:
 	MOVV	(R4), R8
 	MOVV	(R6), R9
-	BNE	R8, R9, byte_loop
-	MOVV	8(R4), R16
-	MOVV	8(R6), R17
+	MOVV	8(R4), R10
+	MOVV	8(R6), R11
+	BNE	R8, R9, cmp8a
+	BNE	R10, R11, cmp8b
+	MOVV	16(R4), R8
+	MOVV	16(R6), R9
+	MOVV	24(R4), R10
+	MOVV	24(R6), R11
+	BNE	R8, R9, cmp8a
+	BNE	R10, R11, cmp8b
+	ADDV	$32, R4
+	ADDV	$32, R6
+	SUBV	$32, R14
+	BGE	R14, R15, cmp32_loop
+	BEQ	R14, cmp_len
+
+check16:
+	MOVV	$16, R15
+	BLT	R14, R15, check8
+cmp16:
+	MOVV	(R4), R8
+	MOVV	(R6), R9
+	MOVV	8(R4), R10
+	MOVV	8(R6), R11
+	BNE	R8, R9, cmp8a
+	BNE	R10, R11, cmp8b
 	ADDV	$16, R4
 	ADDV	$16, R6
-	SUBVU	$1, R14
-	BEQ	R16, R17, chunk16_loop
-	SUBV	$8, R4
-	SUBV	$8, R6
+	SUBV	$16, R14
+	BEQ	R14, cmp_len
 
-byte_loop:
-	BEQ	R4, R12, samebytes
+check8:
+	MOVV	$8, R15
+	BLT	R14, R15, check4
+cmp8:
+	MOVV	(R4), R8
+	MOVV	(R6), R9
+	BNE	R8, R9, cmp8a
+	ADDV	$8, R4
+	ADDV	$8, R6
+	SUBV	$8, R14
+	BEQ	R14, cmp_len
+
+check4:
+	MOVV	$4, R15
+	BLT	R14, R15, check2
+cmp4:
+	MOVW	(R4), R8
+	MOVW	(R6), R9
+	BNE	R8, R9, cmp8a
+	ADDV	$4, R4
+	ADDV	$4, R6
+	SUBV	$4, R14
+	BEQ	R14, cmp_len
+
+check2:
+	MOVV	$2, R15
+	BLT	R14, R15, cmp1
+cmp2:
+	MOVH	(R4), R8
+	MOVH	(R6), R9
+	BNE	R8, R9, cmp8a
+	ADDV	$2, R4
+	ADDV	$2, R6
+	SUBV	$2, R14
+	BEQ	R14, cmp_len
+
+cmp1:
+	BEQ	R14, cmp_len
 	MOVBU	(R4), R8
-	ADDVU	$1, R4
 	MOVBU	(R6), R9
-	ADDVU	$1, R6
-	BEQ	R8, R9, byte_loop
+	BNE	R8, R9, byte_cmp
+	JMP	cmp_len
 
+	// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
+cmp8a:
+	MOVV	R8, R10
+	MOVV	R9, R11
+
+	// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
+cmp8b:
+	MOVV	$0xff, R15
+
+	// Take single bytes from R10/R11 in turn for cyclic comparison.
+cmp8_loop:
+	AND	R10, R15, R8
+	AND	R11, R15, R9
+	BNE	R8, R9, byte_cmp
+	SLLV	$8, R15
+	JMP	cmp8_loop
+
+	// Compare 1 bytes taken from R8/R9 that are known to differ.
 byte_cmp:
-	SGTU	R8, R9, R4 // R12 = 1 if (R8 > R9)
+	SGTU	R8, R9, R4 // R4 = 1 if (R8 > R9)
 	BNE	R0, R4, ret
 	MOVV	$-1, R4
 	JMP	ret
 
-samebytes:
+cmp_len:
 	SGTU	R5, R7, R8
 	SGTU	R7, R5, R9
 	SUBV	R9, R8, R4