mirror of
https://github.com/golang/go.git
synced 2025-05-08 00:53:07 +00:00
ARM64 supports load/store instructions with a memory operand whose address is calculated as base register + index register. In this CL, 1. Some rules are added to the compiler's ARM64 backend to emit such efficient instructions. 2. An incorrect load-combining rule is fixed. The go1 benchmark shows an improvement. name old time/op new time/op delta BinaryTree17-4 44.5s ± 2% 44.1s ± 1% -0.81% (p=0.000 n=28+29) Fannkuch11-4 32.7s ± 3% 30.5s ± 0% -6.79% (p=0.000 n=30+26) FmtFprintfEmpty-4 499ns ± 0% 506ns ± 5% +1.39% (p=0.003 n=25+30) FmtFprintfString-4 1.07µs ± 0% 1.04µs ± 4% -3.17% (p=0.000 n=23+30) FmtFprintfInt-4 1.15µs ± 4% 1.13µs ± 0% -1.55% (p=0.000 n=30+23) FmtFprintfIntInt-4 1.77µs ± 4% 1.74µs ± 0% -1.71% (p=0.000 n=30+24) FmtFprintfPrefixedInt-4 2.37µs ± 5% 2.12µs ± 0% -10.56% (p=0.000 n=30+23) FmtFprintfFloat-4 3.03µs ± 1% 3.03µs ± 4% -0.13% (p=0.003 n=25+30) FmtManyArgs-4 7.38µs ± 1% 7.43µs ± 4% +0.59% (p=0.003 n=25+30) GobDecode-4 101ms ± 6% 95ms ± 5% -5.55% (p=0.000 n=30+30) GobEncode-4 78.0ms ± 4% 78.8ms ± 6% +1.05% (p=0.000 n=30+30) Gzip-4 4.25s ± 0% 4.27s ± 4% +0.45% (p=0.003 n=24+30) Gunzip-4 428ms ± 1% 420ms ± 0% -1.88% (p=0.000 n=23+23) HTTPClientServer-4 549µs ± 1% 541µs ± 1% -1.56% (p=0.000 n=29+29) JSONEncode-4 194ms ± 0% 188ms ± 4% ~ (p=0.417 n=23+30) JSONDecode-4 890ms ± 5% 831ms ± 0% -6.55% (p=0.000 n=30+23) Mandelbrot200-4 47.3ms ± 2% 46.5ms ± 0% ~ (p=0.980 n=30+26) GoParse-4 43.1ms ± 6% 43.8ms ± 6% +1.65% (p=0.000 n=30+30) RegexpMatchEasy0_32-4 1.06µs ± 0% 1.07µs ± 3% ~ (p=0.092 n=23+30) RegexpMatchEasy0_1K-4 5.53µs ± 0% 5.51µs ± 0% -0.24% (p=0.000 n=25+25) RegexpMatchEasy1_32-4 1.02µs ± 3% 1.01µs ± 0% -1.27% (p=0.000 n=30+24) RegexpMatchEasy1_1K-4 7.26µs ± 0% 7.33µs ± 0% +0.95% (p=0.000 n=23+26) RegexpMatchMedium_32-4 1.84µs ± 7% 1.79µs ± 1% ~ (p=0.333 n=30+23) RegexpMatchMedium_1K-4 553µs ± 0% 547µs ± 0% -1.14% (p=0.000 n=24+22) RegexpMatchHard_32-4 30.8µs ± 1% 30.3µs ± 0% -1.40% (p=0.000 n=24+24) RegexpMatchHard_1K-4 928µs ± 0% 929µs ± 
5% +0.12% (p=0.013 n=23+30) Revcomp-4 8.13s ± 4% 6.32s ± 1% -22.23% (p=0.000 n=30+23) Template-4 899ms ± 6% 854ms ± 1% -5.01% (p=0.000 n=30+24) TimeParse-4 4.66µs ± 4% 4.59µs ± 1% -1.57% (p=0.000 n=30+23) TimeFormat-4 4.58µs ± 0% 4.61µs ± 0% +0.57% (p=0.000 n=26+24) [Geo mean] 717µs 698µs -2.55% name old speed new speed delta GobDecode-4 7.63MB/s ± 6% 8.08MB/s ± 5% +5.88% (p=0.000 n=30+30) GobEncode-4 9.85MB/s ± 4% 9.75MB/s ± 6% -1.04% (p=0.000 n=30+30) Gzip-4 4.56MB/s ± 0% 4.55MB/s ± 4% -0.36% (p=0.003 n=24+30) Gunzip-4 45.3MB/s ± 1% 46.2MB/s ± 0% +1.92% (p=0.000 n=23+23) JSONEncode-4 10.0MB/s ± 0% 10.4MB/s ± 4% ~ (p=0.403 n=23+30) JSONDecode-4 2.18MB/s ± 5% 2.33MB/s ± 0% +6.91% (p=0.000 n=30+23) GoParse-4 1.34MB/s ± 5% 1.32MB/s ± 5% -1.66% (p=0.000 n=30+30) RegexpMatchEasy0_32-4 30.2MB/s ± 0% 29.8MB/s ± 3% ~ (p=0.099 n=23+30) RegexpMatchEasy0_1K-4 185MB/s ± 0% 186MB/s ± 0% +0.24% (p=0.000 n=25+25) RegexpMatchEasy1_32-4 31.4MB/s ± 3% 31.8MB/s ± 0% +1.24% (p=0.000 n=30+24) RegexpMatchEasy1_1K-4 141MB/s ± 0% 140MB/s ± 0% -0.94% (p=0.000 n=23+26) RegexpMatchMedium_32-4 541kB/s ± 6% 560kB/s ± 0% +3.45% (p=0.000 n=30+23) RegexpMatchMedium_1K-4 1.85MB/s ± 0% 1.87MB/s ± 0% +1.08% (p=0.000 n=24+23) RegexpMatchHard_32-4 1.04MB/s ± 1% 1.06MB/s ± 1% +1.48% (p=0.000 n=24+24) RegexpMatchHard_1K-4 1.10MB/s ± 0% 1.10MB/s ± 5% +0.15% (p=0.004 n=23+30) Revcomp-4 31.3MB/s ± 4% 40.2MB/s ± 1% +28.52% (p=0.000 n=30+23) Template-4 2.16MB/s ± 6% 2.27MB/s ± 1% +5.18% (p=0.000 n=30+24) [Geo mean] 7.57MB/s 7.79MB/s +2.98% fixes #24907 Change-Id: I94afd0e3f53d62a1cf5e452f3dd6daf61be21785 Reviewed-on: https://go-review.googlesource.com/107376 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
314 lines
8.1 KiB
Go
314 lines
8.1 KiB
Go
// asmcheck
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package codegen
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"runtime"
|
|
)
|
|
|
|
// sink64, sink32 and sink16 are package-level variables that the load_*
// functions in this file assign their decoded value to, and that the
// store_* functions read the value to encode from.
var sink64 uint64
|
|
var sink32 uint32
|
|
var sink16 uint16
|
|
|
|
// ------------- //
|
|
// Loading //
|
|
// ------------- //
|
|
|
|
// load_le64 decodes a little-endian uint64 from b with
// binary.LittleEndian.Uint64 and assigns it to sink64. The directives
// below require a 64-bit load on each listed architecture; the negated
// (-) patterns on arm64 and ppc64le reject narrower loads.
func load_le64(b []byte) {
|
|
// amd64:`MOVQ\s\(.*\),`
|
|
// s390x:`MOVDBR\s\(.*\),`
|
|
// arm64:`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`
|
|
// ppc64le:`MOVD\s`,-`MOV[BHW]Z`
|
|
sink64 = binary.LittleEndian.Uint64(b)
|
|
}
|
|
|
|
// load_le64_idx is the indexed variant of load_le64: it decodes a
// little-endian uint64 from b[idx:] into sink64. The amd64, s390x and
// arm64 patterns require a 64-bit load whose memory operand has both a
// base and an index part; the ppc64le pattern only requires a MOVD and
// rejects the narrower zero-extending loads.
func load_le64_idx(b []byte, idx int) {
|
|
// amd64:`MOVQ\s\(.*\)\(.*\*1\),`
|
|
// s390x:`MOVDBR\s\(.*\)\(.*\*1\),`
|
|
// arm64:`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[BHW]`
|
|
// ppc64le:`MOVD\s`,-`MOV[BHW]Z\s`
|
|
sink64 = binary.LittleEndian.Uint64(b[idx:])
|
|
}
|
|
|
|
// load_le32 decodes a little-endian uint32 from b into sink32. The
// directives require a 32-bit load on amd64, 386, s390x, arm64 and
// ppc64le; on arm64 byte and halfword loads are additionally rejected.
func load_le32(b []byte) {
|
|
// amd64:`MOVL\s\(.*\),` 386:`MOVL\s\(.*\),`
|
|
// s390x:`MOVWBR\s\(.*\),`
|
|
// arm64:`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`
|
|
// ppc64le:`MOVWZ\s`
|
|
sink32 = binary.LittleEndian.Uint32(b)
|
|
}
|
|
|
|
// load_le32_idx decodes a little-endian uint32 from b[idx:] into sink32.
// The amd64, 386, s390x and arm64 patterns require a 32-bit load with a
// base-plus-index memory operand; arm64 also rejects byte and halfword
// loads. The ppc64le pattern only requires a MOVWZ.
func load_le32_idx(b []byte, idx int) {
|
|
// amd64:`MOVL\s\(.*\)\(.*\*1\),` 386:`MOVL\s\(.*\)\(.*\*1\),`
|
|
// s390x:`MOVWBR\s\(.*\)\(.*\*1\),`
|
|
// arm64:`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[BH]`
|
|
// ppc64le:`MOVWZ\s`
|
|
sink32 = binary.LittleEndian.Uint32(b[idx:])
|
|
}
|
|
|
|
// load_le16 decodes a little-endian uint16 from b into sink16. The
// directives require a 16-bit zero-extending load on amd64, ppc64le and
// arm64; arm64 also rejects byte loads.
func load_le16(b []byte) {
|
|
// amd64:`MOVWLZX\s\(.*\),`
|
|
// ppc64le:`MOVHZ\s`
|
|
// arm64:`MOVHU\s\(R[0-9]+\),`,-`MOVB`
|
|
sink16 = binary.LittleEndian.Uint16(b)
|
|
}
|
|
|
|
// load_le16_idx decodes a little-endian uint16 from b[idx:] into sink16.
// The arm64 pattern requires a halfword load with a base-plus-index
// operand and rejects byte loads.
// NOTE(review): unlike load_le64_idx and load_le32_idx, the amd64 pattern
// here does not spell out the (base)(index*1) operand form; confirm
// whether that looser check is intentional.
func load_le16_idx(b []byte, idx int) {
|
|
// amd64:`MOVWLZX\s\(.*\),`
|
|
// ppc64le:`MOVHZ\s`
|
|
// arm64:`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
|
|
sink16 = binary.LittleEndian.Uint16(b[idx:])
|
|
}
|
|
|
|
// load_be64 decodes a big-endian uint64 from b into sink64. The amd64
// pattern requires a BSWAPQ, s390x a plain MOVD load, and arm64 a REV
// together with one MOVD load while rejecting narrower loads and the
// narrower REVW/REV16W byte-reverse instructions.
func load_be64(b []byte) {
|
|
// amd64:`BSWAPQ`
|
|
// s390x:`MOVD\s\(.*\),`
|
|
// arm64:`REV`,`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`,-`REVW`,-`REV16W`
|
|
sink64 = binary.BigEndian.Uint64(b)
|
|
}
|
|
|
|
// load_be64_idx decodes a big-endian uint64 from b[idx:] into sink64.
// It has the same expectations as load_be64, except that the s390x and
// arm64 load patterns require a base-plus-index memory operand.
func load_be64_idx(b []byte, idx int) {
|
|
// amd64:`BSWAPQ`
|
|
// s390x:`MOVD\s\(.*\)\(.*\*1\),`
|
|
// arm64:`REV`,`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[WHB]`,-`REVW`,-`REV16W`
|
|
sink64 = binary.BigEndian.Uint64(b[idx:])
|
|
}
|
|
|
|
// load_be32 decodes a big-endian uint32 from b into sink32. The amd64
// pattern requires a BSWAPL, s390x a MOVWZ load, and arm64 a REVW plus
// one MOVWU load while rejecting byte/halfword loads and REV16W.
func load_be32(b []byte) {
|
|
// amd64:`BSWAPL`
|
|
// s390x:`MOVWZ\s\(.*\),`
|
|
// arm64:`REVW`,`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`,-`REV16W`
|
|
sink32 = binary.BigEndian.Uint32(b)
|
|
}
|
|
|
|
// load_be32_idx decodes a big-endian uint32 from b[idx:] into sink32.
// It has the same expectations as load_be32, except that the s390x and
// arm64 load patterns require a base-plus-index memory operand.
func load_be32_idx(b []byte, idx int) {
|
|
// amd64:`BSWAPL`
|
|
// s390x:`MOVWZ\s\(.*\)\(.*\*1\),`
|
|
// arm64:`REVW`,`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[HB]`,-`REV16W`
|
|
sink32 = binary.BigEndian.Uint32(b[idx:])
|
|
}
|
|
|
|
// load_be16 decodes a big-endian uint16 from b into sink16. The amd64
// pattern requires a ROLW by 8; the arm64 patterns require a REV16W and
// one MOVHU load, and reject byte loads.
func load_be16(b []byte) {
|
|
// amd64:`ROLW\s\$8`
|
|
// arm64: `REV16W`,`MOVHU\s\(R[0-9]+\),`,-`MOVB`
|
|
sink16 = binary.BigEndian.Uint16(b)
|
|
}
|
|
|
|
// load_be16_idx decodes a big-endian uint16 from b[idx:] into sink16.
// It has the same expectations as load_be16, except that the arm64 load
// pattern requires a base-plus-index memory operand.
func load_be16_idx(b []byte, idx int) {
|
|
// amd64:`ROLW\s\$8`
|
|
// arm64: `REV16W`,`MOVHU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOVB`
|
|
sink16 = binary.BigEndian.Uint16(b[idx:])
|
|
}
|
|
|
|
// load_byte2_uint16 assembles a uint16 by hand from s[0] (low byte) and
// s[1] (high byte). On arm64 the directive requires a single MOVHU load
// and rejects an ORR with an operand shifted left by 8.
func load_byte2_uint16(s []byte) uint16 {
|
|
// arm64:`MOVHU\t\(R[0-9]+\)`,-`ORR\tR[0-9]+<<8`
|
|
return uint16(s[0]) | uint16(s[1])<<8
|
|
}
|
|
|
|
// Check load combining across function calls.
|
|
|
|
// fcall_byte passes both results of one call to itself directly as the
// arguments of another call. The amd64 directive requires a MOVW on that
// line, presumably the two adjacent byte values being moved as one
// 16-bit unit.
func fcall_byte(a, b byte) (byte, byte) {
|
|
return fcall_byte(fcall_byte(a, b)) // amd64:`MOVW`
|
|
}
|
|
|
|
// fcall_uint16 is the uint16 variant of fcall_byte: the results of the
// inner call feed the outer call directly. The amd64 directive requires
// a MOVL on that line, presumably both 16-bit values moved together.
func fcall_uint16(a, b uint16) (uint16, uint16) {
|
|
return fcall_uint16(fcall_uint16(a, b)) // amd64:`MOVL`
|
|
}
|
|
|
|
// fcall_uint32 is the uint32 variant of fcall_byte: the results of the
// inner call feed the outer call directly. The amd64 directive requires
// a MOVQ on that line, presumably both 32-bit values moved together.
func fcall_uint32(a, b uint32) (uint32, uint32) {
|
|
return fcall_uint32(fcall_uint32(a, b)) // amd64:`MOVQ`
|
|
}
|
|
|
|
// We want to merge load+op in the first function, but not in the
|
|
// second. See Issue 19595.
|
|
// load_op_merge reads *p once and adds it to *q. The amd64 directive
// requires an ADDQ whose first operand starts with a parenthesis, i.e.
// the load of *p folded into the add as a memory operand.
func load_op_merge(p, q *int) {
|
|
x := *p
|
|
*q += x // amd64:`ADDQ\t\(`
|
|
}
|
|
// load_op_no_merge reads *p once before a ten-iteration loop and adds
// the value to *q on every iteration. The amd64 directive requires the
// ADDQ inside the loop to take a register as its first operand, so the
// load must not be folded into the add there.
func load_op_no_merge(p, q *int) {
|
|
x := *p
|
|
for i := 0; i < 10; i++ {
|
|
*q += x // amd64:`ADDQ\t[A-Z]`
|
|
}
|
|
}
|
|
|
|
// Make sure offsets are folded into loads and stores.
|
|
// offsets_fold copies the 20-byte array argument a into the named result
// b. On arm64 the directive requires MOVD loads addressed as a+offset(FP)
// and MOVD stores addressed as b+offset(FP).
func offsets_fold(_, a [20]byte) (b [20]byte) {
|
|
// arm64:`MOVD\t""\.a\+[0-9]+\(FP\), R[0-9]+`,`MOVD\tR[0-9]+, ""\.b\+[0-9]+\(FP\)`
|
|
b = a
|
|
return
|
|
}
|
|
|
|
// Make sure we don't put pointers in SSE registers across safe
|
|
// points.
|
|
|
|
// safe_point reads both pointers out of *p, calls runtime.GC, and then
// writes them into *q. The amd64 directives reject MOVUPS on both the
// load line and the store line.
func safe_point(p, q *[2]*int) {
|
|
a, b := p[0], p[1] // amd64:-`MOVUPS`
|
|
runtime.GC()
|
|
q[0], q[1] = a, b // amd64:-`MOVUPS`
|
|
}
|
|
|
|
// ------------- //
|
|
// Storing //
|
|
// ------------- //
|
|
|
|
// store_le64 encodes sink64 into b in little-endian order with
// binary.LittleEndian.PutUint64. The directives require a 64-bit store
// on amd64, arm64 and ppc64le; amd64 rejects shift-right instructions,
// and arm64 and ppc64le reject narrower moves.
func store_le64(b []byte) {
|
|
// amd64:`MOVQ\s.*\(.*\)$`,-`SHR.`
|
|
// arm64:`MOVD`,-`MOV[WBH]`
|
|
// ppc64le:`MOVD\s`,-`MOV[BHW]\s`
|
|
binary.LittleEndian.PutUint64(b, sink64)
|
|
}
|
|
|
|
// store_le64_idx encodes sink64 into b[idx:] in little-endian order.
// The amd64 and arm64 patterns require a 64-bit store whose destination
// is a base-plus-index memory operand; ppc64le only requires a MOVD and
// rejects narrower moves.
func store_le64_idx(b []byte, idx int) {
|
|
// amd64:`MOVQ\s.*\(.*\)\(.*\*1\)$`,-`SHR.`
|
|
// arm64:`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`
|
|
// ppc64le:`MOVD\s`,-`MOV[BHW]\s`
|
|
binary.LittleEndian.PutUint64(b[idx:], sink64)
|
|
}
|
|
|
|
// store_le32 encodes sink32 into b in little-endian order. The
// directives require a 32-bit move on amd64, arm64 and ppc64le; arm64
// also rejects byte and halfword moves.
func store_le32(b []byte) {
|
|
// amd64:`MOVL\s`
|
|
// arm64:`MOVW`,-`MOV[BH]`
|
|
// ppc64le:`MOVW\s`
|
|
binary.LittleEndian.PutUint32(b, sink32)
|
|
}
|
|
|
|
// store_le32_idx encodes sink32 into b[idx:] in little-endian order.
// The arm64 pattern requires a 32-bit store to a base-plus-index memory
// operand and rejects byte and halfword moves; amd64 and ppc64le only
// require a 32-bit move.
func store_le32_idx(b []byte, idx int) {
|
|
// amd64:`MOVL\s`
|
|
// arm64:`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`
|
|
// ppc64le:`MOVW\s`
|
|
binary.LittleEndian.PutUint32(b[idx:], sink32)
|
|
}
|
|
|
|
// store_le16 encodes sink16 into b in little-endian order. The
// directives require a 16-bit move on amd64 and arm64; arm64 also
// rejects byte moves. The ppc64le line is tagged (DISABLED), presumably
// so the harness does not treat it as an active check.
func store_le16(b []byte) {
|
|
// amd64:`MOVW\s`
|
|
// arm64:`MOVH`,-`MOVB`
|
|
// ppc64le(DISABLED):`MOVH\s`
|
|
binary.LittleEndian.PutUint16(b, sink16)
|
|
}
|
|
|
|
// store_le16_idx encodes sink16 into b[idx:] in little-endian order.
// The arm64 pattern requires a halfword store to a base-plus-index
// memory operand and rejects byte moves; amd64 only requires a MOVW.
// The ppc64le line is tagged (DISABLED), presumably so the harness does
// not treat it as an active check.
func store_le16_idx(b []byte, idx int) {
|
|
// amd64:`MOVW\s`
|
|
// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOVB`
|
|
// ppc64le(DISABLED):`MOVH\s`
|
|
binary.LittleEndian.PutUint16(b[idx:], sink16)
|
|
}
|
|
|
|
// store_be64 encodes sink64 into b in big-endian order. The amd64
// pattern requires a BSWAPQ and rejects shift-right instructions; the
// arm64 patterns require a MOVD and a REV, and reject narrower moves and
// the narrower REVW/REV16W byte-reverse instructions.
func store_be64(b []byte) {
|
|
// amd64:`BSWAPQ`,-`SHR.`
|
|
// arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W`
|
|
binary.BigEndian.PutUint64(b, sink64)
|
|
}
|
|
|
|
// store_be64_idx encodes sink64 into b[idx:] in big-endian order. It has
// the same expectations as store_be64, except that the arm64 store
// pattern requires a base-plus-index memory operand as destination.
func store_be64_idx(b []byte, idx int) {
|
|
// amd64:`BSWAPQ`,-`SHR.`
|
|
// arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW`
|
|
binary.BigEndian.PutUint64(b[idx:], sink64)
|
|
}
|
|
|
|
// store_be32 encodes sink32 into b in big-endian order. The amd64
// pattern requires a BSWAPL and rejects shift-right instructions; the
// arm64 patterns require a MOVW and a REVW, and reject byte/halfword
// moves and REV16W.
func store_be32(b []byte) {
|
|
// amd64:`BSWAPL`,-`SHR.`
|
|
// arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W`
|
|
binary.BigEndian.PutUint32(b, sink32)
|
|
}
|
|
|
|
// store_be32_idx encodes sink32 into b[idx:] in big-endian order. It has
// the same expectations as store_be32, except that the arm64 store
// pattern requires a base-plus-index memory operand as destination.
func store_be32_idx(b []byte, idx int) {
|
|
// amd64:`BSWAPL`,-`SHR.`
|
|
// arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W`
|
|
binary.BigEndian.PutUint32(b[idx:], sink32)
|
|
}
|
|
|
|
// store_be16 encodes sink16 into b in big-endian order. The amd64
// pattern requires a ROLW by 8 and rejects shift-right instructions; the
// arm64 patterns require a MOVH and a REV16W, and reject byte moves.
func store_be16(b []byte) {
|
|
// amd64:`ROLW\s\$8`,-`SHR.`
|
|
// arm64:`MOVH`,`REV16W`,-`MOVB`
|
|
binary.BigEndian.PutUint16(b, sink16)
|
|
}
|
|
|
|
// store_be16_idx encodes sink16 into b[idx:] in big-endian order. It has
// the same expectations as store_be16, except that the arm64 store
// pattern requires a base-plus-index memory operand as destination.
func store_be16_idx(b []byte, idx int) {
|
|
// amd64:`ROLW\s\$8`,-`SHR.`
|
|
// arm64:`MOVH\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,`REV16W`,-`MOVB`
|
|
binary.BigEndian.PutUint16(b[idx:], sink16)
|
|
}
|
|
|
|
// ------------- //
|
|
// Zeroing //
|
|
// ------------- //
|
|
|
|
// Check that zero stores are combined into larger stores
|
|
|
|
// zero_byte_2 zeroes the first two bytes of b1 in ascending index order
// and of b2 in descending order. On arm64 each assignment line must
// produce a halfword store of ZR and no byte store.
func zero_byte_2(b1, b2 []byte) {
|
|
// bounds checks to guarantee safety of writes below
|
|
_, _ = b1[1], b2[1]
|
|
b1[0], b1[1] = 0, 0 // arm64:"MOVH\tZR",-"MOVB"
|
|
b2[1], b2[0] = 0, 0 // arm64:"MOVH\tZR",-"MOVB"
|
|
}
|
|
|
|
// zero_byte_4 zeroes the first four bytes of b1 in ascending order and
// of b2 in a permuted order, after indexing element 3 of each slice up
// front. On arm64 each assignment line must produce a word store of ZR
// and no byte or halfword store.
func zero_byte_4(b1, b2 []byte) {
|
|
_, _ = b1[3], b2[3]
|
|
b1[0], b1[1], b1[2], b1[3] = 0, 0, 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
|
|
b2[2], b2[3], b2[1], b2[0] = 0, 0, 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
|
|
}
|
|
|
|
// zero_byte_8 zeroes b[0] through b[7] in two four-element assignments,
// after indexing b[7] up front. The arm64 directive is attached to the
// second assignment and requires a doubleword store of ZR with no byte,
// halfword or word store.
func zero_byte_8(b []byte) {
|
|
_ = b[7]
|
|
b[0], b[1], b[2], b[3] = 0, 0, 0, 0
|
|
b[4], b[5], b[6], b[7] = 0, 0, 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
|
|
}
|
|
|
|
// zero_byte_16 zeroes b[0] through b[15] in four four-element
// assignments, after indexing b[15] up front. The arm64 directive is
// attached to the last assignment and requires an STP with no byte,
// halfword or word store.
func zero_byte_16(b []byte) {
|
|
_ = b[15]
|
|
b[0], b[1], b[2], b[3] = 0, 0, 0, 0
|
|
b[4], b[5], b[6], b[7] = 0, 0, 0, 0
|
|
b[8], b[9], b[10], b[11] = 0, 0, 0, 0
|
|
b[12], b[13], b[14], b[15] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH",-"MOVW"
|
|
}
|
|
|
|
// zero_byte_30 overwrites the 30-byte array *a with its zero value. The
// arm64 directive requires an STP and rejects byte, halfword and word
// stores.
func zero_byte_30(a *[30]byte) {
|
|
*a = [30]byte{} // arm64:"STP",-"MOVB",-"MOVH",-"MOVW"
|
|
}
|
|
|
|
// zero_byte_39 overwrites the 39-byte array *a with its zero value. The
// arm64 directive requires a MOVD and rejects byte, halfword and word
// stores.
func zero_byte_39(a *[39]byte) {
|
|
*a = [39]byte{} // arm64:"MOVD",-"MOVB",-"MOVH",-"MOVW"
|
|
}
|
|
|
|
// zero_uint16_2 zeroes the first two uint16 elements of h1 in ascending
// order and of h2 in descending order, after indexing element 1 of each
// slice up front. On arm64 each assignment line must produce a word
// store of ZR and no byte or halfword store.
func zero_uint16_2(h1, h2 []uint16) {
|
|
_, _ = h1[1], h2[1]
|
|
h1[0], h1[1] = 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
|
|
h2[1], h2[0] = 0, 0 // arm64:"MOVW\tZR",-"MOVB",-"MOVH"
|
|
}
|
|
|
|
// zero_uint16_4 zeroes the first four uint16 elements of h1 in ascending
// order and of h2 in a permuted order, after indexing element 3 of each
// slice up front. On arm64 each assignment line must produce a
// doubleword store of ZR and no byte, halfword or word store.
func zero_uint16_4(h1, h2 []uint16) {
|
|
_, _ = h1[3], h2[3]
|
|
h1[0], h1[1], h1[2], h1[3] = 0, 0, 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
|
|
h2[2], h2[3], h2[1], h2[0] = 0, 0, 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
|
|
}
|
|
|
|
// zero_uint16_8 zeroes h[0] through h[7] in two four-element
// assignments, after indexing h[7] up front. The arm64 directive is
// attached to the second assignment and requires an STP with no byte or
// halfword store.
func zero_uint16_8(h []uint16) {
|
|
_ = h[7]
|
|
h[0], h[1], h[2], h[3] = 0, 0, 0, 0
|
|
h[4], h[5], h[6], h[7] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
|
|
}
|
|
|
|
// zero_uint32_2 zeroes the first two uint32 elements of w1 in ascending
// order and of w2 in descending order, after indexing element 1 of each
// slice up front. On arm64 each assignment line must produce a
// doubleword store of ZR and no byte, halfword or word store.
func zero_uint32_2(w1, w2 []uint32) {
|
|
_, _ = w1[1], w2[1]
|
|
w1[0], w1[1] = 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
|
|
w2[1], w2[0] = 0, 0 // arm64:"MOVD\tZR",-"MOVB",-"MOVH",-"MOVW"
|
|
}
|
|
|
|
// zero_uint32_4 zeroes the first four uint32 elements of w1 in ascending
// order and of w2 in a permuted order, after indexing element 3 of each
// slice up front. On arm64 each assignment line must produce an STP and
// no byte or halfword store.
func zero_uint32_4(w1, w2 []uint32) {
|
|
_, _ = w1[3], w2[3]
|
|
w1[0], w1[1], w1[2], w1[3] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
|
|
w2[2], w2[3], w2[1], w2[0] = 0, 0, 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
|
|
}
|
|
|
|
// zero_uint64_2 zeroes the first two uint64 elements of d1 in ascending
// order and of d2 in descending order, after indexing element 1 of each
// slice up front. On arm64 each assignment line must produce an STP and
// no byte or halfword store.
func zero_uint64_2(d1, d2 []uint64) {
|
|
_, _ = d1[1], d2[1]
|
|
d1[0], d1[1] = 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
|
|
d2[1], d2[0] = 0, 0 // arm64:"STP",-"MOVB",-"MOVH"
|
|
}
|