go/src/bytes/iter.go
tdakkota 35b4fd9f37 bytes, strings: reduce Split{,After}Seq heap allocations
This CL slightly changes flow of splitSeq to help compiler to inline the iterator closure.

goos: linux
goarch: amd64
pkg: strings
cpu: AMD Ryzen 9 5950X 16-Core Processor
                                    │   sec/op    │   sec/op     vs base                │
SplitSeqEmptySeparator-32             3.590m ± 0%   3.430m ± 2%   -4.46% (p=0.000 n=30)
SplitSeqSingleByteSeparator-32        647.0µ ± 0%   656.1µ ± 0%   +1.41% (p=0.000 n=30)
SplitSeqMultiByteSeparator-32         423.9µ ± 1%   384.5µ ± 0%   -9.31% (p=0.000 n=30)
SplitAfterSeqEmptySeparator-32        3.372m ± 4%   3.514m ± 0%   +4.20% (p=0.000 n=30)
SplitAfterSeqSingleByteSeparator-32   648.5µ ± 2%   537.6µ ± 0%  -17.10% (p=0.000 n=30)
SplitAfterSeqMultiByteSeparator-32    423.3µ ± 2%   364.4µ ± 2%  -13.91% (p=0.000 n=30)
geomean                               984.7µ        917.3µ        -6.85%

                                    │    B/op    │   B/op     vs base                     │
SplitSeqEmptySeparator-32             24.00 ± 0%   0.00 ± 0%  -100.00% (p=0.000 n=30)
SplitSeqSingleByteSeparator-32        24.00 ± 0%   0.00 ± 0%  -100.00% (p=0.000 n=30)
SplitSeqMultiByteSeparator-32         24.00 ± 0%   0.00 ± 0%  -100.00% (p=0.000 n=30)
SplitAfterSeqEmptySeparator-32        24.00 ± 0%   0.00 ± 0%  -100.00% (p=0.000 n=30)
SplitAfterSeqSingleByteSeparator-32   24.00 ± 0%   0.00 ± 0%  -100.00% (p=0.000 n=30)
SplitAfterSeqMultiByteSeparator-32    24.00 ± 0%   0.00 ± 0%  -100.00% (p=0.000 n=30)
geomean                               24.00                   ?

For #73524

Change-Id: Ic83c5751a41c65030356a208e4ad1f500723e695
Reviewed-on: https://go-review.googlesource.com/c/go/+/669735
Auto-Submit: Alan Donovan <adonovan@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: qiu laidongfeng2 <2645477756@qq.com>
Commit-Queue: Alan Donovan <adonovan@google.com>
2025-05-05 19:08:23 -07:00

147 lines
3.7 KiB
Go

// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytes
import (
"iter"
"unicode"
"unicode/utf8"
)
// Lines returns an iterator over the newline-terminated lines in the byte slice s.
// The lines yielded by the iterator include their terminating newlines.
// If s is empty, the iterator yields no lines at all.
// If s does not end in a newline, the final yielded line will not end in a newline.
// It returns a single-use iterator.
func Lines(s []byte) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
for len(s) > 0 {
var line []byte
if i := IndexByte(s, '\n'); i >= 0 {
line, s = s[:i+1], s[i+1:]
} else {
line, s = s, nil
}
if !yield(line[:len(line):len(line)]) {
return
}
}
}
}
// explodeSeq returns an iterator over the runes in s.
func explodeSeq(s []byte, yield func([]byte) bool) {
for len(s) > 0 {
_, size := utf8.DecodeRune(s)
if !yield(s[:size:size]) {
return
}
s = s[size:]
}
}
// splitSeq is SplitSeq or SplitAfterSeq, configured by how many
// bytes of sep to include in the results (none or all).
func splitSeq(s, sep []byte, sepSave int) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
if len(sep) == 0 {
explodeSeq(s, yield)
return
}
for {
i := Index(s, sep)
if i < 0 {
break
}
frag := s[:i+sepSave]
if !yield(frag[:len(frag):len(frag)]) {
return
}
s = s[i+len(sep):]
}
yield(s[:len(s):len(s)])
}
}
// SplitSeq returns an iterator over all subslices of s separated by sep.
// The iterator yields the same subslices that would be returned by [Split](s, sep),
// but without constructing a new slice containing the subslices.
// It returns a single-use iterator.
func SplitSeq(s, sep []byte) iter.Seq[[]byte] {
return splitSeq(s, sep, 0)
}
// SplitAfterSeq returns an iterator over subslices of s split after each instance of sep.
// The iterator yields the same subslices that would be returned by [SplitAfter](s, sep),
// but without constructing a new slice containing the subslices.
// It returns a single-use iterator.
func SplitAfterSeq(s, sep []byte) iter.Seq[[]byte] {
return splitSeq(s, sep, len(sep))
}
// FieldsSeq returns an iterator over subslices of s split around runs of
// whitespace characters, as defined by [unicode.IsSpace].
// The iterator yields the same subslices that would be returned by [Fields](s),
// but without constructing a new slice containing the subslices.
func FieldsSeq(s []byte) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
start := -1
for i := 0; i < len(s); {
size := 1
r := rune(s[i])
isSpace := asciiSpace[s[i]] != 0
if r >= utf8.RuneSelf {
r, size = utf8.DecodeRune(s[i:])
isSpace = unicode.IsSpace(r)
}
if isSpace {
if start >= 0 {
if !yield(s[start:i:i]) {
return
}
start = -1
}
} else if start < 0 {
start = i
}
i += size
}
if start >= 0 {
yield(s[start:len(s):len(s)])
}
}
}
// FieldsFuncSeq returns an iterator over subslices of s split around runs of
// Unicode code points satisfying f(c).
// The iterator yields the same subslices that would be returned by [FieldsFunc](s),
// but without constructing a new slice containing the subslices.
func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
start := -1
for i := 0; i < len(s); {
size := 1
r := rune(s[i])
if r >= utf8.RuneSelf {
r, size = utf8.DecodeRune(s[i:])
}
if f(r) {
if start >= 0 {
if !yield(s[start:i:i]) {
return
}
start = -1
}
} else if start < 0 {
start = i
}
i += size
}
if start >= 0 {
yield(s[start:len(s):len(s)])
}
}
}