From 3c023f75a62f903273c688432f95e77fc945b6fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=B6hrmann?= Date: Tue, 28 Feb 2017 21:21:45 +0100 Subject: [PATCH] strings: fix handling of invalid UTF-8 sequences in Map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new Map implementation introduced in golang.org/cl/33201 did not differentiate if an invalid UTF-8 sequence was decoded or the RuneError rune. It would therefore always advance by 3 bytes (which is the length of the RuneError rune) instead of 1 for an invalid sequences. This cl adds a check to correctly determine the length of bytes needed to advance to the next rune. Fixes #19330. Change-Id: I1e7f9333f3ef6068ffc64015bb0a9f32b0b7111d Reviewed-on: https://go-review.googlesource.com/37597 Run-TryBot: Martin Möhrmann TryBot-Result: Gobot Gobot Reviewed-by: Joe Tsai Reviewed-by: Brad Fitzpatrick --- src/strings/strings.go | 12 +++++++++++- src/strings/strings_test.go | 13 +++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/strings/strings.go b/src/strings/strings.go index 188d8cbc09..9ca222fdfa 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -406,7 +406,17 @@ func Map(mapping func(rune) rune, s string) string { nbytes += utf8.EncodeRune(b[nbytes:], r) } } - i += utf8.RuneLen(c) + + if c == utf8.RuneError { + // RuneError is the result of either decoding + // an invalid sequence or '\uFFFD'. Determine + // the correct number of bytes we need to advance. + _, w := utf8.DecodeRuneInString(s[i:]) + i += w + } else { + i += utf8.RuneLen(c) + } + s = s[i:] break } diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go index 3378d54fe2..97041eb9ac 100644 --- a/src/strings/strings_test.go +++ b/src/strings/strings_test.go @@ -625,6 +625,19 @@ func TestMap(t *testing.T) { (*reflect.StringHeader)(unsafe.Pointer(&m)).Data { t.Error("unexpected copy during identity map") } + + // 7. Handle invalid UTF-8 sequence + replaceNotLatin := func(r rune) rune { + if unicode.Is(unicode.Latin, r) { + return r + } + return '?' + } + m = Map(replaceNotLatin, "Hello\255World") + expect = "Hello?World" + if m != expect { + t.Errorf("replace invalid sequence: expected %q got %q", expect, m) + } } func TestToUpper(t *testing.T) { runStringTests(t, ToUpper, "ToUpper", upperTests) }