From 40e629beb103ed3669a1b0765a12bf8292610132 Mon Sep 17 00:00:00 2001 From: alex-tdrn Date: Sat, 29 Jun 2024 23:12:17 +0200 Subject: [PATCH] Fix multibyte codepoint handling in `detect columns --guess` (#13272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR fixes #13269. The splitting code in `guess_width.rs` was creating slices from char indices instead of byte indices. This works perfectly fine for 1-byte code points, but panics or returns wrong results as soon as multibyte code points appear in the input. I originally discovered this by piping `winget list` into `detect columns --guess`, since winget sometimes uses the Unicode ellipsis symbol (`…`), which is 3 bytes long when encoded in UTF-8. # User-Facing Changes `detect columns --guess` should no longer crash on multibyte Unicode input. Before: ![image](https://github.com/nushell/nushell/assets/20356389/833cd732-be3b-4158-97f7-0ca2616ce23f) After: ![image](https://github.com/nushell/nushell/assets/20356389/15358b40-4083-4a33-9f2c-87e63f39d985) # Tests + Formatting - Added tests to `guess_width.rs` covering the handling of multibyte code points as well as combining diacritical marks # After Submitting --- crates/nu-command/src/strings/guess_width.rs | 60 ++++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/crates/nu-command/src/strings/guess_width.rs b/crates/nu-command/src/strings/guess_width.rs index 59cfbcb2cf..b214b21bcc 100644 --- a/crates/nu-command/src/strings/guess_width.rs +++ b/crates/nu-command/src/strings/guess_width.rs @@ -175,34 +175,34 @@ fn separator_position(lr: &[char], p: usize, pos: &[usize], n: usize) -> usize { fn split(line: &str, pos: &[usize], trim_space: bool) -> Vec { let mut n = 0; - let mut start = 0; + let mut start_char = 0; let mut columns = Vec::with_capacity(pos.len() + 1); - let lr: Vec = line.chars().collect(); + let (line_char_boundaries, line_chars): (Vec, Vec) = 
line.char_indices().unzip(); let mut w = 0; - for p in 0..lr.len() { + for p in 0..line_char_boundaries.len() { if pos.is_empty() || n > pos.len() - 1 { - start = p; + start_char = p; break; } if pos[n] <= w { - let end = separator_position(&lr, p, pos, n); - if start > end { + let end_char = separator_position(&line_chars, p, pos, n); + if start_char > end_char { break; } - let col = &line[start..end]; + let col = &line[line_char_boundaries[start_char]..line_char_boundaries[end_char]]; let col = if trim_space { col.trim() } else { col }; columns.push(col.to_string()); n += 1; - start = end; + start_char = end_char; } - w += UnicodeWidthStr::width(lr[p].to_string().as_str()); + w += UnicodeWidthStr::width(line_chars[p].to_string().as_str()); } // add last part. - let col = &line[start..]; + let col = &line[line_char_boundaries[start_char]..]; let col = if trim_space { col.trim() } else { col }; columns.push(col.to_string()); columns @@ -423,6 +423,46 @@ D: 104792064 17042676 87749388 17% /d"; assert_eq!(got, want); } + #[test] + fn test_guess_width_multibyte() { + let input = "A… B\nC… D"; + let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box; + let reader = std::io::BufReader::new(r); + + let mut guess_width = GuessWidth { + reader, + pos: Vec::new(), + pre_lines: Vec::new(), + pre_count: 0, + limit_split: 0, + }; + + let want = vec![vec!["A…", "B"], vec!["C…", "D"]]; + let got = guess_width.read_all(); + assert_eq!(got, want); + } + + #[test] + fn test_guess_width_combining_diacritical_marks() { + let input = "Name Surname +Ștefan Țincu "; + + let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box; + let reader = std::io::BufReader::new(r); + + let mut guess_width = GuessWidth { + reader, + pos: Vec::new(), + pre_lines: Vec::new(), + pre_count: 0, + limit_split: 0, + }; + + let want = vec![vec!["Name", "Surname"], vec!["Ștefan", "Țincu"]]; + let got = guess_width.read_all(); + assert_eq!(got, want); + } + #[test] fn test_to_table() { 
let lines = vec![