From 40e629beb103ed3669a1b0765a12bf8292610132 Mon Sep 17 00:00:00 2001
From: alex-tdrn <alex-tdrn@protonmail.com>
Date: Sat, 29 Jun 2024 23:12:17 +0200
Subject: [PATCH] Fix multibyte codepoint handling in `detect columns --guess`
 (#13272)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!--
if this PR closes one or more issues, you can automatically link the PR
with
them by using one of the [*linking
keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword),
e.g.
- this PR should close #xxxx
- fixes #xxxx

you can also mention related issues, PRs or discussions!
-->

# Description
<!--
Thank you for improving Nushell. Please, check our [contributing
guide](../CONTRIBUTING.md) and talk to the core team before making major
changes.

Description of your pull request goes here. **Provide examples and/or
screenshots** if your changes affect the user experience.
-->
This PR fixes #13269. The splitting code in `guess_width.rs` was
creating slices from char indices, instead of byte indices. This works
perfectly fine for 1-byte code points, but panics or returns wrong
results as soon as multibyte codepoints appear in the input. I
originally discovered this by piping `winget list` into `detect columns
--guess`, since winget sometimes uses the unicode ellipsis symbol (`…`)
which is 3 bytes long when encoded in utf-8.

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
`detect columns --guess` should not crash due to multibyte unicode input
anymore

before:

![image](https://github.com/nushell/nushell/assets/20356389/833cd732-be3b-4158-97f7-0ca2616ce23f)

after:

![image](https://github.com/nushell/nushell/assets/20356389/15358b40-4083-4a33-9f2c-87e63f39d985)


# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
- Added tests to `guess_width.rs` for testing handling of multibyte as
well as combining diacritical marks

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
---
 crates/nu-command/src/strings/guess_width.rs | 60 ++++++++++++++++----
 1 file changed, 50 insertions(+), 10 deletions(-)
diff --git a/crates/nu-command/src/strings/guess_width.rs b/crates/nu-command/src/strings/guess_width.rs
index 59cfbcb2cf..b214b21bcc 100644
--- a/crates/nu-command/src/strings/guess_width.rs
+++ b/crates/nu-command/src/strings/guess_width.rs
@@ -175,34 +175,34 @@ fn separator_position(lr: &[char], p: usize, pos: &[usize], n: usize) -> usize {
 
 fn split(line: &str, pos: &[usize], trim_space: bool) -> Vec<String> {
     let mut n = 0;
-    let mut start = 0;
+    let mut start_char = 0;
     let mut columns = Vec::with_capacity(pos.len() + 1);
-    let lr: Vec<char> = line.chars().collect();
+    let (line_char_boundaries, line_chars): (Vec<usize>, Vec<char>) = line.char_indices().unzip();
     let mut w = 0;
 
-    for p in 0..lr.len() {
+    for p in 0..line_char_boundaries.len() {
         if pos.is_empty() || n > pos.len() - 1 {
-            start = p;
+            start_char = p;
             break;
         }
 
         if pos[n] <= w {
-            let end = separator_position(&lr, p, pos, n);
-            if start > end {
+            let end_char = separator_position(&line_chars, p, pos, n);
+            if start_char > end_char {
                 break;
             }
-            let col = &line[start..end];
+            let col = &line[line_char_boundaries[start_char]..line_char_boundaries[end_char]];
             let col = if trim_space { col.trim() } else { col };
             columns.push(col.to_string());
             n += 1;
-            start = end;
+            start_char = end_char;
         }
 
-        w += UnicodeWidthStr::width(lr[p].to_string().as_str());
+        w += UnicodeWidthStr::width(line_chars[p].to_string().as_str());
     }
 
     // add last part.
-    let col = &line[start..];
+    let col = &line[line_char_boundaries[start_char]..];
     let col = if trim_space { col.trim() } else { col };
     columns.push(col.to_string());
     columns
@@ -423,6 +423,46 @@ D:             104792064  17042676  87749388  17% /d";
         assert_eq!(got, want);
     }
 
+    #[test]
+    fn test_guess_width_multibyte() {
+        let input = "A… B\nC… D";
+        let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
+        let reader = std::io::BufReader::new(r);
+
+        let mut guess_width = GuessWidth {
+            reader,
+            pos: Vec::new(),
+            pre_lines: Vec::new(),
+            pre_count: 0,
+            limit_split: 0,
+        };
+
+        let want = vec![vec!["A…", "B"], vec!["C…", "D"]];
+        let got = guess_width.read_all();
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn test_guess_width_combining_diacritical_marks() {
+        let input = "Name        Surname
+Ștefan         Țincu ";
+
+        let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
+        let reader = std::io::BufReader::new(r);
+
+        let mut guess_width = GuessWidth {
+            reader,
+            pos: Vec::new(),
+            pre_lines: Vec::new(),
+            pre_count: 0,
+            limit_split: 0,
+        };
+
+        let want = vec![vec!["Name", "Surname"], vec!["Ștefan", "Țincu"]];
+        let got = guess_width.read_all();
+        assert_eq!(got, want);
+    }
+
     #[test]
     fn test_to_table() {
         let lines = vec![