From df87d90b8c121e1893e3ed103cdd77f6cfddb47c Mon Sep 17 00:00:00 2001 From: JT <547158+jntrnr@users.noreply.github.com> Date: Tue, 16 Nov 2021 11:29:54 +1300 Subject: [PATCH] Add 'detect columns' command (#4127) * Add 'detect columns' command * Fix warnings --- Cargo.lock | 165 +++------- .../src/commands/strings/detect/columns.rs | 283 ++++++++++++++++++ .../src/commands/strings/detect/mod.rs | 3 + crates/nu-command/src/commands/strings/mod.rs | 2 + crates/nu-command/src/default_context.rs | 1 + 5 files changed, 326 insertions(+), 128 deletions(-) create mode 100644 crates/nu-command/src/commands/strings/detect/columns.rs create mode 100644 crates/nu-command/src/commands/strings/detect/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 6f353e1211..ebcdbe0e95 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,7 +109,7 @@ checksum = "47044a1809e2953fe6d084312b81dcb7d9ffc24fee45aa39e5b938f66f75b8a8" dependencies = [ "clipboard-win", "core-graphics", - "image 0.23.14", + "image", "log", "objc", "objc-foundation", @@ -168,18 +168,6 @@ dependencies = [ "strength_reduce", ] -[[package]] -name = "as-slice" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45403b49e3954a4b8428a0ac21a4b7afadccf92bfd96273f1a58cd4812496ae0" -dependencies = [ - "generic-array 0.12.4", - "generic-array 0.13.3", - "generic-array 0.14.4", - "stable_deref_trait", -] - [[package]] name = "async-stream" version = "0.3.2" @@ -355,7 +343,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" dependencies = [ - "generic-array 0.14.4", + "generic-array", ] [[package]] @@ -381,21 +369,21 @@ dependencies = [ [[package]] name = "bson" -version = "0.14.1" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c177ed0122f24ce5e0f05bf9b29e79f3ac1a359bc504e0e14c3b34896c71c00" +checksum = 
"ff58d466782b57e0001c8e97c6a70c01c2359d7e13e257a83654c0b783ecc139" dependencies = [ - "byteorder", + "ahash", + "base64", "chrono", - "decimal", - "hex 0.3.2", - "libc", - "linked-hash-map", - "md5 0.6.1", - "rand 0.7.3", + "hex", + "indexmap", + "lazy_static", + "rand 0.8.4", "serde", + "serde_bytes", "serde_json", - "time", + "uuid", ] [[package]] @@ -800,7 +788,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714" dependencies = [ - "generic-array 0.14.4", + "generic-array", "subtle", ] @@ -879,27 +867,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7313c0d620d0cb4dbd9d019e461a4beb501071ff46ec0ab933efb4daa76d73e3" -[[package]] -name = "decimal" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a8ab77e91baeb15034c3be91e87bff4665c9036216148e4996d9a9f5792114d" -dependencies = [ - "bitflags", - "cc", - "libc", -] - -[[package]] -name = "deflate" -version = "0.7.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707b6a7b384888a70c8d2e8650b3e60170dfc6a67bb4aa67b6dfca57af4bedb4" -dependencies = [ - "adler32", - "byteorder", -] - [[package]] name = "deflate" version = "0.8.6" @@ -940,7 +907,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" dependencies = [ - "generic-array 0.14.4", + "generic-array", ] [[package]] @@ -1456,24 +1423,6 @@ version = "0.3.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" -[[package]] -name = "generic-array" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" -dependencies = [ - 
"typenum", -] - -[[package]] -name = "generic-array" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f797e67af32588215eaaab8327027ee8e71b9dd0b2b26996aedf20c030fce309" -dependencies = [ - "typenum", -] - [[package]] name = "generic-array" version = "0.14.4" @@ -1651,9 +1600,9 @@ dependencies = [ [[package]] name = "hash32" -version = "0.1.1" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4041af86e63ac4298ce40e5cca669066e75b6f1aa3390fe2561ffa5e1d9f4cc" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" dependencies = [ "byteorder", ] @@ -1685,13 +1634,12 @@ dependencies = [ [[package]] name = "heapless" -version = "0.6.1" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634bd4d29cbf24424d0a4bfcbf80c6960129dc24424752a7d1d1390607023422" +checksum = "9c1ad878e07405df82b695089e63d278244344f80e764074d0bdfe99b89460f3" dependencies = [ - "as-slice", - "generic-array 0.14.4", "hash32", + "spin", "stable_deref_trait", ] @@ -1704,12 +1652,6 @@ dependencies = [ "libc", ] -[[package]] -name = "hex" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "805026a5d0141ffc30abb3be3173848ad46a1b1664fe632428479619a3644d77" - [[package]] name = "hex" version = "0.4.3" @@ -1855,20 +1797,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "image" -version = "0.22.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ed2ada878397b045454ac7cfb011d73132c59f31a955d230bd1f1c2e68eb4a" -dependencies = [ - "byteorder", - "jpeg-decoder", - "num-iter", - "num-rational 0.2.4", - "num-traits", - "png 0.15.3", -] - [[package]] name = "image" version = "0.23.14" @@ -1878,10 +1806,11 @@ dependencies = [ "bytemuck", "byteorder", "color_quant", + "jpeg-decoder", "num-iter", "num-rational 0.3.2", "num-traits", - "png 0.16.8", + "png", 
"tiff", ] @@ -1896,15 +1825,6 @@ dependencies = [ "serde", ] -[[package]] -name = "inflate" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb29978cc5797bd8dcc8e5bf7de604891df2a8dc576973d71a281e916db2ff" -dependencies = [ - "adler32", -] - [[package]] name = "insta" version = "1.7.2" @@ -2237,12 +2157,6 @@ dependencies = [ "opaque-debug", ] -[[package]] -name = "md5" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6bcd6433cff03a4bfc3d9834d504467db1f1cf6d0ea765d37d330249ed629d" - [[package]] name = "md5" version = "0.7.0" @@ -2342,9 +2256,9 @@ dependencies = [ [[package]] name = "mp4" -version = "0.8.3" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51eb18a88129198ca1e8e92f26038ed6814cd0e608fa43215bf57368604bf093" +checksum = "85660d4d88b9318d95396943adc4a254b3ed8bf1de917e6f093abda1ccf0bec0" dependencies = [ "byteorder", "bytes 0.5.6", @@ -2679,16 +2593,14 @@ dependencies = [ "sha2", "sys-locale", "toml", - "users", ] [[package]] name = "nu-engine" version = "0.39.0" dependencies = [ - "ansi_term 0.12.1", "bigdecimal-rs", - "bytes 0.5.6", + "bytes 1.1.0", "chrono", "codespan-reporting", "derive-new", @@ -2911,7 +2823,7 @@ name = "nu_plugin_binaryview" version = "0.39.0" dependencies = [ "crossterm", - "image 0.22.5", + "image", "neso", "nu-ansi-term", "nu-errors", @@ -3075,7 +2987,7 @@ dependencies = [ name = "nu_plugin_to_sqlite" version = "0.39.0" dependencies = [ - "hex 0.4.3", + "hex", "nu-errors", "nu-plugin", "nu-protocol", @@ -3597,18 +3509,6 @@ dependencies = [ "xml-rs", ] -[[package]] -name = "png" -version = "0.15.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef859a23054bbfee7811284275ae522f0434a3c8e7f4b74bd4a35ae7e1c4a283" -dependencies = [ - "bitflags", - "crc32fast", - "deflate 0.7.20", - "inflate", -] - [[package]] name = "png" version = "0.16.8" @@ -3617,7 +3517,7 
@@ checksum = "3c3287920cb847dee3de33d301c463fba14dda99db24214ddf93f83d3021f4c6" dependencies = [ "bitflags", "crc32fast", - "deflate 0.8.6", + "deflate", "miniz_oxide 0.3.7", ] @@ -4351,7 +4251,7 @@ dependencies = [ "hmac", "hmac-sha1", "log", - "md5 0.7.0", + "md5", "mime_guess", "quick-xml 0.22.0", "regex", @@ -4703,6 +4603,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "spin" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "511254be0c5bcf062b019a6c89c01a664aa359ded62f78aa72c6fc137c0590e5" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" diff --git a/crates/nu-command/src/commands/strings/detect/columns.rs b/crates/nu-command/src/commands/strings/detect/columns.rs new file mode 100644 index 0000000000..b2a1a61eb2 --- /dev/null +++ b/crates/nu-command/src/commands/strings/detect/columns.rs @@ -0,0 +1,283 @@ +use std::{iter::Peekable, str::CharIndices}; + +use crate::prelude::*; +use nu_engine::WholeStreamCommand; +use nu_errors::ShellError; +use nu_protocol::{Signature, SyntaxShape, TaggedDictBuilder, UntaggedValue}; +use nu_source::Spanned; + +type Input<'t> = Peekable>; + +pub struct DetectColumns; + +impl WholeStreamCommand for DetectColumns { + fn name(&self) -> &str { + "detect columns" + } + + fn signature(&self) -> Signature { + Signature::build("detect columns") + .named( + "skip", + SyntaxShape::Int, + "number of rows to skip before detecting", + Some('s'), + ) + .switch("no_headers", "don't detect headers", Some('n')) + } + + fn usage(&self) -> &str { + "splits contents across multiple columns via the separator." 
+ } + + fn run(&self, args: CommandArgs) -> Result { + detect_columns(args) + } +} + +fn detect_columns(args: CommandArgs) -> Result { + let name_tag = args.name_tag(); + let num_rows_to_skip: Option = args.get_flag("skip")?; + let noheader = args.has_flag("no_headers"); + let input = args.input.collect_string(name_tag.clone())?; + + let input: Vec<_> = input + .lines() + .skip(num_rows_to_skip.unwrap_or_default()) + .map(|x| x.to_string()) + .collect(); + + let mut input = input.into_iter(); + let headers = input.next(); + + if let Some(orig_headers) = headers { + let headers = find_columns(&orig_headers); + + Ok((if noheader { + vec![orig_headers].into_iter().chain(input) + } else { + vec![].into_iter().chain(input) + }) + .map(move |x| { + let row = find_columns(&x); + + let mut dict = TaggedDictBuilder::new(name_tag.clone()); + + if headers.len() == row.len() && !noheader { + for (header, val) in headers.iter().zip(row.iter()) { + dict.insert_untagged(&header.item, UntaggedValue::string(&val.item)); + } + } else { + let mut pre_output = vec![]; + + // column counts don't line up, so see if we can figure out why + for cell in row { + for header in &headers { + if cell.span.start() <= header.span.end() + && cell.span.end() > header.span.start() + { + pre_output + .push((header.item.to_string(), UntaggedValue::string(&cell.item))); + } + } + } + + for header in &headers { + let mut found = false; + for pre_o in &pre_output { + if pre_o.0 == header.item { + found = true; + break; + } + } + + if !found { + pre_output.push((header.item.to_string(), UntaggedValue::nothing())); + } + } + + if noheader { + for header in headers.iter().enumerate() { + for pre_o in &pre_output { + if pre_o.0 == header.1.item { + dict.insert_untagged(format!("Column{}", header.0), pre_o.1.clone()) + } + } + } + } else { + for header in &headers { + for pre_o in &pre_output { + if pre_o.0 == header.item { + dict.insert_untagged(&header.item, pre_o.1.clone()) + } + } + } + } + } + + 
dict.into_value() + }) + .into_output_stream()) + } else { + Ok(OutputStream::empty()) + } +} + +pub fn find_columns(input: &str) -> Vec> { + let mut chars = input.char_indices().peekable(); + let mut output = vec![]; + + while let Some((_, c)) = chars.peek() { + if c.is_whitespace() { + // If the next character is whitespace, skip it. + + let _ = chars.next(); + } else { + // Otherwise, try to consume an unclassified token. + + let result = baseline(&mut chars); + + output.push(result); + } + } + + output +} + +#[derive(Clone, Copy)] +enum BlockKind { + Paren, + CurlyBracket, + SquareBracket, +} + +fn baseline(src: &mut Input) -> Spanned { + let mut token_contents = String::new(); + + let start_offset = if let Some((pos, _)) = src.peek() { + *pos + } else { + 0 + }; + + // This variable tracks the starting character of a string literal, so that + // we remain inside the string literal lexer mode until we encounter the + // closing quote. + let mut quote_start: Option = None; + + // This Vec tracks paired delimiters + let mut block_level: Vec = vec![]; + + // A baseline token is terminated if it's not nested inside of a paired + // delimiter and the next character is whitespace. Characters such as + // `|`, `;` and `#` do not terminate a token. + fn is_termination(block_level: &[BlockKind], c: char) -> bool { + block_level.is_empty() && (c.is_whitespace()) + } + + // The process of slurping up a baseline token repeats: + // + // - String literal, which begins with `'`, `"` or `\``, and continues until + // the same character is encountered again. + // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until + // the matching closing delimiter is found, skipping over string + // literals. + // - When not nested inside of a delimiter pair, when a terminating + // character (any whitespace) is encountered, the baseline + // token is done. + // - Otherwise, accumulate the character into the current baseline token. 
+ while let Some((_, c)) = src.peek() { + let c = *c; + + if quote_start.is_some() { + // If we encountered the closing quote character for the current + // string, we're done with the current string. + if Some(c) == quote_start { + quote_start = None; + } + } else if c == '\n' { + if is_termination(&block_level, c) { + break; + } + } else if c == '\'' || c == '"' || c == '`' { + // We encountered the opening quote of a string literal. + quote_start = Some(c); + } else if c == '[' { + // We encountered an opening `[` delimiter. + block_level.push(BlockKind::SquareBracket); + } else if c == ']' { + // We encountered a closing `]` delimiter. Pop off the opening `[` + // delimiter. + if let Some(BlockKind::SquareBracket) = block_level.last() { + let _ = block_level.pop(); + } + } else if c == '{' { + // We encountered an opening `{` delimiter. + block_level.push(BlockKind::CurlyBracket); + } else if c == '}' { + // We encountered a closing `}` delimiter. Pop off the opening `{`. + if let Some(BlockKind::CurlyBracket) = block_level.last() { + let _ = block_level.pop(); + } + } else if c == '(' { + // We encountered an opening `(` delimiter. + block_level.push(BlockKind::Paren); + } else if c == ')' { + // We encountered a closing `)` delimiter. Pop off the opening `(`. + if let Some(BlockKind::Paren) = block_level.last() { + let _ = block_level.pop(); + } + } else if is_termination(&block_level, c) { + break; + } + + // Otherwise, accumulate the character into the current token. + token_contents.push(c); + + // Consume the character. + let _ = src.next(); + } + + let span = Span::new(start_offset, start_offset + token_contents.len()); + + // If there are still unclosed opening delimiters, the token is returned + // as accumulated; no synthetic closing characters are added. 
+ if block_level.last().is_some() { + // let delim: char = (*block).closing(); + // let cause = ParseError::unexpected_eof(delim.to_string(), span); + + // while let Some(bk) = block_level.pop() { + // token_contents.push(bk.closing()); + // } + + return token_contents.spanned(span); + } + + if quote_start.is_some() { + // The non-lite parse trims quotes on both sides, so we add the expected quote so that + // anyone wanting to consume this partial parse (e.g., completions) will be able to get + // correct information from the non-lite parse. + // token_contents.push(delimiter); + + // return ( + // token_contents.spanned(span), + // Some(ParseError::unexpected_eof(delimiter.to_string(), span)), + // ); + return token_contents.spanned(span); + } + + token_contents.spanned(span) +} + +#[cfg(test)] +mod tests { + use super::DetectColumns; + use super::ShellError; + + #[test] + fn examples_work_as_expected() -> Result<(), ShellError> { + use crate::examples::test as test_examples; + + test_examples(DetectColumns {}) + } +} diff --git a/crates/nu-command/src/commands/strings/detect/mod.rs b/crates/nu-command/src/commands/strings/detect/mod.rs new file mode 100644 index 0000000000..5c56257ba1 --- /dev/null +++ b/crates/nu-command/src/commands/strings/detect/mod.rs @@ -0,0 +1,3 @@ +pub mod columns; + +pub use columns::DetectColumns; diff --git a/crates/nu-command/src/commands/strings/mod.rs b/crates/nu-command/src/commands/strings/mod.rs index bdeff53d83..b92f5e2541 100644 --- a/crates/nu-command/src/commands/strings/mod.rs +++ b/crates/nu-command/src/commands/strings/mod.rs @@ -1,5 +1,6 @@ mod build_string; mod char_; +mod detect; mod format; mod lines; mod parse; @@ -10,6 +11,7 @@ mod str_; pub use build_string::BuildString; pub use char_::Char; +pub use detect::DetectColumns; pub use format::*; pub use lines::Lines; pub use parse::*; diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs index 3cd3d1415e..7d814f6162 100644 --- 
a/crates/nu-command/src/default_context.rs +++ b/crates/nu-command/src/default_context.rs @@ -127,6 +127,7 @@ pub fn create_default_context(interactive: bool) -> Result