mirror of
https://github.com/ducaale/xh.git
synced 2025-05-05 15:32:50 +00:00
Merge pull request #416 from zuisong/parse-filename
download file parse filename add RFC 5987 support
This commit is contained in:
commit
55b920e5ec
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -1841,6 +1841,15 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sanitize-filename"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc984f4f9ceb736a7bb755c3e3bd17dc56370af2600c9780dcc48c66453da34d"
|
||||
dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.27"
|
||||
@ -2813,6 +2822,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"os_display",
|
||||
"pem",
|
||||
"percent-encoding",
|
||||
"predicates",
|
||||
"rand",
|
||||
"regex-lite",
|
||||
@ -2822,6 +2832,7 @@ dependencies = [
|
||||
"rpassword",
|
||||
"rustls",
|
||||
"ruzstd",
|
||||
"sanitize-filename",
|
||||
"serde",
|
||||
"serde-transcode",
|
||||
"serde_json",
|
||||
|
@ -60,6 +60,8 @@ log = "0.4.21"
|
||||
rustls = { version = "0.23.25", optional = true, default-features = false, features = ["logging"] }
|
||||
tracing = { version = "0.1.41", default-features = false, features = ["log"] }
|
||||
reqwest_cookie_store = { version = "0.8.0", features = ["serde"] }
|
||||
percent-encoding = "2.3.1"
|
||||
sanitize-filename = "0.6.0"
|
||||
|
||||
[dependencies.reqwest]
|
||||
version = "0.12.3"
|
||||
|
186
src/content_disposition.rs
Normal file
186
src/content_disposition.rs
Normal file
@ -0,0 +1,186 @@
|
||||
use percent_encoding::percent_decode_str;
|
||||
|
||||
/// Parse filename from Content-Disposition header
|
||||
/// Prioritizes filename* parameter if present, otherwise uses filename parameter
|
||||
pub fn parse_filename_from_content_disposition(content_disposition: &str) -> Option<String> {
|
||||
let parts: Vec<&str> = content_disposition
|
||||
.split(';')
|
||||
.map(|part| part.trim())
|
||||
.collect();
|
||||
|
||||
// First try to find filename* parameter
|
||||
for part in parts.iter() {
|
||||
if let Some(value) = part.strip_prefix("filename*=") {
|
||||
if let Some(filename) = parse_encoded_filename(value) {
|
||||
return Some(filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If filename* is not found or parsing failed, try regular filename parameter
|
||||
for part in parts {
|
||||
if let Some(value) = part.strip_prefix("filename=") {
|
||||
return parse_regular_filename(value);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Extracts the value of a plain `filename` parameter.
///
/// A single pair of surrounding double quotes is removed when both are present;
/// otherwise the value is taken as-is. An empty result yields `None`.
fn parse_regular_filename(filename: &str) -> Option<String> {
    // Backslash escapes inside a quoted value (e.g. `filename="a \"b\".txt"`)
    // are deliberately not interpreted: they are rare in practice, and names
    // with special characters are expected to arrive via `filename*` instead.
    let unquoted = filename
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(filename);

    (!unquoted.is_empty()).then(|| unquoted.to_string())
}
|
||||
|
||||
/// Parse RFC 5987 encoded filename (filename*)
|
||||
/// Format: charset'language'encoded-value
|
||||
fn parse_encoded_filename(content: &str) -> Option<String> {
|
||||
// Remove "filename*=" prefix
|
||||
|
||||
// According to RFC 5987, format should be: charset'language'encoded-value
|
||||
let parts: Vec<&str> = content.splitn(3, '\'').collect();
|
||||
if parts.len() != 3 {
|
||||
return None;
|
||||
}
|
||||
let charset = parts[0];
|
||||
let encoded_filename = parts[2];
|
||||
|
||||
// Percent-decode the encoded filename into bytes.
|
||||
let decoded_bytes = percent_decode_str(encoded_filename).collect::<Vec<u8>>();
|
||||
|
||||
if charset.eq_ignore_ascii_case("UTF-8") {
|
||||
if let Ok(decoded_str) = String::from_utf8(decoded_bytes) {
|
||||
return Some(decoded_str);
|
||||
}
|
||||
} else if charset.eq_ignore_ascii_case("ISO-8859-1") {
|
||||
// RFC 5987 says to use ISO/IEC 8859-1:1998.
|
||||
// But Firefox and Chromium decode %99 as ™ so they're actually using
|
||||
// Windows-1252. This mixup is common on the web.
|
||||
// This affects the 0x80-0x9F range. According to ISO 8859-1 those are
|
||||
// control characters. According to Windows-1252 most of them are
|
||||
// printable characters.
|
||||
// They agree on all the other characters, and filenames shouldn't have
|
||||
// control characters, so Windows-1252 makes sense.
|
||||
if let Some(decoded_str) = encoding_rs::WINDOWS_1252
|
||||
.decode_without_bom_handling_and_without_replacement(&decoded_bytes)
|
||||
{
|
||||
return Some(decoded_str.into_owned());
|
||||
}
|
||||
} else {
|
||||
// Unknown charset. As a fallback, try interpreting as UTF-8.
|
||||
// Firefox also does this.
|
||||
// Chromium makes up its own filename. (Even if `filename=` is present.)
|
||||
if let Ok(decoded_str) = String::from_utf8(decoded_bytes) {
|
||||
return Some(decoded_str);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `header` and compares the outcome with `expected`
    /// (`None` meaning that no file name should be found).
    fn assert_parses_to(header: &str, expected: Option<&str>) {
        assert_eq!(
            parse_filename_from_content_disposition(header),
            expected.map(String::from)
        );
    }

    #[test]
    fn test_simple_filename() {
        assert_parses_to(
            r#"attachment; filename="example.pdf""#,
            Some("example.pdf"),
        );
    }

    #[test]
    fn test_filename_without_quotes() {
        assert_parses_to("attachment; filename=example.pdf", Some("example.pdf"));
    }

    #[test]
    fn test_encoded_filename() {
        // Percent-encoded UTF-8 bytes of the Chinese file name "测试.pdf".
        assert_parses_to(
            "attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95.pdf",
            Some("测试.pdf"),
        );
    }

    #[test]
    fn test_both_filenames() {
        // With both parameters present, `filename*` wins over `filename`.
        assert_parses_to(
            r#"attachment; filename="fallback.pdf"; filename*=UTF-8''%E6%B5%8B%E8%AF%95.pdf"#,
            Some("测试.pdf"),
        );
    }

    #[test]
    fn test_decode_with_windows_1252() {
        // 0x99 is ™ in Windows-1252, which is used for the ISO-8859-1 label.
        assert_parses_to(
            "content-disposition: attachment; filename*=iso-8859-1'en'a%99b",
            Some("a™b"),
        );
    }

    #[test]
    fn test_both_filenames_with_bad_format() {
        // A malformed `filename*` is skipped and `filename` is used instead.
        assert_parses_to(
            r#"attachment; filename="fallback.pdf"; filename*=UTF-8'bad_format.pdf"#,
            Some("fallback.pdf"),
        );
    }

    #[test]
    fn test_no_filename() {
        assert_parses_to("attachment", None);
    }

    #[test]
    fn test_iso_8859_1() {
        assert_parses_to(
            "attachment;filename*=iso-8859-1'en'%A3%20rates",
            Some("£ rates"),
        );
    }

    #[test]
    fn test_bad_encoding_fallback_to_utf8() {
        // An unrecognised charset label is decoded as UTF-8.
        assert_parses_to(
            "attachment;filename*=UTF-16''%E6%B5%8B%E8%AF%95.pdf",
            Some("测试.pdf"),
        );
    }
}
|
@ -3,6 +3,9 @@ use std::io::{self, ErrorKind, IsTerminal};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::content_disposition;
|
||||
use crate::decoder::{decompress, get_compression_type};
|
||||
use crate::utils::{copy_largebuf, test_pretend_term, HeaderValueExt};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use indicatif::{HumanBytes, ProgressBar, ProgressStyle};
|
||||
use mime2ext::mime2ext;
|
||||
@ -13,9 +16,6 @@ use reqwest::{
|
||||
StatusCode,
|
||||
};
|
||||
|
||||
use crate::decoder::{decompress, get_compression_type};
|
||||
use crate::utils::{copy_largebuf, test_pretend_term, HeaderValueExt};
|
||||
|
||||
fn get_content_length(headers: &HeaderMap) -> Option<u64> {
|
||||
headers
|
||||
.get(CONTENT_LENGTH)
|
||||
@ -27,20 +27,12 @@ fn get_content_length(headers: &HeaderMap) -> Option<u64> {
|
||||
// of PathBufs
|
||||
fn get_file_name(response: &Response, orig_url: &reqwest::Url) -> String {
|
||||
fn from_header(response: &Response) -> Option<String> {
|
||||
let quoted = Regex::new("filename=\"([^\"]*)\"").unwrap();
|
||||
// Alternative form:
|
||||
let unquoted = Regex::new("filename=([^;=\"]*)").unwrap();
|
||||
// TODO: support "filename*" version
|
||||
|
||||
let header = response
|
||||
.headers()
|
||||
.get(CONTENT_DISPOSITION)?
|
||||
.to_utf8_str()
|
||||
.ok()?;
|
||||
let caps = quoted
|
||||
.captures(header)
|
||||
.or_else(|| unquoted.captures(header))?;
|
||||
Some(caps[1].to_string())
|
||||
content_disposition::parse_filename_from_content_disposition(header)
|
||||
}
|
||||
|
||||
fn from_url(url: &reqwest::Url) -> Option<String> {
|
||||
@ -60,7 +52,13 @@ fn get_file_name(response: &Response, orig_url: &reqwest::Url) -> String {
|
||||
.or_else(|| from_url(orig_url))
|
||||
.unwrap_or_else(|| "index".to_string());
|
||||
|
||||
let filename = filename.split(std::path::is_separator).next_back().unwrap();
|
||||
let filename = sanitize_filename::sanitize_with_options(
|
||||
&filename,
|
||||
sanitize_filename::Options {
|
||||
replacement: "_",
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
|
||||
let mut filename = filename.trim().trim_start_matches('.').to_string();
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
mod auth;
|
||||
mod buffer;
|
||||
mod cli;
|
||||
mod content_disposition;
|
||||
mod decoder;
|
||||
mod download;
|
||||
mod error_reporting;
|
||||
|
@ -119,6 +119,101 @@ fn download_supplied_unicode_filename() {
|
||||
);
|
||||
}
|
||||
|
||||
/// A `filename*` value without percent-escapes names the downloaded file.
#[test]
fn download_support_filename_rfc_5987() {
    const DISPOSITION: &str = r#"attachment; filename*=UTF-8''abcd1234.txt"#;

    let dir = tempdir().unwrap();
    let server = server::http(|_req| async move {
        hyper::Response::builder()
            .header("Content-Disposition", DISPOSITION)
            .body("file".into())
            .unwrap()
    });

    get_command()
        .args(["--download", &server.base_url()])
        .current_dir(&dir)
        .assert()
        .success();

    let saved = dir.path().join("abcd1234.txt");
    assert_eq!(fs::read_to_string(saved).unwrap(), "file");
}
|
||||
/// A percent-encoded UTF-8 `filename*` value is decoded before the file is saved.
#[test]
fn download_support_filename_rfc_5987_percent_encoded() {
    const DISPOSITION: &str = r#"attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95.txt"#;

    let dir = tempdir().unwrap();
    let server = server::http(|_req| async move {
        hyper::Response::builder()
            .header("Content-Disposition", DISPOSITION)
            .body("file".into())
            .unwrap()
    });

    get_command()
        .args(["--download", &server.base_url()])
        .current_dir(&dir)
        .assert()
        .success();

    let saved = dir.path().join("测试.txt");
    assert_eq!(fs::read_to_string(saved).unwrap(), "file");
}
|
||||
|
||||
/// A percent-encoded `filename*` value labelled ISO-8859-1 is decoded before
/// the file is saved.
#[test]
fn download_support_filename_rfc_5987_percent_encoded_with_iso_8859_1() {
    const DISPOSITION: &str = r#"attachment; filename*=iso-8859-1'en'%A3%20rates.txt"#;

    let dir = tempdir().unwrap();
    let server = server::http(|_req| async move {
        hyper::Response::builder()
            .header("Content-Disposition", DISPOSITION)
            .body("file".into())
            .unwrap()
    });

    get_command()
        .args(["--download", &server.base_url()])
        .current_dir(&dir)
        .assert()
        .success();

    let saved = dir.path().join("£ rates.txt");
    assert_eq!(fs::read_to_string(saved).unwrap(), "file");
}
|
||||
|
||||
/// When both `filename` and `filename*` are sent, `filename*` decides the name.
#[test]
fn download_filename_star_with_high_priority() {
    const DISPOSITION: &str =
        r#"attachment; filename="fallback.txt"; filename*=UTF-8''%E6%B5%8B%E8%AF%95.txt"#;

    let dir = tempdir().unwrap();
    let server = server::http(|_req| async move {
        hyper::Response::builder()
            .header("Content-Disposition", DISPOSITION)
            .body("file".into())
            .unwrap()
    });

    get_command()
        .args(["--download", &server.base_url()])
        .current_dir(&dir)
        .assert()
        .success();

    let saved = dir.path().join("测试.txt");
    assert_eq!(fs::read_to_string(saved).unwrap(), "file");
}
|
||||
|
||||
#[test]
|
||||
fn download_supplied_unquoted_filename() {
|
||||
let dir = tempdir().unwrap();
|
||||
@ -158,7 +253,10 @@ fn download_filename_with_directory_traversal() {
|
||||
.current_dir(&dir)
|
||||
.assert()
|
||||
.success();
|
||||
assert_eq!(fs::read_to_string(dir.path().join("bar")).unwrap(), "file");
|
||||
assert_eq!(
|
||||
fs::read_to_string(dir.path().join("foo_baz_bar")).unwrap(),
|
||||
"file"
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(windows)]
|
||||
@ -180,7 +278,10 @@ fn download_filename_with_windows_directory_traversal() {
|
||||
.current_dir(&dir)
|
||||
.assert()
|
||||
.success();
|
||||
assert_eq!(fs::read_to_string(dir.path().join("bar")).unwrap(), "file");
|
||||
assert_eq!(
|
||||
fs::read_to_string(dir.path().join("foo_baz_bar")).unwrap(),
|
||||
"file"
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: test implicit download filenames
|
||||
|
Loading…
x
Reference in New Issue
Block a user