From a1f4eacb765de782ccb780e9166865b69e300af5 Mon Sep 17 00:00:00 2001 From: zuisong Date: Wed, 16 Apr 2025 22:25:39 +0800 Subject: [PATCH] Decoding RFC 5987 encoded filenames using Windows-1252, and add corresponding test cases. Signed-off-by: zuisong --- src/content_disposition.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/content_disposition.rs b/src/content_disposition.rs index c598dc5..cdcbd2c 100644 --- a/src/content_disposition.rs +++ b/src/content_disposition.rs @@ -74,11 +74,23 @@ fn parse_encoded_filename(content: &str) -> Option { return Some(decoded_str); } } else if charset.eq_ignore_ascii_case("ISO-8859-1") { - // Use the encoding_rs crate to decode ISO-8859-1 bytes. - let decoded: String = decoded_bytes.iter().map(|&b| b as char).collect(); - return Some(decoded); + // RFC 5987 says to use ISO/IEC 8859-1:1998. + // But Firefox and Chromium decode %99 as ™ so they're actually using + // Windows-1252. This mixup is common on the web. + // This affects the 0x80-0x9F range. According to ISO 8859-1 those are + // control characters. According to Windows-1252 most of them are + // printable characters. + // They agree on all the other characters, and filenames shouldn't have + // control characters, so Windows-1252 makes sense. + if let Some(decoded_str) = encoding_rs::WINDOWS_1252 + .decode_without_bom_handling_and_without_replacement(&decoded_bytes) + { + return Some(decoded_str.into_owned()); + } } else { // Unknown charset. As a fallback, try interpreting as UTF-8. + // Firefox also does this. + // Chromium makes up its own filename. (Even if `filename=` is present.) if let Ok(decoded_str) = String::from_utf8(decoded_bytes) { return Some(decoded_str); } @@ -129,6 +141,14 @@ mod tests { Some("测试.pdf".to_string()) ); } + #[test] + fn test_decode_with_windows_1252() { + let header = "content-disposition: attachment; filename*=iso-8859-1'en'a%99b"; + assert_eq!( + parse_filename_from_content_disposition(header), + Some("a™b".to_string()) + ); + } #[test] fn test_both_filenames_with_bad_format() {