Skip to main content

nautilus_core/string/
urlencoding.rs

1// -------------------------------------------------------------------------------------------------
2//  Copyright (C) 2015-2026 Nautech Systems Pty Ltd. All rights reserved.
3//  https://nautechsystems.io
4//
5//  Licensed under the GNU Lesser General Public License Version 3.0 (the "License");
6//  You may not use this file except in compliance with the License.
7//  You may obtain a copy of the License at https://www.gnu.org/licenses/lgpl-3.0.en.html
8//
9//  Unless required by applicable law or agreed to in writing, software
10//  distributed under the License is distributed on an "AS IS" BASIS,
11//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//  See the License for the specific language governing permissions and
13//  limitations under the License.
14// -------------------------------------------------------------------------------------------------
15
16//! URL percent-encoding and decoding per [RFC 3986].
17//!
18//! The unreserved set is `ALPHA / DIGIT / "-" / "." / "_" / "~"`; every other
19//! byte is percent-encoded as `%HH` using uppercase hexadecimal as recommended
20//! by [RFC 3986 Section 2.1].
21//!
22//! Decoding accepts both uppercase and lowercase hex. A `%` that is not
23//! followed by two hex digits is passed through literally, matching the
24//! behaviour of the `urlencoding` crate that this module replaces.
25//!
26//! [RFC 3986]: https://datatracker.ietf.org/doc/html/rfc3986
27//! [RFC 3986 Section 2.1]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.1
28
29use std::{borrow::Cow, fmt::Display, string::FromUtf8Error};
30
/// Membership table for the RFC 3986 unreserved set, indexed by byte value.
///
/// An entry is `true` for ASCII letters, ASCII digits and the four marks
/// `-`, `.`, `_`, `~`; every other byte maps to `false`.
const UNRESERVED: [bool; 256] = {
    let mut table = [false; 256];
    // `for` loops are not available in const evaluation, hence the manual counter.
    let mut idx = 0usize;
    while idx < 256 {
        let byte = idx as u8; // lossless: idx < 256
        table[idx] = byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~');
        idx += 1;
    }
    table
};
54
/// Two uppercase ASCII hex digits for every byte value, indexed by the byte.
///
/// `ENCODE_PAIR[b]` holds the `HH` part of the `%HH` escape for byte `b`.
const ENCODE_PAIR: [[u8; 2]; 256] = {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";
    let mut pairs = [[0u8; 2]; 256];
    let mut byte = 0usize;
    while byte < 256 {
        // High nibble first, then low nibble.
        let hi = HEX_UPPER[byte >> 4];
        let lo = HEX_UPPER[byte & 0x0F];
        pairs[byte] = [hi, lo];
        byte += 1;
    }
    pairs
};
65
/// Numeric value of each ASCII hex digit, indexed by byte value.
///
/// Both `a-f` and `A-F` are recognised. Every byte that is not a hex digit
/// maps to the sentinel `0xFF`.
const DECODE_NIBBLE: [u8; 256] = {
    let mut table = [0xFFu8; 256];
    let mut idx = 0usize;
    while idx < 256 {
        let byte = idx as u8; // lossless: idx < 256
        table[idx] = match byte {
            b'0'..=b'9' => byte - b'0',
            b'a'..=b'f' => byte - b'a' + 10,
            b'A'..=b'F' => byte - b'A' + 10,
            // Anything else is not a hex digit.
            _ => 0xFF,
        };
        idx += 1;
    }
    table
};
82
83/// Percent-encodes a string per RFC 3986.
84///
85/// Returns the input borrowed when every byte is already in the unreserved
86/// set, otherwise an owned encoded copy.
87///
88/// # Panics
89///
90/// Never panics in practice: [`encode_bytes`] only emits ASCII bytes
91/// (unreserved characters or `%HH` pairs), so [`String::from_utf8`] always
92/// succeeds.
93#[must_use]
94pub fn encode(input: &str) -> Cow<'_, str> {
95    match encode_bytes(input.as_bytes()) {
96        Cow::Borrowed(_) => Cow::Borrowed(input),
97        Cow::Owned(bytes) => Cow::Owned(String::from_utf8(bytes).expect("encoded output is ASCII")),
98    }
99}
100
101/// Percent-encodes a byte slice per RFC 3986.
102///
103/// Returns the input borrowed when every byte is already in the unreserved
104/// set, otherwise an owned encoded copy.
105#[must_use]
106pub fn encode_bytes(input: &[u8]) -> Cow<'_, [u8]> {
107    let Some(first) = input.iter().position(|&b| !UNRESERVED[b as usize]) else {
108        return Cow::Borrowed(input);
109    };
110
111    // Slack for payloads dominated by reserved chars without over-allocating
112    // on mostly-unreserved inputs; Vec's geometric growth covers the rest.
113    let mut out = Vec::with_capacity(input.len() + input.len() / 2 + 16);
114    out.extend_from_slice(&input[..first]);
115
116    let mut rest = &input[first..];
117    while let Some(&byte) = rest.first() {
118        if UNRESERVED[byte as usize] {
119            let run_end = rest
120                .iter()
121                .position(|&b| !UNRESERVED[b as usize])
122                .unwrap_or(rest.len());
123            out.extend_from_slice(&rest[..run_end]);
124            rest = &rest[run_end..];
125        } else {
126            out.push(b'%');
127            out.extend_from_slice(&ENCODE_PAIR[byte as usize]);
128            rest = &rest[1..];
129        }
130    }
131    Cow::Owned(out)
132}
133
134/// Percent-decodes a string per RFC 3986.
135///
136/// Returns the input borrowed when no `%` is present. Otherwise decodes
137/// `%HH` pairs (hex is case-insensitive) and leaves any `%` that is not
138/// followed by two hex digits in place.
139///
140/// # Errors
141///
142/// Returns [`DecodeError::InvalidUtf8`] if the decoded bytes are not valid
143/// UTF-8.
144pub fn decode(input: &str) -> Result<Cow<'_, str>, DecodeError> {
145    match decode_bytes(input.as_bytes()) {
146        Cow::Borrowed(_) => Ok(Cow::Borrowed(input)),
147        Cow::Owned(bytes) => String::from_utf8(bytes)
148            .map(Cow::Owned)
149            .map_err(DecodeError::InvalidUtf8),
150    }
151}
152
153/// Percent-decodes a byte slice.
154///
155/// Returns the input borrowed when no `%` is present. A `%` that is not
156/// followed by two hex digits is left in place.
157#[must_use]
158pub fn decode_bytes(input: &[u8]) -> Cow<'_, [u8]> {
159    let Some(first) = input.iter().position(|&b| b == b'%') else {
160        return Cow::Borrowed(input);
161    };
162
163    let mut out = Vec::with_capacity(input.len());
164    out.extend_from_slice(&input[..first]);
165
166    let mut i = first;
167    while i < input.len() {
168        if input[i] == b'%' {
169            if i + 2 < input.len() {
170                let hi = DECODE_NIBBLE[input[i + 1] as usize];
171                let lo = DECODE_NIBBLE[input[i + 2] as usize];
172                if (hi | lo) & 0xF0 == 0 {
173                    out.push((hi << 4) | lo);
174                    i += 3;
175                    continue;
176                }
177            }
178            // Malformed or trailing `%`: pass through literally.
179            out.push(b'%');
180            i += 1;
181        } else {
182            let run_start = i;
183            while i < input.len() && input[i] != b'%' {
184                i += 1;
185            }
186            out.extend_from_slice(&input[run_start..i]);
187        }
188    }
189    Cow::Owned(out)
190}
191
/// Errors from URL percent-decoding.
///
/// Implements `Clone`, `PartialEq` and `Eq` (all provided by the wrapped
/// [`FromUtf8Error`]) so callers can compare and duplicate errors, for example
/// in test assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecodeError {
    /// Decoded bytes are not valid UTF-8.
    InvalidUtf8(FromUtf8Error),
}

impl Display for DecodeError {
    /// Writes a human-readable message that includes the underlying UTF-8 error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidUtf8(cause) => write!(f, "invalid UTF-8 in decoded bytes: {cause}"),
        }
    }
}

impl std::error::Error for DecodeError {
    /// Exposes the wrapped [`FromUtf8Error`] as the error source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::InvalidUtf8(cause) => Some(cause),
        }
    }
}

impl From<FromUtf8Error> for DecodeError {
    /// Wraps a [`FromUtf8Error`] so that it can be propagated with `?`.
    fn from(err: FromUtf8Error) -> Self {
        Self::InvalidUtf8(err)
    }
}
220
221#[cfg(test)]
222mod tests {
223    use rstest::rstest;
224
225    use super::*;
226
227    // RFC 3986 Section 2.3: unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
228    const UNRESERVED_CHARS: &str =
229        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~";
230
231    // RFC 3986 Section 2.2: reserved chars (gen-delims + sub-delims) must be
232    // percent-encoded when used as data.
233    const RESERVED_CHARS: &str = ":/?#[]@!$&'()*+,;=";
234
235    #[rstest]
236    #[case("", "")]
237    #[case("abc", "abc")]
238    #[case("ABC-xyz_0.9~", "ABC-xyz_0.9~")]
239    #[case(" ", "%20")]
240    #[case("+", "%2B")]
241    #[case("/", "%2F")]
242    #[case("?", "%3F")]
243    #[case("#", "%23")]
244    #[case("&", "%26")]
245    #[case("=", "%3D")]
246    #[case("%", "%25")]
247    #[case("hello world", "hello%20world")]
248    #[case("a b+c/d", "a%20b%2Bc%2Fd")]
249    // Uppercase hex per RFC 3986 Section 2.1
250    #[case("\x7f", "%7F")]
251    fn test_encode_ascii_vectors(#[case] input: &str, #[case] expected: &str) {
252        assert_eq!(encode(input), expected);
253    }
254
255    #[rstest]
256    fn test_encode_all_unreserved_unchanged() {
257        // Every char in the unreserved set should pass through.
258        let out = encode(UNRESERVED_CHARS);
259        assert_eq!(out, UNRESERVED_CHARS);
260        // And the Cow should be Borrowed (zero-copy).
261        assert!(matches!(out, Cow::Borrowed(_)));
262    }
263
264    #[rstest]
265    fn test_encode_all_reserved_percent_encoded() {
266        let out = encode(RESERVED_CHARS);
267        // Each of the 18 chars becomes a 3-byte `%HH` sequence.
268        assert_eq!(out.len(), RESERVED_CHARS.len() * 3);
269        // None of the unreserved chars, `%`, or digits A-F should appear raw
270        // in the output except as part of a `%HH` triple.
271        for byte in out.bytes() {
272            assert!(
273                matches!(byte, b'%' | b'0'..=b'9' | b'A'..=b'F'),
274                "unexpected byte {byte:#04x} in encoded reserved output"
275            );
276        }
277    }
278
279    #[rstest]
280    fn test_encode_hex_is_uppercase() {
281        // Verify RFC 3986 Section 2.1: producers SHOULD emit uppercase hex.
282        let out = encode("/");
283        assert_eq!(out, "%2F");
284        assert!(!out.contains('f'));
285    }
286
287    #[rstest]
288    fn test_encode_every_byte_position() {
289        // For each byte 0x00..=0xFF, encode a one-byte slice and verify that
290        // the output matches the spec expectation.
291        for byte in 0u8..=255 {
292            let input = [byte];
293            let out = encode_bytes(&input);
294
295            if UNRESERVED[byte as usize] {
296                assert!(
297                    matches!(out, Cow::Borrowed(_)),
298                    "unreserved byte {byte:#04x} should not allocate"
299                );
300                assert_eq!(out.as_ref(), &[byte]);
301            } else {
302                let expected = format!("%{byte:02X}").into_bytes();
303                assert_eq!(out.as_ref(), expected.as_slice(), "byte {byte:#04x}");
304            }
305        }
306    }
307
308    #[rstest]
309    fn test_encode_utf8_multibyte() {
310        // U+00E9 encoded as UTF-8 is `0xC3 0xA9` (two bytes).
311        assert_eq!(encode("\u{00E9}"), "%C3%A9");
312        // U+4E2D encoded as UTF-8 is `0xE4 0xB8 0xAD` (three bytes).
313        assert_eq!(encode("\u{4E2D}"), "%E4%B8%AD");
314        // Grinning face emoji U+1F600 is `0xF0 0x9F 0x98 0x80` (four bytes).
315        assert_eq!(encode("\u{1F600}"), "%F0%9F%98%80");
316    }
317
318    #[rstest]
319    fn test_encode_mixed_ascii_and_utf8() {
320        assert_eq!(encode("a é/"), "a%20%C3%A9%2F");
321    }
322
323    #[rstest]
324    fn test_encode_returns_borrowed_when_no_work() {
325        let out = encode("safe-string_123.xyz~");
326        assert!(matches!(out, Cow::Borrowed(_)));
327    }
328
329    #[rstest]
330    fn test_encode_returns_owned_when_encoding_needed() {
331        let out = encode("needs encoding");
332        assert!(matches!(out, Cow::Owned(_)));
333    }
334
335    #[rstest]
336    #[case("", "")]
337    #[case("abc", "abc")]
338    #[case("%20", " ")]
339    #[case("%2F", "/")]
340    #[case("%2f", "/")] // lowercase hex must be accepted
341    #[case("%2b", "+")]
342    #[case("%25", "%")]
343    #[case("hello%20world", "hello world")]
344    #[case("a%20b%2Bc%2Fd", "a b+c/d")]
345    #[case("%C3%A9", "\u{00E9}")]
346    #[case("%E4%B8%AD", "\u{4E2D}")]
347    #[case("%F0%9F%98%80", "\u{1F600}")]
348    fn test_decode_ascii_and_utf8_vectors(#[case] input: &str, #[case] expected: &str) {
349        assert_eq!(decode(input).unwrap(), expected);
350    }
351
352    #[rstest]
353    #[case("%", "%")] // bare `%` at end passes through
354    #[case("%2", "%2")] // one hex digit at end
355    #[case("%GG", "%GG")] // non-hex digits
356    #[case("%2G", "%2G")] // second nibble invalid
357    #[case("%G2", "%G2")] // first nibble invalid
358    #[case("%%20", "% ")] // first `%` literal, then `%20` decodes
359    #[case("100%", "100%")] // `%` at end after ASCII
360    fn test_decode_malformed_percent_passes_through(#[case] input: &str, #[case] expected: &str) {
361        assert_eq!(decode(input).unwrap(), expected);
362    }
363
364    #[rstest]
365    fn test_decode_returns_borrowed_when_no_percent() {
366        let out = decode("no-percent-here").unwrap();
367        assert!(matches!(out, Cow::Borrowed(_)));
368    }
369
370    #[rstest]
371    fn test_decode_returns_owned_when_percent_present() {
372        let out = decode("a%20b").unwrap();
373        assert!(matches!(out, Cow::Owned(_)));
374    }
375
376    #[rstest]
377    #[case("this%2x%26that", "this%2x&that")]
378    #[case("%%25", "%%")]
379    #[case("%2%26", "%2&")]
380    #[case("a%2Zb%20c", "a%2Zb c")]
381    fn test_decode_malformed_then_valid(#[case] input: &str, #[case] expected: &str) {
382        assert_eq!(decode(input).unwrap(), expected);
383    }
384
385    #[rstest]
386    fn test_decode_invalid_utf8_errors() {
387        // `0xFF` is not valid UTF-8 on its own.
388        let err = decode("%FF").unwrap_err();
389        assert!(matches!(err, DecodeError::InvalidUtf8(_)));
390    }
391
392    #[rstest]
393    fn test_decode_invalid_utf8_bytes_ok() {
394        // `decode_bytes` does not validate UTF-8.
395        let out = decode_bytes(b"%FF");
396        assert_eq!(out.as_ref(), &[0xFF]);
397    }
398
399    #[rstest]
400    fn test_decode_consecutive_percent_triples() {
401        // Three consecutive `%HH` sequences decoding multi-byte UTF-8.
402        assert_eq!(decode("%e2%98%83").unwrap(), "\u{2603}"); // snowman U+2603
403    }
404
405    #[rstest]
406    fn test_decode_nul_byte() {
407        // `%00` decodes to the NUL byte, which is valid UTF-8 (U+0000).
408        let decoded = decode("a%00b").unwrap();
409        assert_eq!(decoded.as_bytes(), &[b'a', 0x00, b'b']);
410    }
411
412    #[rstest]
413    fn test_roundtrip_every_byte() {
414        // For every byte 0x00..=0xFF, encoding then decoding must recover
415        // the original byte exactly.
416        for byte in 0u8..=255 {
417            let input = [byte];
418            let encoded = encode_bytes(&input);
419            let decoded = decode_bytes(encoded.as_ref());
420            assert_eq!(
421                decoded.as_ref(),
422                input.as_slice(),
423                "round-trip failed for byte {byte:#04x}"
424            );
425        }
426    }
427
428    #[rstest]
429    #[case("hello")]
430    #[case("a b c")]
431    #[case("https://example.com/path?q=1&x=2")]
432    #[case("\u{00E9}\u{00E0}\u{00FC}")]
433    #[case("\u{4E2D}\u{6587}\u{6D4B}\u{8BD5}")]
434    #[case("mix 123 !@# %^&*()")]
435    #[case("\u{1F600}\u{1F680}\u{1F3C6}")]
436    fn test_roundtrip_string(#[case] input: &str) {
437        let encoded = encode(input);
438        let decoded = decode(&encoded).unwrap();
439        assert_eq!(decoded, input);
440    }
441
442    #[rstest]
443    fn test_encoded_output_only_ascii() {
444        // Encoded output must always be pure ASCII (unreserved bytes + `%HH`).
445        let encoded = encode("\u{00E9}\u{4E2D}\u{1F600}");
446        assert!(encoded.is_ascii(), "encoded output must be ASCII-only");
447    }
448
449    #[rstest]
450    fn test_encode_bytes_arbitrary_binary() {
451        // Encoding arbitrary bytes (including non-UTF-8) yields a valid
452        // percent-encoded ASCII sequence.
453        let input: Vec<u8> = (0u8..=255).collect();
454        let encoded = encode_bytes(&input);
455        assert!(encoded.iter().all(u8::is_ascii));
456        let decoded = decode_bytes(encoded.as_ref());
457        assert_eq!(decoded.as_ref(), input.as_slice());
458    }
459
460    #[rstest]
461    fn test_decode_error_display_and_source() {
462        let err = decode("%FF").unwrap_err();
463        let msg = err.to_string();
464        assert!(msg.starts_with("invalid UTF-8"), "got: {msg}");
465        assert!(std::error::Error::source(&err).is_some());
466    }
467
468    // Independent reference implementation used to cross-check our tuned
469    // implementation on random inputs. Pure-Rust, loop-based, no table
470    // lookups: if both agree across thousands of random inputs we have
471    // strong evidence the tuned version is spec-correct.
472    fn reference_encode(input: &[u8]) -> Vec<u8> {
473        let mut out = Vec::with_capacity(input.len());
474        for &b in input {
475            let is_unreserved =
476                b.is_ascii_alphanumeric() || b == b'-' || b == b'.' || b == b'_' || b == b'~';
477            if is_unreserved {
478                out.push(b);
479            } else {
480                out.push(b'%');
481                out.extend_from_slice(format!("{b:02X}").as_bytes());
482            }
483        }
484        out
485    }
486
487    fn reference_decode(input: &[u8]) -> Vec<u8> {
488        let mut out = Vec::with_capacity(input.len());
489        let mut i = 0;
490        while i < input.len() {
491            if input[i] == b'%' && i + 2 < input.len() {
492                let a = input[i + 1];
493                let b = input[i + 2];
494                if a.is_ascii_hexdigit() && b.is_ascii_hexdigit() {
495                    let hi = if a.is_ascii_digit() {
496                        a - b'0'
497                    } else {
498                        (a | 0x20) - b'a' + 10
499                    };
500                    let lo = if b.is_ascii_digit() {
501                        b - b'0'
502                    } else {
503                        (b | 0x20) - b'a' + 10
504                    };
505                    out.push((hi << 4) | lo);
506                    i += 3;
507                    continue;
508                }
509            }
510            out.push(input[i]);
511            i += 1;
512        }
513        out
514    }
515
516    proptest::proptest! {
517        #[rstest]
518        fn prop_encode_matches_reference(input: Vec<u8>) {
519            let actual = encode_bytes(&input);
520            let expected = reference_encode(&input);
521            proptest::prop_assert_eq!(actual.as_ref(), expected.as_slice());
522        }
523
524        #[rstest]
525        fn prop_decode_matches_reference(input: Vec<u8>) {
526            let actual = decode_bytes(&input);
527            let expected = reference_decode(&input);
528            proptest::prop_assert_eq!(actual.as_ref(), expected.as_slice());
529        }
530
531        #[rstest]
532        fn prop_bytes_roundtrip(input: Vec<u8>) {
533            let encoded = encode_bytes(&input);
534            let decoded = decode_bytes(encoded.as_ref());
535            proptest::prop_assert_eq!(decoded.as_ref(), input.as_slice());
536        }
537
538        #[rstest]
539        fn prop_string_roundtrip(input: String) {
540            let encoded = encode(&input);
541            let decoded = decode(&encoded).unwrap();
542            proptest::prop_assert_eq!(decoded.as_ref(), input.as_str());
543        }
544
545        #[rstest]
546        fn prop_encoded_output_ascii(input: Vec<u8>) {
547            let encoded = encode_bytes(&input);
548            proptest::prop_assert!(encoded.iter().all(u8::is_ascii));
549        }
550    }
551}