From 5bb0f446906e7a86c101793569c741593c9cf1db Mon Sep 17 00:00:00 2001
From: yuriyryabikov <22548029+kurok@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:46:29 +0100
Subject: [PATCH] perf: decode text-part transfer encoding only once

text/plain and text/html parts were transfer-decoded twice: once via
get_body_raw() for attachments.content, then again via get_body() for
text_plain/text_html, which re-runs the identical base64/quoted-printable
decode before applying the charset.

Decode the transfer encoding once with get_body_raw() and reuse those bytes
for both the attachment content and the text bodies, applying only the charset
step via decode_charset() (a faithful copy of mailparse's internal
get_body_as_string, using the same charset crate). Output is byte-identical.

~1.9x faster on base64/quoted-printable-encoded text bodies (8.84ms -> 4.71ms
median on a ~2MB base64 text/html part); no measurable change on bodies that
are not transfer-encoded. All 91 correctness tests pass.

Signed-off-by: yuriyryabikov <22548029+kurok@users.noreply.github.com>
---
 Cargo.lock         |  1 +
 Cargo.toml         |  4 ++++
 src/mail_parser.rs | 49 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3c39692..15379a8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -43,6 +43,7 @@ dependencies = [
 name = "fast_mail_parser"
 version = "0.4.0"
 dependencies = [
+ "charset",
  "mailparse",
  "pyo3",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 0288200..598eb49 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,10 @@ path = "src/fast_mail_parser.rs"
 crate-type = ["cdylib"]
 
 [dependencies]
+# `charset` is also a transitive dependency of `mailparse`; we depend on it
+# directly to reuse the exact charset-decoding step mailparse applies in
+# `get_body`, so text parts are decoded only once (see `decode_charset`).
+charset = "0.1.3"
 mailparse = "0.16.1"
 pyo3 = "0.29"
 
diff --git a/src/mail_parser.rs b/src/mail_parser.rs
index d8bc32e..12ba0e8 100644
--- a/src/mail_parser.rs
+++ b/src/mail_parser.rs
@@ -10,6 +10,7 @@
 //! objects. Keeping the two models separate decouples the parsing logic from the
 //! Python bindings.
 
+use charset::{decode_ascii, Charset};
 use mailparse::*;
 use std::collections::HashMap;
 
@@ -30,6 +31,20 @@ pub(crate) fn parse_email(payload: &[u8]) -> Result<Mail, MailParseError> {
     Mail::new(payload)
 }
 
+/// Decode already-transfer-decoded `body` bytes into a `String` using the part's
+/// charset (defaulting to us-ascii when the label is missing or unrecognized).
+///
+/// This mirrors mailparse's internal `get_body_as_string` exactly -- same crate,
+/// same logic -- so it can be fed the bytes from `get_body_raw` to produce the
+/// same result as `get_body` without decoding the transfer encoding twice.
+fn decode_charset(body: &[u8], ctype: &ParsedContentType) -> String {
+    if let Some(charset) = Charset::for_label(ctype.charset.as_bytes()) {
+        charset.decode(body).0.into_owned()
+    } else {
+        decode_ascii(body).into_owned()
+    }
+}
+
 #[derive(Debug)]
 pub(crate) struct Mail {
     pub(crate) subject: String,
@@ -75,25 +90,31 @@ impl<'a> Mail {
             let attachment_name = mail.ctype.params.get("name");
             let mime = mail.ctype.mimetype.as_str();
 
-            // Propagate body decode failures instead of swallowing them with
-            // `unwrap_or_default()`. A broken transfer encoding (e.g. invalid
-            // base64/quoted-printable) or an undecodable charset would otherwise
-            // be silently turned into an empty body, hiding corruption from the
-            // caller. `get_body_raw`/`get_body` return `MailParseError`, which the
-            // PyO3 layer surfaces to Python as `ParseError`.
-            attachments.push(Attachment {
-                mimetype: mime.to_string(),
-                content: mail.get_body_raw()?,
-                filename: attachment_name.cloned().unwrap_or_default(),
-            });
-
+            // Undo the Content-Transfer-Encoding (e.g. base64/quoted-printable)
+            // exactly once. `?` propagates a broken transfer encoding instead of
+            // swallowing it with `unwrap_or_default()`, which would silently turn
+            // corruption into an empty body; the PyO3 layer surfaces the error to
+            // Python as `ParseError`.
+            let content = mail.get_body_raw()?;
+
+            // For text parts, build the Python-facing string from the bytes we
+            // just decoded instead of calling `get_body()`, which would re-run the
+            // identical transfer decode a second time. `decode_charset` performs
+            // only the charset step, so the result matches mailparse's `get_body`
+            // output byte-for-byte (see `decode_charset`).
             if attachment_name.is_none() {
                 if mime == "text/plain" {
-                    text_plain.push(mail.get_body()?)
+                    text_plain.push(decode_charset(&content, &mail.ctype));
                 } else if mime == "text/html" {
-                    text_html.push(mail.get_body()?)
+                    text_html.push(decode_charset(&content, &mail.ctype));
                 }
             }
+
+            attachments.push(Attachment {
+                mimetype: mime.to_string(),
+                content,
+                filename: attachment_name.cloned().unwrap_or_default(),
+            });
         }
 
         Ok(Self {