Fix handling of emails with an unknown character set

Some emails had an unknown character set. When that happens we can't decode the body of the message properly, so the default body string was used instead, and the code then attempted to decode it. Only byte strings can be decoded, so the default body is now a bytes literal (b"...") rather than a text string.
2026-06-17 05:18:38 -04:00 · 2020-12-15 17:59:17 -05:00
parent b4526f3a0e
commit 96bae795a5
1 changed files with 7 additions and 4 deletions
@@ -220,25 +220,27 @@ def parse_email(msgid, data, server):
        text = ""
        html = None
        # default in case body somehow isn't set below - happened once
-        body = "* unreadable msg received"
+        body = b"* unreadable msg received"
        # this uses the last text or html part in the email, phone companies often put content in an attachment
        for part in msg.get_payload():
-            if (
-                part.get_content_charset() is None
-            ):  # or BREAK when we hit a text or html?
+            if part.get_content_charset() is None:
+                # or BREAK when we hit a text or html?
                # We cannot know the character set,
                # so return decoded "something"
+                LOG.debug("Email got unknown content type")
                text = part.get_payload(decode=True)
                continue

            charset = part.get_content_charset()

            if part.get_content_type() == "text/plain":
+                LOG.debug("Email got text/plain")
                text = six.text_type(
                    part.get_payload(decode=True), str(charset), "ignore"
                ).encode("utf8", "replace")

            if part.get_content_type() == "text/html":
+                LOG.debug("Email got text/html")
                html = six.text_type(
                    part.get_payload(decode=True), str(charset), "ignore"
                ).encode("utf8", "replace")
@@ -250,6 +252,7 @@ def parse_email(msgid, data, server):
                body = html.strip()
    else:  # message is not multipart
        # email.uscc.net sends no charset, blows up unicode function below
+        LOG.debug("Email is not multipart")
        if msg.get_content_charset() is None:
            text = six.text_type(
                msg.get_payload(decode=True), "US-ASCII", "ignore"