Fix emails with an unknown character set

Some emails had an unknown character set.  When that happens
we can't decode the body of the message properly, so the default body
string was used instead, and the code then attempted to decode it.  Only
byte strings can be decoded, so the default string is now a bytes
literal.
This commit is contained in:
Hemna 2020-12-15 17:59:17 -05:00
parent b4526f3a0e
commit 96bae795a5
1 changed files with 7 additions and 4 deletions

View File

@ -220,25 +220,27 @@ def parse_email(msgid, data, server):
text = ""
html = None
# default in case body somehow isn't set below - happened once
body = "* unreadable msg received"
body = b"* unreadable msg received"
# this uses the last text or html part in the email, phone companies often put content in an attachment
for part in msg.get_payload():
if (
part.get_content_charset() is None
): # or BREAK when we hit a text or html?
if part.get_content_charset() is None:
# or BREAK when we hit a text or html?
# We cannot know the character set,
# so return decoded "something"
LOG.debug("Email got unknown content type")
text = part.get_payload(decode=True)
continue
charset = part.get_content_charset()
if part.get_content_type() == "text/plain":
LOG.debug("Email got text/plain")
text = six.text_type(
part.get_payload(decode=True), str(charset), "ignore"
).encode("utf8", "replace")
if part.get_content_type() == "text/html":
LOG.debug("Email got text/html")
html = six.text_type(
part.get_payload(decode=True), str(charset), "ignore"
).encode("utf8", "replace")
@ -250,6 +252,7 @@ def parse_email(msgid, data, server):
body = html.strip()
else: # message is not multipart
# email.uscc.net sends no charset, blows up unicode function below
LOG.debug("Email is not multipart")
if msg.get_content_charset() is None:
text = six.text_type(
msg.get_payload(decode=True), "US-ASCII", "ignore"