Fix UnicodeDecodeError when parsing email

This commit is contained in:
Andrey Mozgunov
2017-09-26 14:38:33 +03:00
parent 02cbf574dd
commit 878c7991bf
3 changed files with 31 additions and 1 deletions

View File

@@ -123,7 +123,7 @@ def decode_content(message):
def parse_email(raw_email, policy=None): def parse_email(raw_email, policy=None):
if isinstance(raw_email, binary_type): if isinstance(raw_email, binary_type):
raw_email = str_encode(raw_email, 'utf-8') raw_email = str_encode(raw_email, 'utf-8', errors='ignore')
if policy is not None: if policy is not None:
email_parse_kwargs = dict(policy=policy) email_parse_kwargs = dict(policy=policy)
else: else:

22
tests/8422.msg Normal file
View File

@@ -0,0 +1,22 @@
Delivered-To: receiver@example.com
Return-Path: <sender@example.com>
Date: Thu, 20 Jul 2017 07:34:22 -0500
Message-ID: <59705CFE.A95F.0016.0@journeys.com>
Subject: Following up Re: Looking to connect, let's schedule a call!
From: sender@example.com
To: "Receiver" <receiver@example.com>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="=__PartBD85995F.0__="
This is a MIME message. If you are reading this text, you may want to
consider changing to a mail reader or gateway that understands how to
properly handle MIME multipart messages.
--=__PartBD85995F.0__=
Content-Type: multipart/alternative; boundary="=__PartBD85995F.1__="
--=__PartBD85995F.1__=
Content-Type: text/plain; charset=Windows-1252
Content-Transfer-Encoding: 8bit
Following up on my previous message. Id love to connect you with

View File

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import unittest import unittest
from imbox.parser import * from imbox.parser import *
import os
import sys import sys
if sys.version_info.major < 3 or sys.version_info.minor < 3: if sys.version_info.major < 3 or sys.version_info.minor < 3:
SMTP = False SMTP = False
@@ -10,6 +11,9 @@ else:
from email.policy import SMTP from email.policy import SMTP
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
raw_email = """Delivered-To: johndoe@gmail.com raw_email = """Delivered-To: johndoe@gmail.com
X-Originating-Email: [martin@amon.cx] X-Originating-Email: [martin@amon.cx]
Message-ID: <test0@example.com> Message-ID: <test0@example.com>
@@ -98,6 +102,10 @@ class TestParser(unittest.TestCase):
self.assertEqual('Выписка по карте', parsed_email.subject) self.assertEqual('Выписка по карте', parsed_email.subject)
self.assertEqual('Выписка по карте 1234', parsed_email.body['html'][0]) self.assertEqual('Выписка по карте 1234', parsed_email.body['html'][0])
def test_parse_email_invalid_unicode(self):
    """Parse the raw bytes of the ``8422.msg`` fixture and check its subject.

    The fixture is read in binary mode so that ``parse_email`` receives
    bytes rather than text. Presumably the message body contains bytes that
    are not valid UTF-8 (the fixture declares ``charset=Windows-1252``);
    parsing must still succeed and expose the subject header.
    """
    fixture_path = os.path.join(TEST_DIR, '8422.msg')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(fixture_path, 'rb') as fixture:
        raw_message = fixture.read()
    parsed_email = parse_email(raw_message)
    self.assertEqual("Following up Re: Looking to connect, let's schedule a call!", parsed_email.subject)
def test_parse_email_ignores_header_casing(self): def test_parse_email_ignores_header_casing(self):
self.assertEqual('one', parse_email('Message-ID: one').message_id) self.assertEqual('one', parse_email('Message-ID: one').message_id)
self.assertEqual('one', parse_email('Message-Id: one').message_id) self.assertEqual('one', parse_email('Message-Id: one').message_id)