Refactoring, initial test suite

This commit is contained in:
Martin Rusev
2013-07-30 16:52:15 +03:00
parent 8dcc527c7f
commit d5fdf9b498
3 changed files with 166 additions and 105 deletions

View File

@@ -1,20 +1,5 @@
import email
import re
import StringIO
from imbox.imap import ImapTransport from imbox.imap import ImapTransport
from imbox.parser import get_mail_addresses, decode_mail_header from imbox.parser import parse_email
class Struct(object):
    """Plain object whose attributes are the keyword arguments it was given."""

    def __init__(self, **entries):
        # Store every keyword directly in the instance namespace.
        vars(self).update(entries)

    def keys(self):
        """Return the names of the attributes held by this instance."""
        namespace = vars(self)
        return namespace.keys()

    def __repr__(self):
        """Return the string form of the instance's attribute dict."""
        namespace = vars(self)
        return str(namespace)
class Imbox(object): class Imbox(object):
@@ -23,98 +8,11 @@ class Imbox(object):
server = ImapTransport(hostname, ssl=ssl) server = ImapTransport(hostname, ssl=ssl)
self.connection = server.connect(username, password) self.connection = server.connect(username, password)
def parse_attachment(self, message_part):
    """Describe a MIME part as an attachment, or return None if it is not one.

    A part counts as an attachment when it has a ``Content-Disposition``
    header whose first ``;``-separated token is ``attachment``
    (case-insensitive, surrounding whitespace ignored).

    :param message_part: an ``email`` message object for a single part.
    :returns: ``None`` for non-attachments, otherwise a dict with the keys
        ``content-type``, ``size`` (length of the decoded payload) and
        ``content`` (a file-like buffer over the decoded payload), plus
        ``filename`` / ``create-date`` when the header carries them.
        Parameter values are stored verbatim, including any quotes.
    """
    # Same buffer class as before on Python 2; BytesIO where the StringIO
    # module does not exist (the decoded payload is bytes there).
    try:
        from StringIO import StringIO as make_buffer
    except ImportError:
        from io import BytesIO as make_buffer

    content_disposition = message_part.get("Content-Disposition", None)
    if content_disposition is None:
        return None

    dispositions = content_disposition.strip().split(";")
    if dispositions[0].strip().lower() != "attachment":
        return None

    file_data = message_part.get_payload(decode=True)
    if file_data is None:
        # get_payload(decode=True) gives None for multipart containers;
        # report them as empty instead of failing on len(None).
        file_data = b''

    attachment = {
        'content-type': message_part.get_content_type(),
        'size': len(file_data),
        'content': make_buffer(file_data),
    }

    for param in dispositions[1:]:
        # Split on the first "=" only so values containing "=" survive, and
        # skip fragments without one (e.g. what follows a trailing ";").
        name, separator, value = param.partition("=")
        if not separator:
            continue
        name = name.lower()
        if 'file' in name:
            attachment['filename'] = value
        if 'create-date' in name:
            attachment['create-date'] = value

    return attachment
def parse_email(self, raw_email):
    """Parse a raw message string into a ``Struct`` of its main fields.

    The result always has ``body`` (a dict with ``plain`` and ``html`` lists
    of decoded payloads), ``sent_from``, ``sent_to`` and ``headers`` (a list
    of ``{'Name': ..., 'Value': ...}`` dicts for a fixed set of headers).
    ``attachments`` is present only when at least one attachment was found;
    ``subject``, ``date`` and ``message-id`` only when the matching header
    exists.

    :param raw_email: the complete message source as a string.
    :returns: a ``Struct`` built from the parsed fields.
    """
    email_message = email.message_from_string(raw_email)
    maintype = email_message.get_content_maintype()

    body = {"plain": [], "html": []}
    attachments = []

    if maintype == 'multipart':
        for part in email_message.walk():
            content_type = part.get_content_type()
            content_disposition = part.get('Content-Disposition', None)
            if content_disposition is None:
                # Text parts without a disposition make up the message body.
                if content_type == "text/plain":
                    body['plain'].append(part.get_payload(decode=True))
                elif content_type == "text/html":
                    body['html'].append(part.get_payload(decode=True))
            elif content_disposition:
                # parse_attachment returns None when the disposition is not
                # "attachment" (e.g. inline); do not record a None entry.
                attachment = self.parse_attachment(part)
                if attachment is not None:
                    attachments.append(attachment)
    elif maintype == 'text':
        # A non-multipart text message goes to 'plain' whatever its subtype.
        body['plain'].append(email_message.get_payload(decode=True))

    parsed_email = {'body': body}
    if attachments:
        parsed_email['attachments'] = attachments

    parsed_email['sent_from'] = get_mail_addresses(email_message, 'from')
    parsed_email['sent_to'] = get_mail_addresses(email_message, 'to')

    # Headers copied to top-level fields (lower-cased name, decoded value).
    value_headers_keys = ('Subject', 'Date', 'Message-ID')
    # Headers reported verbatim in the 'headers' list.
    key_value_header_keys = ('Received-SPF',
                             'MIME-Version',
                             'X-Spam-Status',
                             'X-Spam-Score',
                             'Content-Type')

    parsed_email['headers'] = []
    # dict() keeps one value per header name (the last occurrence).
    # NOTE(review): the name match is case-sensitive, so a header spelled
    # e.g. "Message-Id" is skipped -- confirm whether that is intended.
    email_dict = dict(email_message.items())
    for key, value in email_dict.items():
        if key in value_headers_keys:
            parsed_email[key.lower()] = decode_mail_header(value)
        if key in key_value_header_keys:
            parsed_email['headers'].append({'Name': key, 'Value': value})

    return Struct(**parsed_email)
# Fetch one message by UID and return it parsed.
# Calls self.connection.uid('fetch', uid, '(BODY.PEEK[])'), takes data[0][1]
# from the reply as the raw message, and hands it to the e-mail parser.
# NOTE(review): each line below shows the pre-commit and post-commit text side
# by side; the only difference is that the parser call changes from the
# self.parse_email method to the parse_email function.
def fetch_by_uid(self, uid): def fetch_by_uid(self, uid):
message, data = self.connection.uid('fetch', uid, '(BODY.PEEK[])') # Don't mark the messages as read message, data = self.connection.uid('fetch', uid, '(BODY.PEEK[])') # Don't mark the messages as read
raw_email = data[0][1] raw_email = data[0][1]
email_object = self.parse_email(raw_email) email_object = parse_email(raw_email)
return email_object return email_object

View File

@@ -1,6 +1,20 @@
import re
import StringIO
import email import email
from email.header import decode_header from email.header import decode_header
class Struct(object):
    """Lightweight record: keyword arguments become instance attributes."""

    def __init__(self, **entries):
        # Merge all keywords into the instance namespace in one step.
        namespace = vars(self)
        namespace.update(entries)

    def keys(self):
        """Return the attribute names currently stored on the instance."""
        return vars(self).keys()

    def __repr__(self):
        """Return the string form of the underlying attribute dict."""
        return str(vars(self))
def decode_mail_header(value, default_charset='us-ascii'): def decode_mail_header(value, default_charset='us-ascii'):
""" """
Decode a header value into a unicode string. Decode a header value into a unicode string.
@@ -30,4 +44,89 @@ def get_mail_addresses(message, header_name):
addresses[index]={'name': decode_mail_header(address_name), 'email': address_email} addresses[index]={'name': decode_mail_header(address_name), 'email': address_email}
return addresses return addresses
def parse_attachment(message_part):
    """Describe a MIME part as an attachment, or return None if it is not one.

    A part counts as an attachment when it has a ``Content-Disposition``
    header whose first ``;``-separated token is ``attachment``
    (case-insensitive, surrounding whitespace ignored).

    :param message_part: an ``email`` message object for a single part.
    :returns: ``None`` for non-attachments, otherwise a dict with the keys
        ``content-type``, ``size`` (length of the decoded payload) and
        ``content`` (a file-like buffer over the decoded payload), plus
        ``filename`` / ``create-date`` when the header carries them.
        Parameter values are stored verbatim, including any quotes.
    """
    # Same buffer class as before on Python 2; BytesIO where the StringIO
    # module does not exist (the decoded payload is bytes there).
    try:
        from StringIO import StringIO as make_buffer
    except ImportError:
        from io import BytesIO as make_buffer

    content_disposition = message_part.get("Content-Disposition", None)
    if content_disposition is None:
        return None

    dispositions = content_disposition.strip().split(";")
    if dispositions[0].strip().lower() != "attachment":
        return None

    file_data = message_part.get_payload(decode=True)
    if file_data is None:
        # get_payload(decode=True) gives None for multipart containers;
        # report them as empty instead of failing on len(None).
        file_data = b''

    attachment = {
        'content-type': message_part.get_content_type(),
        'size': len(file_data),
        'content': make_buffer(file_data),
    }

    for param in dispositions[1:]:
        # Split on the first "=" only so values containing "=" survive, and
        # skip fragments without one (e.g. what follows a trailing ";").
        name, separator, value = param.partition("=")
        if not separator:
            continue
        name = name.lower()
        if 'file' in name:
            attachment['filename'] = value
        if 'create-date' in name:
            attachment['create-date'] = value

    return attachment
def parse_email(raw_email):
    """Parse a raw message string into a ``Struct`` of its main fields.

    The result always has ``body`` (a dict with ``plain`` and ``html`` lists
    of decoded payloads), ``sent_from``, ``sent_to`` and ``headers`` (a list
    of ``{'Name': ..., 'Value': ...}`` dicts for a fixed set of headers).
    ``attachments`` is present only when at least one attachment was found;
    ``subject``, ``date`` and ``message-id`` only when the matching header
    exists.

    :param raw_email: the complete message source as a string.
    :returns: a ``Struct`` built from the parsed fields.
    """
    email_message = email.message_from_string(raw_email)
    maintype = email_message.get_content_maintype()

    body = {"plain": [], "html": []}
    attachments = []

    if maintype == 'multipart':
        for part in email_message.walk():
            content_type = part.get_content_type()
            content_disposition = part.get('Content-Disposition', None)
            if content_disposition is None:
                # Text parts without a disposition make up the message body.
                if content_type == "text/plain":
                    body['plain'].append(part.get_payload(decode=True))
                elif content_type == "text/html":
                    body['html'].append(part.get_payload(decode=True))
            elif content_disposition:
                # parse_attachment returns None when the disposition is not
                # "attachment" (e.g. inline); do not record a None entry.
                attachment = parse_attachment(part)
                if attachment is not None:
                    attachments.append(attachment)
    elif maintype == 'text':
        # A non-multipart text message goes to 'plain' whatever its subtype.
        body['plain'].append(email_message.get_payload(decode=True))

    parsed_email = {'body': body}
    if attachments:
        parsed_email['attachments'] = attachments

    parsed_email['sent_from'] = get_mail_addresses(email_message, 'from')
    parsed_email['sent_to'] = get_mail_addresses(email_message, 'to')

    # Headers copied to top-level fields (lower-cased name, decoded value).
    value_headers_keys = ('Subject', 'Date', 'Message-ID')
    # Headers reported verbatim in the 'headers' list.
    key_value_header_keys = ('Received-SPF',
                             'MIME-Version',
                             'X-Spam-Status',
                             'X-Spam-Score',
                             'Content-Type')

    parsed_email['headers'] = []
    # dict() keeps one value per header name (the last occurrence).
    # NOTE(review): the name match is case-sensitive, so a header spelled
    # e.g. "Message-Id" is skipped -- confirm whether that is intended.
    email_dict = dict(email_message.items())
    for key, value in email_dict.items():
        if key in value_headers_keys:
            parsed_email[key.lower()] = decode_mail_header(value)
        if key in key_value_header_keys:
            parsed_email['headers'].append({'Name': key, 'Value': value})

    return Struct(**parsed_email)

View File

@@ -0,0 +1,64 @@
import unittest
import email
from imbox.parser import *
# Raw message fixture used by test_parse_email: a multipart/alternative mail
# with one text/plain part and one text/html part, and no attachment parts.
# NOTE(review): as shown here there is no blank line between the header block
# and the first boundary, nor between each part's headers and its content --
# confirm the blank separator lines exist in the real file.
raw_email = """Delivered-To: johndoe@gmail.com
X-Originating-Email: [martin@amon.cx]
Message-ID: <test0@example.com>
Return-Path: martin@amon.cx
Date: Tue, 30 Jul 2013 15:56:29 +0300
From: Martin Rusev <martin@amon.cx>
MIME-Version: 1.0
To: John Doe <johndoe@gmail.com>
Subject: Test email - no attachment
Content-Type: multipart/alternative;
boundary="------------080505090108000500080106"
X-OriginalArrivalTime: 30 Jul 2013 12:56:43.0604 (UTC) FILETIME=[3DD52140:01CE8D24]
--------------080505090108000500080106
Content-Type: text/plain; charset="ISO-8859-1"; format=flowed
Content-Transfer-Encoding: 7bit
Hi, this is a test email with no attachments
--------------080505090108000500080106
Content-Type: text/html; charset="ISO-8859-1"
Content-Transfer-Encoding: 7bit
<html><head>
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"></head><body
bgcolor="#FFFFFF" text="#000000">
Hi, this is a test email with no <span style="font-weight: bold;">attachments</span><br>
</body>
</html>
--------------080505090108000500080106--
"""
class TestParser(unittest.TestCase):
    """Tests for the functions imported from imbox.parser."""

    def test_parse_email(self):
        # The parsed fixture must expose the Subject header as `.subject`.
        message = parse_email(raw_email)
        self.assertEqual(u'Test email - no attachment', message.subject)

    # TODO - Complete the test suite

    def test_parse_attachment(self):
        # Placeholder: no assertions yet.
        pass

    def test_decode_mail_header(self):
        # Placeholder: no assertions yet.
        pass

    def test_get_mail_addresses(self):
        # (raw header line, header name to query, expected address list)
        cases = (
            ("To: John Doe <johndoe@gmail.com>",
             'to',
             [{'email': 'johndoe@gmail.com', 'name': u'John Doe'}]),
            ("From: John Smith <johnsmith@gmail.com>",
             'from',
             [{'email': 'johnsmith@gmail.com', 'name': u'John Smith'}]),
        )
        for raw_header, header_name, expected in cases:
            message_object = email.message_from_string(raw_header)
            self.assertEqual(expected,
                             get_mail_addresses(message_object, header_name))