diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index e502bab8..b61a0ea1 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -1,64 +1,8 @@ from __future__ import absolute_import - -import logging - -import regex as re +from talon.signature.extractor import BruteForceExtractor from talon.signature.constants import (SIGNATURE_MAX_LINES, TOO_LONG_SIGNATURE_LINE) -from talon.utils import get_delimiter - -log = logging.getLogger(__name__) - -# regex to fetch signature based on common signature words -RE_SIGNATURE = re.compile(r''' - ( - (?: - ^[\s]*--*[\s]*[a-z \.]*$ - | - ^thanks[\s,!]*$ - | - ^regards[\s,!]*$ - | - ^cheers[\s,!]*$ - | - ^best[ a-z]*[\s,!]*$ - ) - .* - ) - ''', re.I | re.X | re.M | re.S) - -# signatures appended by phone email clients -RE_PHONE_SIGNATURE = re.compile(r''' - ( - (?: - ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$ - | - ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$ - | - ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$ - | - ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$ - ) - .* - ) - ''', re.I | re.X | re.M | re.S) - -# see _mark_candidate_indexes() for details -# c - could be signature line -# d - line starts with dashes (could be signature or list item) -# l - long line -RE_SIGNATURE_CANDIDATE = re.compile(r''' - (?P<candidate>c+d)[^d] - | - (?P<candidate>c+d)$ - | - (?P<candidate>c+) - | - (?P<candidate>d)[^d] - | - (?P<candidate>d)$ -''', re.I | re.X | re.M | re.S) def extract_signature(msg_body): @@ -73,46 +17,8 @@ def extract_signature(msg_body): >>> extract_signature('Hey man!') ('Hey man!', None) ''' - try: - # identify line delimiter first - delimiter = get_delimiter(msg_body) - - # make an assumption - stripped_body = msg_body.strip() - phone_signature = None - - # strip off phone signature - phone_signature = RE_PHONE_SIGNATURE.search(msg_body) - if phone_signature: - stripped_body = stripped_body[:phone_signature.start()] - phone_signature = phone_signature.group() - - # decide on signature candidate - lines = stripped_body.splitlines() 
- candidate = get_signature_candidate(lines) - candidate = delimiter.join(candidate) - - # try to extract signature - signature = RE_SIGNATURE.search(candidate) - if not signature: - return (stripped_body.strip(), phone_signature) - else: - signature = signature.group() - # when we splitlines() and then join them - # we can lose a new line at the end - # we did it when identifying a candidate - # so we had to do it for stripped_body now - stripped_body = delimiter.join(lines) - stripped_body = stripped_body[:-len(signature)] - - if phone_signature: - signature = delimiter.join([signature, phone_signature]) - - return (stripped_body.strip(), - signature.strip()) - except Exception: - log.exception('ERROR extracting signature') - return (msg_body, None) + brute_force_extractor = BruteForceExtractor() + return brute_force_extractor.extract_signature(msg_body) def get_signature_candidate(lines): @@ -126,26 +32,8 @@ def get_signature_candidate(lines): * not include more than one line that starts with dashes """ # non empty lines indexes - non_empty = [i for i, line in enumerate(lines) if line.strip()] - - # if message is empty or just one line then there is no signature - if len(non_empty) <= 1: - return [] - - # we don't expect signature to start at the 1st line - candidate = non_empty[1:] - # signature shouldn't be longer then SIGNATURE_MAX_LINES - candidate = candidate[-SIGNATURE_MAX_LINES:] - - markers = _mark_candidate_indexes(lines, candidate) - candidate = _process_marked_candidate_indexes(candidate, markers) - - # get actual lines for the candidate instead of indexes - if candidate: - candidate = lines[candidate[0]:] - return candidate - - return [] + brute_force_extractor = BruteForceExtractor() + return brute_force_extractor._get_signature_candidate(lines) def _mark_candidate_indexes(lines, candidate): @@ -161,18 +49,8 @@ def _mark_candidate_indexes(lines, candidate): 'cdc' """ # at first consider everything to be potential signature lines - markers = list('c' 
* len(candidate)) - - # mark lines starting from bottom up - for i, line_idx in reversed(list(enumerate(candidate))): - if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE: - markers[i] = 'l' - else: - line = lines[line_idx].strip() - if line.startswith('-') and line.strip("-"): - markers[i] = 'd' - - return "".join(markers) + brute_force_extractor = BruteForceExtractor() + return brute_force_extractor._mark_candidate_indexes(lines, candidate) def _process_marked_candidate_indexes(candidate, markers): @@ -183,5 +61,5 @@ def _process_marked_candidate_indexes(candidate, markers): >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') [15, 17] """ - match = RE_SIGNATURE_CANDIDATE.match(markers[::-1]) - return candidate[-match.end('candidate'):] if match else [] + brute_force_extractor = BruteForceExtractor() + return brute_force_extractor._process_marked_candidate_indexes(candidate, markers) diff --git a/talon/signature/constants.py b/talon/signature/constants.py index 14f2006c..66d72bfa 100644 --- a/talon/signature/constants.py +++ b/talon/signature/constants.py @@ -1,2 +1,36 @@ +import regex as re + SIGNATURE_MAX_LINES = 11 TOO_LONG_SIGNATURE_LINE = 60 + +# signatures appended by phone email clients +RE_PHONE_SIGNATURE = re.compile(r''' + ( + (?: + ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$ + | + ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$ + | + ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$ + | + ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$ + ) + .* + ) + ''', re.I | re.X | re.M | re.S) + +# see _mark_candidate_indexes() for details +# c - could be signature line +# d - line starts with dashes (could be signature or list item) +# l - long line +RE_SIGNATURE_CANDIDATE = re.compile(r''' + (?P<candidate>c+d)[^d] + | + (?P<candidate>c+d)$ + | + (?P<candidate>c+) + | + (?P<candidate>d)[^d] + | + (?P<candidate>d)$ +''', re.I | re.X | re.M | re.S) \ No newline at end of file diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 1c3a4b08..c5c8a706 100644 Binary files 
a/talon/signature/data/classifier and b/talon/signature/data/classifier differ diff --git a/talon/signature/extractor.py b/talon/signature/extractor.py new file mode 100644 index 00000000..d93d846c --- /dev/null +++ b/talon/signature/extractor.py @@ -0,0 +1,185 @@ +""" +Module with object oriented approach to signature extractions. Built to be more +flexible and to support more languages. +""" +from __future__ import absolute_import +import re +import logging + +from abc import ABC, abstractmethod +from talon.utils import get_delimiter +from talon.signature.constants import (SIGNATURE_MAX_LINES, + TOO_LONG_SIGNATURE_LINE, + RE_SIGNATURE_CANDIDATE, + RE_PHONE_SIGNATURE) + +log = logging.getLogger(__name__) + +# Defaults taken from bruteforce.py +DEFAULT_GREETINGS = ( + '[\s]*--*[\s]*[a-z \.]', + 'thanks[\s,!]', + 'regards[\s,!]', + 'cheers[\s,!]', + 'best[ a-z]*[\s,!]' +) + + +class AbstractExtractor(ABC): + """ + Abstract base class for signature extractors. + """ + + @abstractmethod + def extract_signature(self, message): + """ + Extract the signature from message and return the text and signature + + :param message: str + :return: (text: str, signature: str) + """ + pass + + +class BruteForceExtractor(AbstractExtractor): + """ + Brute force signature extractor. + More flexible OO approach to + talon.signatures.bruteforce.extract_signature + """ + + def __init__(self, max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE, + greetings=DEFAULT_GREETINGS): + """ + Create a new brute force extractor. Allows override max signature length, + max signature line length and common greetings (allows multi language support). + """ + self.max_lines = max_lines + self.max_line_length = max_line_length + self._compile_greetings(greetings) + + def extract_signature(self, msg_body): + """ + Use brute force to extract the signature (ie. 
regex and string matching) + + :param message: str + :return: (text: str, signature: str) + """ + try: + # identify line delimiter first + delimiter = get_delimiter(msg_body) + + # make an assumption + stripped_body = msg_body.strip() + phone_signature = None + + # strip off phone signature + phone_signature = RE_PHONE_SIGNATURE.search(msg_body) + if phone_signature: + stripped_body = stripped_body[:phone_signature.start()] + phone_signature = phone_signature.group() + + # decide on signature candidate + lines = stripped_body.splitlines() + candidate = self._get_signature_candidate(lines) + candidate = delimiter.join(candidate) + + # try to extract signature + signature = self.re_signature.search(candidate) + if not signature: + return (stripped_body.strip(), phone_signature) + else: + signature = signature.group() + # when we splitlines() and then join them we can lose a new line at the end + # we did it when identifying a candidate so we had to do it for stripped_body now + stripped_body = delimiter.join(lines) + stripped_body = stripped_body[:-len(signature)] + + if phone_signature: + signature = delimiter.join([signature, phone_signature]) + + return (stripped_body.strip(), + signature.strip()) + except Exception: + log.exception('ERROR extracting signature') + return (msg_body, None) + + def _compile_greetings(self, greetings): + """ + Init the regex to detect the + greeting based on the passed + greetings + + :param greetings: + """ + greetings = ['^{}*$'.format(greeting) for greeting in greetings] + greetings = '|'.join(greetings) + self.re_signature = re.compile(r'((?:{}).*)'.format(greetings), re.I | re.X | re.M | re.S) + + def _get_signature_candidate(self, lines): + """Return lines that could hold signature + + The lines should: + + * be among last SIGNATURE_MAX_LINES non-empty lines. 
+ * not include first line + * be shorter than TOO_LONG_SIGNATURE_LINE + * not include more than one line that starts with dashes + """ + # non empty lines indexes + non_empty = [i for i, line in enumerate(lines) if line.strip()] + + # if message is empty or just one line then there is no signature + if len(non_empty) <= 1: + return [] + + # we don't expect signature to start at the 1st line + candidate = non_empty[1:] + # signature shouldn't be longer then SIGNATURE_MAX_LINES + candidate = candidate[-self.max_lines:] + + markers = self._mark_candidate_indexes(lines, candidate) + candidate = self._process_marked_candidate_indexes(candidate, markers) + + # get actual lines for the candidate instead of indexes + if candidate: + candidate = lines[candidate[0]:] + return candidate + + return [] + + def _mark_candidate_indexes(self, lines, candidate): + """Mark candidate indexes with markers + + Markers: + + * c - line that could be a signature line + * l - long line + * d - line that starts with dashes but has other chars as well + + >>> _mark_candidate_lines(['Some text', '', '-', 'Bob'], [0, 2, 3]) + 'cdc' + """ + # at first consider everything to be potential signature lines + markers = list('c' * len(candidate)) + + # mark lines starting from bottom up + for i, line_idx in reversed(list(enumerate(candidate))): + if len(lines[line_idx].strip()) > self.max_line_length: + markers[i] = 'l' + else: + line = lines[line_idx].strip() + if line.startswith('-') and line.strip("-"): + markers[i] = 'd' + + return "".join(markers) + + def _process_marked_candidate_indexes(self, candidate, markers): + """ + Run regexes against candidate's marked indexes to strip signature candidate. 
+ + >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') + [15, 17] + """ + match = RE_SIGNATURE_CANDIDATE.match(markers[::-1]) + return candidate[-match.end('candidate'):] if match else [] diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py index 382615bb..0d3221ff 100644 --- a/tests/signature/bruteforce_test.py +++ b/tests/signature/bruteforce_test.py @@ -4,7 +4,7 @@ from .. import * from talon.signature import bruteforce - +from talon.signature import extractor def test_empty_body(): eq_(('', None), bruteforce.extract_signature('')) @@ -135,7 +135,7 @@ def test_blackberry_signature(): bruteforce.extract_signature(msg_body)) -@patch.object(bruteforce, 'get_delimiter', Mock(side_effect=Exception())) +@patch.object(extractor, 'get_delimiter', Mock(side_effect=Exception())) def test_crash_in_extract_signature(): msg_body = '''Hey! -roman'''