diff --git a/.build/Dockerfile b/.build/Dockerfile index a32dd51a..da12b513 100644 --- a/.build/Dockerfile +++ b/.build/Dockerfile @@ -14,7 +14,8 @@ COPY wheel/* /wheel/ RUN mkdir -p ${REPORT_PATH} -RUN python ./setup.py build bdist_wheel -d /wheel && \ +RUN (cd talon-core && python setup.py build bdist_wheel -d /wheel) && \ + (cd talon && python setup.py build bdist_wheel -d /wheel) && \ pip install --no-deps /wheel/* ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] diff --git a/README.rst b/README.rst deleted file mode 100644 index 6ba6d73c..00000000 --- a/README.rst +++ /dev/null @@ -1,155 +0,0 @@ -talon -===== - -Mailgun library to extract message quotations and signatures. - -If you ever tried to parse message quotations or signatures you know that absence of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. That’s what a good quotations and signature parser should be like :smile: - -Usage ------ - -Here’s how you initialize the library and extract a reply from a text -message: - -.. code:: python - - import talon - from talon import quotations - - talon.init() - - text = """Reply - - -----Original Message----- - - Quote""" - - reply = quotations.extract_from(text, 'text/plain') - reply = quotations.extract_from_plain(text) - # reply == "Reply" - -To extract a reply from html: - -.. code:: python - - html = """Reply -
- -
- On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: -
- -
- Quote -
- -
""" - - reply = quotations.extract_from(html, 'text/html') - reply = quotations.extract_from_html(html) - # reply == "

Reply

" - -Often the best way is the easiest one. Here’s how you can extract -signature from email message without any -machine learning fancy stuff: - -.. code:: python - - from talon.signature.bruteforce import extract_signature - - - message = """Wow. Awesome! - -- - Bob Smith""" - - text, signature = extract_signature(message) - # text == "Wow. Awesome!" - # signature == "--\nBob Smith" - -Quick and works like a charm 90% of the time. For other 10% you can use -the power of machine learning algorithms: - -.. code:: python - - import talon - # don't forget to init the library first - # it loads machine learning classifiers - talon.init() - - from talon import signature - - - message = """Thanks Sasha, I can't go any higher and is why I limited it to the - homepage. - - John Doe - via mobile""" - - text, signature = signature.extract(message, sender='john.doe@example.com') - # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." - # signature == "John Doe\nvia mobile" - -For machine learning talon currently uses the `scikit-learn`_ library to build SVM -classifiers. The core of machine learning algorithm lays in -``talon.signature.learning package``. It defines a set of features to -apply to a message (``featurespace.py``), how data sets are built -(``dataset.py``), classifier’s interface (``classifier.py``). - -Currently the data used for training is taken from our personal email -conversations and from `ENRON`_ dataset. As a result of applying our set -of features to the dataset we provide files ``classifier`` and -``train.data`` that don’t have any personal information but could be -used to load trained classifier. Those files should be regenerated every -time the feature/data set is changed. - -To regenerate the model files, you can run - -.. code:: sh - - python train.py - -or - -.. 
code:: python - - from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA - from talon.signature.learning.classifier import train, init - train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) - -Open-source Dataset -------------------- - -Recently we started a `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we -used a subset of `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment over 190 -emails are annotated. Any contribution and collaboration on the project are welcome. Once the dataset is ready we plan to -start using it for talon. - -.. _scikit-learn: http://scikit-learn.org -.. _ENRON: https://www.cs.cmu.edu/~enron/ -.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set -.. _forge: https://github.com/mailgun/forge - -Training on your dataset ------------------------- - -talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do: - -.. code:: python - - from talon.signature.learning.dataset import build_extraction_dataset - from talon.signature.learning import classifier as c - - build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") - c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") - -Note that for signature extraction you need just the folder with the positive samples with annotated signature lines (P folder). - -.. 
_forge: https://github.com/mailgun/forge - -Research --------- - -The library is inspired by the following research papers and projects: - -- http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf -- http://www.cs.cornell.edu/people/tj/publications/joachims_01a.pdf diff --git a/README.rst b/README.rst new file mode 120000 index 00000000..2ed370ef --- /dev/null +++ b/README.rst @@ -0,0 +1 @@ +talon/README.rst \ No newline at end of file diff --git a/run_tests.sh b/run_tests.sh index 19f1d59d..41313a2b 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash set -ex REPORT_PATH="${REPORT_PATH:-./}" -nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . +nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon --cover-package=talon_core talon talon-core diff --git a/setup.py b/setup.py deleted file mode 100755 index cf8e7e6e..00000000 --- a/setup.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import absolute_import -from setuptools import setup, find_packages -from setuptools.command.install import install - - -class InstallCommand(install): - user_options = install.user_options + [ - ('no-ml', None, "Don't install without Machine Learning modules."), - ] - - boolean_options = install.boolean_options + ['no-ml'] - - def initialize_options(self): - install.initialize_options(self) - self.no_ml = None - - def finalize_options(self): - install.finalize_options(self) - if self.no_ml: - dist = self.distribution - dist.packages=find_packages(exclude=[ - "tests", - "tests.*", - "talon.signature", - "talon.signature.*", - ]) - for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: - dist.install_requires.remove(not_required) - - -setup(name='talon', - version='1.6.0', - description=("Mailgun library " - "to extract message 
quotations and signatures."), - long_description=open("README.rst").read(), - author='Mailgun Inc.', - author_email='admin@mailgunhq.com', - url='https://github.com/mailgun/talon', - license='APACHE2', - cmdclass={ - 'install': InstallCommand, - }, - packages=find_packages(exclude=['tests', 'tests.*']), - include_package_data=True, - zip_safe=True, - install_requires=[ - "lxml", - "regex", - "numpy", - "scipy", - "scikit-learn>=1.0.0", - "chardet", - "cchardet", - "cssselect", - "six", - "html5lib", - "joblib", - ], - tests_require=[ - "mock", - "nose", - "coverage" - ] - ) diff --git a/LICENSE b/talon-core/LICENSE similarity index 100% rename from LICENSE rename to talon-core/LICENSE diff --git a/talon-core/MANIFEST.in b/talon-core/MANIFEST.in new file mode 100644 index 00000000..4984cc00 --- /dev/null +++ b/talon-core/MANIFEST.in @@ -0,0 +1,5 @@ +recursive-exclude tests *.pyc *~ +recursive-exclude talon_core *.pyc *~ +include LICENSE +include MANIFEST.in +include README.rst diff --git a/talon-core/README.rst b/talon-core/README.rst new file mode 100644 index 00000000..0dd92f0e --- /dev/null +++ b/talon-core/README.rst @@ -0,0 +1,6 @@ +talon-core +========== + +This is the part of talon that does not depend on NumPy, SciPy, or +scikit-learn, and does not include any machine learning functionality. +See the main talon package for documentation. 
diff --git a/talon-core/setup.py b/talon-core/setup.py new file mode 100644 index 00000000..773848fb --- /dev/null +++ b/talon-core/setup.py @@ -0,0 +1,32 @@ +from __future__ import absolute_import +from setuptools import setup, find_packages + + +setup(name='talon-core', + version='1.6.0', + description=("Mailgun library " + "to extract message quotations and signatures."), + long_description=open("README.rst").read(), + author='Mailgun Inc.', + author_email='admin@mailgunhq.com', + url='https://github.com/mailgun/talon', + license='APACHE2', + packages=find_packages(exclude=['tests', 'tests.*']), + include_package_data=True, + zip_safe=True, + install_requires=[ + "lxml", + "regex", + "chardet", + "cchardet", + "cssselect", + "six", + "html5lib", + "joblib", + ], + tests_require=[ + "mock", + "nose", + "coverage" + ] + ) diff --git a/talon-core/talon_core/__init__.py b/talon-core/talon_core/__init__.py new file mode 100644 index 00000000..a9c1dae8 --- /dev/null +++ b/talon-core/talon_core/__init__.py @@ -0,0 +1,6 @@ +from __future__ import absolute_import +from talon_core.quotations import register_xpath_extensions + + +def init(): + register_xpath_extensions() diff --git a/talon/constants.py b/talon-core/talon_core/constants.py similarity index 100% rename from talon/constants.py rename to talon-core/talon_core/constants.py diff --git a/talon/html_quotations.py b/talon-core/talon_core/html_quotations.py similarity index 99% rename from talon/html_quotations.py rename to talon-core/talon_core/html_quotations.py index a2db32d5..a78bc1c8 100644 --- a/talon/html_quotations.py +++ b/talon-core/talon_core/html_quotations.py @@ -6,7 +6,7 @@ from __future__ import absolute_import import regex as re -from talon.utils import cssselect +from talon_core.utils import cssselect CHECKPOINT_PREFIX = '#!%!' 
CHECKPOINT_SUFFIX = '!%!#' diff --git a/talon/quotations.py b/talon-core/talon_core/quotations.py similarity index 99% rename from talon/quotations.py rename to talon-core/talon_core/quotations.py index b244c6c0..c68e4b26 100644 --- a/talon/quotations.py +++ b/talon-core/talon_core/quotations.py @@ -14,9 +14,9 @@ from lxml import etree, html from six.moves import range -from talon import html_quotations -from talon.utils import (get_delimiter, html_document_fromstring, - html_tree_to_text) +from talon_core import html_quotations +from talon_core.utils import (get_delimiter, html_document_fromstring, + html_tree_to_text) log = logging.getLogger(__name__) diff --git a/talon/signature/learning/__init__.py b/talon-core/talon_core/signature/__init__.py similarity index 100% rename from talon/signature/learning/__init__.py rename to talon-core/talon_core/signature/__init__.py diff --git a/talon/signature/bruteforce.py b/talon-core/talon_core/signature/bruteforce.py similarity index 98% rename from talon/signature/bruteforce.py rename to talon-core/talon_core/signature/bruteforce.py index e502bab8..c1e71100 100644 --- a/talon/signature/bruteforce.py +++ b/talon-core/talon_core/signature/bruteforce.py @@ -4,9 +4,9 @@ import regex as re -from talon.signature.constants import (SIGNATURE_MAX_LINES, +from talon_core.signature.constants import (SIGNATURE_MAX_LINES, TOO_LONG_SIGNATURE_LINE) -from talon.utils import get_delimiter +from talon_core.utils import get_delimiter log = logging.getLogger(__name__) diff --git a/talon/signature/constants.py b/talon-core/talon_core/signature/constants.py similarity index 100% rename from talon/signature/constants.py rename to talon-core/talon_core/signature/constants.py diff --git a/talon/utils.py b/talon-core/talon_core/utils.py similarity index 98% rename from talon/utils.py rename to talon-core/talon_core/utils.py index b6b5559b..7bca48a5 100644 --- a/talon/utils.py +++ b/talon-core/talon_core/utils.py @@ -8,7 +8,7 @@ from lxml.etree 
import _Element from lxml.html import html5parser -from talon.constants import RE_DELIMITER +from talon_core.constants import RE_DELIMITER def get_delimiter(msg_body: str) -> str: diff --git a/talon-core/tests/__init__.py b/talon-core/tests/__init__.py new file mode 100644 index 00000000..30374353 --- /dev/null +++ b/talon-core/tests/__init__.py @@ -0,0 +1,8 @@ +from __future__ import absolute_import +from nose.tools import * +from mock import * + +import talon_core + + +talon_core.init() diff --git a/tests/fixtures/OLK_SRC_BODY_SECTION.html b/talon-core/tests/fixtures/OLK_SRC_BODY_SECTION.html similarity index 100% rename from tests/fixtures/OLK_SRC_BODY_SECTION.html rename to talon-core/tests/fixtures/OLK_SRC_BODY_SECTION.html diff --git a/talon-core/tests/fixtures/__init__.py b/talon-core/tests/fixtures/__init__.py new file mode 100644 index 00000000..6f3e5a49 --- /dev/null +++ b/talon-core/tests/fixtures/__init__.py @@ -0,0 +1,13 @@ +import os + +FIXTURES_DIR = os.path.dirname(__file__) +STANDARD_REPLIES = FIXTURES_DIR + "/standard_replies" + +with open(FIXTURES_DIR + "/reply-quotations-share-block.eml") as f: + REPLY_QUOTATIONS_SHARE_BLOCK = f.read() + +with open(FIXTURES_DIR + "/OLK_SRC_BODY_SECTION.html") as f: + OLK_SRC_BODY_SECTION = f.read() + +with open(FIXTURES_DIR + "/reply-separated-by-hr.html") as f: + REPLY_SEPARATED_BY_HR = f.read() diff --git a/tests/fixtures/html_replies/gmail.html b/talon-core/tests/fixtures/html_replies/gmail.html similarity index 100% rename from tests/fixtures/html_replies/gmail.html rename to talon-core/tests/fixtures/html_replies/gmail.html diff --git a/tests/fixtures/html_replies/hotmail.html b/talon-core/tests/fixtures/html_replies/hotmail.html similarity index 100% rename from tests/fixtures/html_replies/hotmail.html rename to talon-core/tests/fixtures/html_replies/hotmail.html diff --git a/tests/fixtures/html_replies/mail_ru.html b/talon-core/tests/fixtures/html_replies/mail_ru.html similarity index 100% rename from 
tests/fixtures/html_replies/mail_ru.html rename to talon-core/tests/fixtures/html_replies/mail_ru.html diff --git a/tests/fixtures/html_replies/ms_outlook_2003.html b/talon-core/tests/fixtures/html_replies/ms_outlook_2003.html similarity index 100% rename from tests/fixtures/html_replies/ms_outlook_2003.html rename to talon-core/tests/fixtures/html_replies/ms_outlook_2003.html diff --git a/tests/fixtures/html_replies/ms_outlook_2007.html b/talon-core/tests/fixtures/html_replies/ms_outlook_2007.html similarity index 100% rename from tests/fixtures/html_replies/ms_outlook_2007.html rename to talon-core/tests/fixtures/html_replies/ms_outlook_2007.html diff --git a/tests/fixtures/html_replies/ms_outlook_2010.html b/talon-core/tests/fixtures/html_replies/ms_outlook_2010.html similarity index 100% rename from tests/fixtures/html_replies/ms_outlook_2010.html rename to talon-core/tests/fixtures/html_replies/ms_outlook_2010.html diff --git a/tests/fixtures/html_replies/thunderbird.html b/talon-core/tests/fixtures/html_replies/thunderbird.html similarity index 100% rename from tests/fixtures/html_replies/thunderbird.html rename to talon-core/tests/fixtures/html_replies/thunderbird.html diff --git a/tests/fixtures/html_replies/windows_mail.html b/talon-core/tests/fixtures/html_replies/windows_mail.html similarity index 100% rename from tests/fixtures/html_replies/windows_mail.html rename to talon-core/tests/fixtures/html_replies/windows_mail.html diff --git a/tests/fixtures/html_replies/yandex_ru.html b/talon-core/tests/fixtures/html_replies/yandex_ru.html similarity index 100% rename from tests/fixtures/html_replies/yandex_ru.html rename to talon-core/tests/fixtures/html_replies/yandex_ru.html diff --git a/tests/fixtures/reply-quotations-share-block.eml b/talon-core/tests/fixtures/reply-quotations-share-block.eml similarity index 100% rename from tests/fixtures/reply-quotations-share-block.eml rename to talon-core/tests/fixtures/reply-quotations-share-block.eml diff --git 
a/tests/fixtures/reply-separated-by-hr.html b/talon-core/tests/fixtures/reply-separated-by-hr.html similarity index 100% rename from tests/fixtures/reply-separated-by-hr.html rename to talon-core/tests/fixtures/reply-separated-by-hr.html diff --git a/tests/fixtures/standard_replies/android.eml b/talon-core/tests/fixtures/standard_replies/android.eml similarity index 100% rename from tests/fixtures/standard_replies/android.eml rename to talon-core/tests/fixtures/standard_replies/android.eml diff --git a/tests/fixtures/standard_replies/aol.eml b/talon-core/tests/fixtures/standard_replies/aol.eml similarity index 100% rename from tests/fixtures/standard_replies/aol.eml rename to talon-core/tests/fixtures/standard_replies/aol.eml diff --git a/tests/fixtures/standard_replies/apple_mail.eml b/talon-core/tests/fixtures/standard_replies/apple_mail.eml similarity index 100% rename from tests/fixtures/standard_replies/apple_mail.eml rename to talon-core/tests/fixtures/standard_replies/apple_mail.eml diff --git a/tests/fixtures/standard_replies/apple_mail_2.eml b/talon-core/tests/fixtures/standard_replies/apple_mail_2.eml similarity index 100% rename from tests/fixtures/standard_replies/apple_mail_2.eml rename to talon-core/tests/fixtures/standard_replies/apple_mail_2.eml diff --git a/tests/fixtures/standard_replies/comcast.eml b/talon-core/tests/fixtures/standard_replies/comcast.eml similarity index 100% rename from tests/fixtures/standard_replies/comcast.eml rename to talon-core/tests/fixtures/standard_replies/comcast.eml diff --git a/tests/fixtures/standard_replies/gmail.eml b/talon-core/tests/fixtures/standard_replies/gmail.eml similarity index 100% rename from tests/fixtures/standard_replies/gmail.eml rename to talon-core/tests/fixtures/standard_replies/gmail.eml diff --git a/tests/fixtures/standard_replies/hotmail.eml b/talon-core/tests/fixtures/standard_replies/hotmail.eml similarity index 100% rename from tests/fixtures/standard_replies/hotmail.eml rename to 
talon-core/tests/fixtures/standard_replies/hotmail.eml diff --git a/tests/fixtures/standard_replies/iphone.eml b/talon-core/tests/fixtures/standard_replies/iphone.eml similarity index 100% rename from tests/fixtures/standard_replies/iphone.eml rename to talon-core/tests/fixtures/standard_replies/iphone.eml diff --git a/tests/fixtures/standard_replies/iphone_reply_text b/talon-core/tests/fixtures/standard_replies/iphone_reply_text similarity index 100% rename from tests/fixtures/standard_replies/iphone_reply_text rename to talon-core/tests/fixtures/standard_replies/iphone_reply_text diff --git a/tests/fixtures/standard_replies/outlook.eml b/talon-core/tests/fixtures/standard_replies/outlook.eml similarity index 100% rename from tests/fixtures/standard_replies/outlook.eml rename to talon-core/tests/fixtures/standard_replies/outlook.eml diff --git a/tests/fixtures/standard_replies/sparrow.eml b/talon-core/tests/fixtures/standard_replies/sparrow.eml similarity index 100% rename from tests/fixtures/standard_replies/sparrow.eml rename to talon-core/tests/fixtures/standard_replies/sparrow.eml diff --git a/tests/fixtures/standard_replies/sparrow_reply_text b/talon-core/tests/fixtures/standard_replies/sparrow_reply_text similarity index 100% rename from tests/fixtures/standard_replies/sparrow_reply_text rename to talon-core/tests/fixtures/standard_replies/sparrow_reply_text diff --git a/tests/fixtures/standard_replies/thunderbird.eml b/talon-core/tests/fixtures/standard_replies/thunderbird.eml similarity index 100% rename from tests/fixtures/standard_replies/thunderbird.eml rename to talon-core/tests/fixtures/standard_replies/thunderbird.eml diff --git a/tests/fixtures/standard_replies/yahoo.eml b/talon-core/tests/fixtures/standard_replies/yahoo.eml similarity index 100% rename from tests/fixtures/standard_replies/yahoo.eml rename to talon-core/tests/fixtures/standard_replies/yahoo.eml diff --git a/tests/html_quotations_test.py b/talon-core/tests/html_quotations_test.py 
similarity index 92% rename from tests/html_quotations_test.py rename to talon-core/tests/html_quotations_test.py index 85871e73..b5347d13 100644 --- a/tests/html_quotations_test.py +++ b/talon-core/tests/html_quotations_test.py @@ -8,10 +8,11 @@ from nose.tools import assert_false, assert_true, eq_, ok_ -from tests.fixtures import (OLK_SRC_BODY_SECTION, +from tests.fixtures import (FIXTURES_DIR, + OLK_SRC_BODY_SECTION, REPLY_QUOTATIONS_SHARE_BLOCK, REPLY_SEPARATED_BY_HR) -from talon import quotations, utils as u +from talon_core import quotations, utils as u RE_WHITESPACE = re.compile(r"\s") RE_DOUBLE_WHITESPACE = re.compile(r"\s") @@ -323,39 +324,39 @@ def extract_reply_and_check(filename): def test_gmail_reply(): - extract_reply_and_check("tests/fixtures/html_replies/gmail.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/gmail.html") def test_mail_ru_reply(): - extract_reply_and_check("tests/fixtures/html_replies/mail_ru.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/mail_ru.html") def test_hotmail_reply(): - extract_reply_and_check("tests/fixtures/html_replies/hotmail.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/hotmail.html") def test_ms_outlook_2003_reply(): - extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2003.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/ms_outlook_2003.html") def test_ms_outlook_2007_reply(): - extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/ms_outlook_2007.html") def test_ms_outlook_2010_reply(): - extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/ms_outlook_2010.html") def test_thunderbird_reply(): - extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/thunderbird.html") def test_windows_mail_reply(): - 
extract_reply_and_check("tests/fixtures/html_replies/windows_mail.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/windows_mail.html") def test_yandex_ru_reply(): - extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") + extract_reply_and_check(FIXTURES_DIR + "/html_replies/yandex_ru.html") def test_CRLF(): diff --git a/tests/quotations_test.py b/talon-core/tests/quotations_test.py similarity index 96% rename from tests/quotations_test.py rename to talon-core/tests/quotations_test.py index e5ed0416..6aa9f9dd 100644 --- a/tests/quotations_test.py +++ b/talon-core/tests/quotations_test.py @@ -4,7 +4,7 @@ from . import * from . fixtures import * -from talon import quotations +from talon_core import quotations @patch.object(quotations, 'extract_from_html') diff --git a/tests/signature/__init__.py b/talon-core/tests/signature/__init__.py similarity index 100% rename from tests/signature/__init__.py rename to talon-core/tests/signature/__init__.py diff --git a/tests/signature/bruteforce_test.py b/talon-core/tests/signature/bruteforce_test.py similarity index 99% rename from tests/signature/bruteforce_test.py rename to talon-core/tests/signature/bruteforce_test.py index 382615bb..47b4efd7 100644 --- a/tests/signature/bruteforce_test.py +++ b/talon-core/tests/signature/bruteforce_test.py @@ -3,7 +3,7 @@ from __future__ import absolute_import from .. 
import * -from talon.signature import bruteforce +from talon_core.signature import bruteforce def test_empty_body(): diff --git a/tests/text_quotations_test.py b/talon-core/tests/text_quotations_test.py similarity index 99% rename from tests/text_quotations_test.py rename to talon-core/tests/text_quotations_test.py index 0cf7d4b0..bf9048d4 100644 --- a/tests/text_quotations_test.py +++ b/talon-core/tests/text_quotations_test.py @@ -7,7 +7,7 @@ import os import email.iterators -from talon import quotations +from talon_core import quotations import six from six.moves import range from six import StringIO diff --git a/tests/utils_test.py b/talon-core/tests/utils_test.py similarity index 97% rename from tests/utils_test.py rename to talon-core/tests/utils_test.py index 0027752e..3c02a2ce 100644 --- a/tests/utils_test.py +++ b/talon-core/tests/utils_test.py @@ -2,7 +2,7 @@ from __future__ import absolute_import -from talon import utils as u +from talon_core import utils as u from . import * diff --git a/talon/LICENSE b/talon/LICENSE new file mode 100644 index 00000000..e06d2081 --- /dev/null +++ b/talon/LICENSE @@ -0,0 +1,202 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/MANIFEST.in b/talon/MANIFEST.in similarity index 100% rename from MANIFEST.in rename to talon/MANIFEST.in diff --git a/talon/README.rst b/talon/README.rst new file mode 100644 index 00000000..6ba6d73c --- /dev/null +++ b/talon/README.rst @@ -0,0 +1,155 @@ +talon +===== + +Mailgun library to extract message quotations and signatures. + +If you ever tried to parse message quotations or signatures you know that absence of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. That’s what a good quotations and signature parser should be like :smile: + +Usage +----- + +Here’s how you initialize the library and extract a reply from a text +message: + +.. code:: python + + import talon + from talon import quotations + + talon.init() + + text = """Reply + + -----Original Message----- + + Quote""" + + reply = quotations.extract_from(text, 'text/plain') + reply = quotations.extract_from_plain(text) + # reply == "Reply" + +To extract a reply from html: + +.. code:: python + + html = """Reply +
<blockquote> + + <div>
+ On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote: + </div>
+ + <div>
+ Quote + </div>
+ + </blockquote>""" + + reply = quotations.extract_from(html, 'text/html') + reply = quotations.extract_from_html(html) + # reply == "<html><body><p>Reply</p></body></html>
" + +Often the best way is the easiest one. Here’s how you can extract +signature from email message without any +machine learning fancy stuff: + +.. code:: python + + from talon.signature.bruteforce import extract_signature + + + message = """Wow. Awesome! + -- + Bob Smith""" + + text, signature = extract_signature(message) + # text == "Wow. Awesome!" + # signature == "--\nBob Smith" + +Quick and works like a charm 90% of the time. For other 10% you can use +the power of machine learning algorithms: + +.. code:: python + + import talon + # don't forget to init the library first + # it loads machine learning classifiers + talon.init() + + from talon import signature + + + message = """Thanks Sasha, I can't go any higher and is why I limited it to the + homepage. + + John Doe + via mobile""" + + text, signature = signature.extract(message, sender='john.doe@example.com') + # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." + # signature == "John Doe\nvia mobile" + +For machine learning talon currently uses the `scikit-learn`_ library to build SVM +classifiers. The core of machine learning algorithm lays in +``talon.signature.learning package``. It defines a set of features to +apply to a message (``featurespace.py``), how data sets are built +(``dataset.py``), classifier’s interface (``classifier.py``). + +Currently the data used for training is taken from our personal email +conversations and from `ENRON`_ dataset. As a result of applying our set +of features to the dataset we provide files ``classifier`` and +``train.data`` that don’t have any personal information but could be +used to load trained classifier. Those files should be regenerated every +time the feature/data set is changed. + +To regenerate the model files, you can run + +.. code:: sh + + python train.py + +or + +.. 
code:: python + + from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA + from talon.signature.learning.classifier import train, init + train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + +Open-source Dataset +------------------- + +Recently we started a `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we +used a subset of `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment over 190 +emails are annotated. Any contribution and collaboration on the project are welcome. Once the dataset is ready we plan to +start using it for talon. + +.. _scikit-learn: http://scikit-learn.org +.. _ENRON: https://www.cs.cmu.edu/~enron/ +.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set +.. _forge: https://github.com/mailgun/forge + +Training on your dataset +------------------------ + +talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do: + +.. code:: python + + from talon.signature.learning.dataset import build_extraction_dataset + from talon.signature.learning import classifier as c + + build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") + c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") + +Note that for signature extraction you need just the folder with the positive samples with annotated signature lines (P folder). + +.. 
_forge: https://github.com/mailgun/forge + +Research +-------- + +The library is inspired by the following research papers and projects: + +- http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf +- http://www.cs.cornell.edu/people/tj/publications/joachims_01a.pdf diff --git a/talon/setup.py b/talon/setup.py new file mode 100644 index 00000000..d68bb667 --- /dev/null +++ b/talon/setup.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +from setuptools import setup, find_packages + + +setup(name='talon', + version='1.6.0', + description=("Mailgun library " + "to extract message quotations and signatures."), + long_description=open("README.rst").read(), + author='Mailgun Inc.', + author_email='admin@mailgunhq.com', + url='https://github.com/mailgun/talon', + license='APACHE2', + packages=find_packages(exclude=['tests', 'tests.*']), + include_package_data=True, + zip_safe=True, + install_requires=[ + "talon-core", + "regex>=1", + "numpy", + "scipy", + "scikit-learn>=1.0.0", + "six", + ], + tests_require=[ + "mock", + "nose", + "coverage" + ] + ) diff --git a/talon/__init__.py b/talon/talon/__init__.py similarity index 70% rename from talon/__init__.py rename to talon/talon/__init__.py index 7060f5b0..bdb39815 100644 --- a/talon/__init__.py +++ b/talon/talon/__init__.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -from talon.quotations import register_xpath_extensions +import talon_core try: from talon import signature ML_ENABLED = True @@ -8,6 +8,6 @@ def init(): - register_xpath_extensions() + talon_core.init() if ML_ENABLED: signature.initialize() diff --git a/talon/talon/constants.py b/talon/talon/constants.py new file mode 100644 index 00000000..509d9aef --- /dev/null +++ b/talon/talon/constants.py @@ -0,0 +1 @@ +from talon_core.constants import * diff --git a/talon/talon/html_quotations.py b/talon/talon/html_quotations.py new file mode 100644 index 00000000..b0ec6b63 --- /dev/null +++ b/talon/talon/html_quotations.py @@ -0,0 +1 @@ 
+from talon_core.html_quotations import * diff --git a/talon/talon/quotations.py b/talon/talon/quotations.py new file mode 100644 index 00000000..bceae1c9 --- /dev/null +++ b/talon/talon/quotations.py @@ -0,0 +1 @@ +from talon_core.quotations import * diff --git a/talon/signature/__init__.py b/talon/talon/signature/__init__.py similarity index 100% rename from talon/signature/__init__.py rename to talon/talon/signature/__init__.py diff --git a/talon/talon/signature/bruteforce.py b/talon/talon/signature/bruteforce.py new file mode 100644 index 00000000..7ffe0480 --- /dev/null +++ b/talon/talon/signature/bruteforce.py @@ -0,0 +1 @@ +from talon_core.signature.bruteforce import * diff --git a/talon/talon/signature/constants.py b/talon/talon/signature/constants.py new file mode 100644 index 00000000..a4e19682 --- /dev/null +++ b/talon/talon/signature/constants.py @@ -0,0 +1 @@ +from talon_core.signature.constants import * diff --git a/talon/signature/data/__init__.py b/talon/talon/signature/data/__init__.py similarity index 100% rename from talon/signature/data/__init__.py rename to talon/talon/signature/data/__init__.py diff --git a/talon/signature/data/classifier b/talon/talon/signature/data/classifier similarity index 100% rename from talon/signature/data/classifier rename to talon/talon/signature/data/classifier diff --git a/talon/signature/data/classifier_01.npy b/talon/talon/signature/data/classifier_01.npy similarity index 100% rename from talon/signature/data/classifier_01.npy rename to talon/talon/signature/data/classifier_01.npy diff --git a/talon/signature/data/classifier_02.npy b/talon/talon/signature/data/classifier_02.npy similarity index 100% rename from talon/signature/data/classifier_02.npy rename to talon/talon/signature/data/classifier_02.npy diff --git a/talon/signature/data/classifier_03.npy b/talon/talon/signature/data/classifier_03.npy similarity index 100% rename from talon/signature/data/classifier_03.npy rename to 
talon/talon/signature/data/classifier_03.npy diff --git a/talon/signature/data/classifier_04.npy b/talon/talon/signature/data/classifier_04.npy similarity index 100% rename from talon/signature/data/classifier_04.npy rename to talon/talon/signature/data/classifier_04.npy diff --git a/talon/signature/data/classifier_05.npy b/talon/talon/signature/data/classifier_05.npy similarity index 100% rename from talon/signature/data/classifier_05.npy rename to talon/talon/signature/data/classifier_05.npy diff --git a/talon/signature/data/train.data b/talon/talon/signature/data/train.data similarity index 100% rename from talon/signature/data/train.data rename to talon/talon/signature/data/train.data diff --git a/talon/signature/extraction.py b/talon/talon/signature/extraction.py similarity index 100% rename from talon/signature/extraction.py rename to talon/talon/signature/extraction.py diff --git a/tests/signature/learning/__init__.py b/talon/talon/signature/learning/__init__.py similarity index 100% rename from tests/signature/learning/__init__.py rename to talon/talon/signature/learning/__init__.py diff --git a/talon/signature/learning/classifier.py b/talon/talon/signature/learning/classifier.py similarity index 100% rename from talon/signature/learning/classifier.py rename to talon/talon/signature/learning/classifier.py diff --git a/talon/signature/learning/dataset.py b/talon/talon/signature/learning/dataset.py similarity index 100% rename from talon/signature/learning/dataset.py rename to talon/talon/signature/learning/dataset.py diff --git a/talon/signature/learning/featurespace.py b/talon/talon/signature/learning/featurespace.py similarity index 100% rename from talon/signature/learning/featurespace.py rename to talon/talon/signature/learning/featurespace.py diff --git a/talon/signature/learning/helpers.py b/talon/talon/signature/learning/helpers.py similarity index 100% rename from talon/signature/learning/helpers.py rename to talon/talon/signature/learning/helpers.py 
diff --git a/talon/talon/utils.py b/talon/talon/utils.py new file mode 100644 index 00000000..0518280f --- /dev/null +++ b/talon/talon/utils.py @@ -0,0 +1 @@ +from talon_core.utils import * diff --git a/talon/tests/__init__.py b/talon/tests/__init__.py new file mode 100644 index 00000000..8a965e2a --- /dev/null +++ b/talon/tests/__init__.py @@ -0,0 +1,21 @@ +from __future__ import absolute_import +import os +from nose.tools import * +from mock import * + +import talon + + +FIXTURES_DIR = os.path.dirname(__file__) + "/fixtures" +EML_MSG_FILENAME = FIXTURES_DIR + "/standard_replies/yahoo.eml" +MSG_FILENAME_WITH_BODY_SUFFIX = (FIXTURES_DIR + "/signature/emails/P/" + "johndoeexamplecom_body") +EMAILS_DIR = FIXTURES_DIR + "/signature/emails" +TMP_DIR = FIXTURES_DIR + "/signature/tmp" + +STRIPPED = FIXTURES_DIR + "/signature/emails/stripped/" +UNICODE_MSG = (FIXTURES_DIR + "/signature/emails/P/" + "unicode_msg") + + +talon.init() diff --git a/tests/fixtures/signature/emails/stripped/camel_case_sender b/talon/tests/fixtures/__init__.py similarity index 100% rename from tests/fixtures/signature/emails/stripped/camel_case_sender rename to talon/tests/fixtures/__init__.py diff --git a/tests/fixtures/signature/emails/P/102682_R_S b/talon/tests/fixtures/signature/emails/P/102682_R_S similarity index 100% rename from tests/fixtures/signature/emails/P/102682_R_S rename to talon/tests/fixtures/signature/emails/P/102682_R_S diff --git a/tests/fixtures/signature/emails/P/johndoeexamplecom_body b/talon/tests/fixtures/signature/emails/P/johndoeexamplecom_body similarity index 100% rename from tests/fixtures/signature/emails/P/johndoeexamplecom_body rename to talon/tests/fixtures/signature/emails/P/johndoeexamplecom_body diff --git a/tests/fixtures/signature/emails/P/johndoeexamplecom_sender b/talon/tests/fixtures/signature/emails/P/johndoeexamplecom_sender similarity index 100% rename from tests/fixtures/signature/emails/P/johndoeexamplecom_sender rename to 
talon/tests/fixtures/signature/emails/P/johndoeexamplecom_sender diff --git a/tests/fixtures/signature/emails/P/unicode_msg b/talon/tests/fixtures/signature/emails/P/unicode_msg similarity index 100% rename from tests/fixtures/signature/emails/P/unicode_msg rename to talon/tests/fixtures/signature/emails/P/unicode_msg diff --git a/tests/fixtures/signature/emails/stripped/camel_case_body b/talon/tests/fixtures/signature/emails/stripped/camel_case_body similarity index 100% rename from tests/fixtures/signature/emails/stripped/camel_case_body rename to talon/tests/fixtures/signature/emails/stripped/camel_case_body diff --git a/tests/fixtures/signature/emails/stripped/camel_case_signature b/talon/tests/fixtures/signature/emails/stripped/camel_case_sender similarity index 100% rename from tests/fixtures/signature/emails/stripped/camel_case_signature rename to talon/tests/fixtures/signature/emails/stripped/camel_case_sender diff --git a/talon/tests/fixtures/signature/emails/stripped/camel_case_signature b/talon/tests/fixtures/signature/emails/stripped/camel_case_signature new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/signature/emails/stripped/jeff_body b/talon/tests/fixtures/signature/emails/stripped/jeff_body similarity index 100% rename from tests/fixtures/signature/emails/stripped/jeff_body rename to talon/tests/fixtures/signature/emails/stripped/jeff_body diff --git a/tests/fixtures/signature/emails/stripped/jeff_sender b/talon/tests/fixtures/signature/emails/stripped/jeff_sender similarity index 100% rename from tests/fixtures/signature/emails/stripped/jeff_sender rename to talon/tests/fixtures/signature/emails/stripped/jeff_sender diff --git a/tests/fixtures/signature/emails/stripped/jeff_signature b/talon/tests/fixtures/signature/emails/stripped/jeff_signature similarity index 100% rename from tests/fixtures/signature/emails/stripped/jeff_signature rename to talon/tests/fixtures/signature/emails/stripped/jeff_signature diff --git 
a/tests/fixtures/signature/emails/stripped/johndoeexamplecom_body b/talon/tests/fixtures/signature/emails/stripped/johndoeexamplecom_body similarity index 100% rename from tests/fixtures/signature/emails/stripped/johndoeexamplecom_body rename to talon/tests/fixtures/signature/emails/stripped/johndoeexamplecom_body diff --git a/tests/fixtures/signature/emails/stripped/johndoeexamplecom_sender b/talon/tests/fixtures/signature/emails/stripped/johndoeexamplecom_sender similarity index 100% rename from tests/fixtures/signature/emails/stripped/johndoeexamplecom_sender rename to talon/tests/fixtures/signature/emails/stripped/johndoeexamplecom_sender diff --git a/tests/fixtures/signature/emails/stripped/johndoeexamplecom_signature b/talon/tests/fixtures/signature/emails/stripped/johndoeexamplecom_signature similarity index 100% rename from tests/fixtures/signature/emails/stripped/johndoeexamplecom_signature rename to talon/tests/fixtures/signature/emails/stripped/johndoeexamplecom_signature diff --git a/tests/fixtures/signature/emails/stripped/long_body b/talon/tests/fixtures/signature/emails/stripped/long_body similarity index 100% rename from tests/fixtures/signature/emails/stripped/long_body rename to talon/tests/fixtures/signature/emails/stripped/long_body diff --git a/tests/fixtures/signature/emails/stripped/long_sender b/talon/tests/fixtures/signature/emails/stripped/long_sender similarity index 100% rename from tests/fixtures/signature/emails/stripped/long_sender rename to talon/tests/fixtures/signature/emails/stripped/long_sender diff --git a/tests/fixtures/signature/emails/stripped/long_signature b/talon/tests/fixtures/signature/emails/stripped/long_signature similarity index 100% rename from tests/fixtures/signature/emails/stripped/long_signature rename to talon/tests/fixtures/signature/emails/stripped/long_signature diff --git a/tests/fixtures/signature/emails/stripped/short_url_body b/talon/tests/fixtures/signature/emails/stripped/short_url_body similarity 
index 100% rename from tests/fixtures/signature/emails/stripped/short_url_body rename to talon/tests/fixtures/signature/emails/stripped/short_url_body diff --git a/tests/fixtures/signature/emails/stripped/short_url_sender b/talon/tests/fixtures/signature/emails/stripped/short_url_sender similarity index 100% rename from tests/fixtures/signature/emails/stripped/short_url_sender rename to talon/tests/fixtures/signature/emails/stripped/short_url_sender diff --git a/tests/fixtures/signature/emails/stripped/short_url_signature b/talon/tests/fixtures/signature/emails/stripped/short_url_signature similarity index 100% rename from tests/fixtures/signature/emails/stripped/short_url_signature rename to talon/tests/fixtures/signature/emails/stripped/short_url_signature diff --git a/tests/fixtures/signature/emails/stripped/sparse_body b/talon/tests/fixtures/signature/emails/stripped/sparse_body similarity index 100% rename from tests/fixtures/signature/emails/stripped/sparse_body rename to talon/tests/fixtures/signature/emails/stripped/sparse_body diff --git a/tests/fixtures/signature/emails/stripped/sparse_sender b/talon/tests/fixtures/signature/emails/stripped/sparse_sender similarity index 100% rename from tests/fixtures/signature/emails/stripped/sparse_sender rename to talon/tests/fixtures/signature/emails/stripped/sparse_sender diff --git a/tests/fixtures/signature/emails/stripped/sparse_signature b/talon/tests/fixtures/signature/emails/stripped/sparse_signature similarity index 100% rename from tests/fixtures/signature/emails/stripped/sparse_signature rename to talon/tests/fixtures/signature/emails/stripped/sparse_signature diff --git a/tests/fixtures/signature/tmp/.gitignore b/talon/tests/fixtures/signature/tmp/.gitignore similarity index 100% rename from tests/fixtures/signature/tmp/.gitignore rename to talon/tests/fixtures/signature/tmp/.gitignore diff --git a/talon/tests/fixtures/standard_replies/yahoo.eml b/talon/tests/fixtures/standard_replies/yahoo.eml new file 
mode 100644 index 00000000..49692554 --- /dev/null +++ b/talon/tests/fixtures/standard_replies/yahoo.eml @@ -0,0 +1,22 @@ +Content-Type: text/plain; charset="us-ascii" +MIME-Version: 1.0 +X-Mailer: YahooMailWebService/0.8.117.340979 +Message-Id: <1333374330.68772.YahooMailNeo@web114411.mail.gq1.yahoo.com> +Date: Mon, 2 Apr 2012 06:45:30 -0700 (PDT) +From: Alex Q +Subject: Re: Test +To: "bob@xxx.mailgun.org" +In-Reply-To: <1333374262.7063.15.camel@mg5> +Content-Transfer-Encoding: 7bit + +Hello + + +----- Original Message ----- +From: "bob@xxx.mailgun.org" +To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com +Cc: +Sent: Monday, April 2, 2012 5:44 PM +Subject: Test + +Hi diff --git a/talon/tests/signature/__init__.py b/talon/tests/signature/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/signature/extraction_test.py b/talon/tests/signature/extraction_test.py similarity index 98% rename from tests/signature/extraction_test.py rename to talon/tests/signature/extraction_test.py index b9426748..657612b4 100644 --- a/tests/signature/extraction_test.py +++ b/talon/tests/signature/extraction_test.py @@ -6,7 +6,8 @@ from six.moves import range -from talon.signature import bruteforce, extraction, extract +from talon_core.signature import bruteforce +from talon.signature import extraction, extract from talon.signature import extraction as e from talon.signature.learning import dataset from .. 
import * diff --git a/talon/tests/signature/learning/__init__.py b/talon/tests/signature/learning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/signature/learning/dataset_test.py b/talon/tests/signature/learning/dataset_test.py similarity index 100% rename from tests/signature/learning/dataset_test.py rename to talon/tests/signature/learning/dataset_test.py diff --git a/tests/signature/learning/featurespace_test.py b/talon/tests/signature/learning/featurespace_test.py similarity index 100% rename from tests/signature/learning/featurespace_test.py rename to talon/tests/signature/learning/featurespace_test.py diff --git a/tests/signature/learning/helpers_test.py b/talon/tests/signature/learning/helpers_test.py similarity index 100% rename from tests/signature/learning/helpers_test.py rename to talon/tests/signature/learning/helpers_test.py diff --git a/train.py b/talon/train.py similarity index 100% rename from train.py rename to talon/train.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 8fdebd6b..00000000 --- a/tests/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import absolute_import -from nose.tools import * -from mock import * - -import talon - - -EML_MSG_FILENAME = "tests/fixtures/standard_replies/yahoo.eml" -MSG_FILENAME_WITH_BODY_SUFFIX = ("tests/fixtures/signature/emails/P/" - "johndoeexamplecom_body") -EMAILS_DIR = "tests/fixtures/signature/emails" -TMP_DIR = "tests/fixtures/signature/tmp" - -STRIPPED = "tests/fixtures/signature/emails/stripped/" -UNICODE_MSG = ("tests/fixtures/signature/emails/P/" - "unicode_msg") - - -talon.init() diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py deleted file mode 100644 index dc2eb041..00000000 --- a/tests/fixtures/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -STANDARD_REPLIES = "tests/fixtures/standard_replies" - -with open("tests/fixtures/reply-quotations-share-block.eml") as f: - REPLY_QUOTATIONS_SHARE_BLOCK = f.read() - 
-with open("tests/fixtures/OLK_SRC_BODY_SECTION.html") as f: - OLK_SRC_BODY_SECTION = f.read() - -with open("tests/fixtures/reply-separated-by-hr.html") as f: - REPLY_SEPARATED_BY_HR = f.read()