diff --git a/notebooks/Debugging scikit-learn text classification pipeline.ipynb b/notebooks/Debugging scikit-learn text classification pipeline.ipynb index 2baa6f8b..6e520ecc 100644 --- a/notebooks/Debugging scikit-learn text classification pipeline.ipynb +++ b/notebooks/Debugging scikit-learn text classification pipeline.ipynb @@ -49,9 +49,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", @@ -81,9 +79,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -94,7 +90,7 @@ " alt.atheism 0.93 0.80 0.86 319\n", " comp.graphics 0.87 0.96 0.91 389\n", " sci.med 0.94 0.81 0.87 396\n", - "soc.religion.christian 0.85 0.98 0.91 398\n", + "soc.religion.christian 0.85 0.97 0.91 398\n", "\n", " avg / total 0.90 0.89 0.89 1502\n", "\n", @@ -126,10 +122,16 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/kostia/shub/memex/eli5/venv/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", + " \"This module will be removed in 0.20.\", DeprecationWarning)\n" + ] + }, { "data": { "text/html": [ @@ -233,29 +235,29 @@ " \n", "
\n", " \n", - "\n", - " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + " from: brian@ucsd.edu (brian kantor)\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", - "\t- brian\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", + "\t- brian\n", "\n", "
\n", "\n", @@ -1812,7 +1808,7 @@ "\n", " \n", " \n", - "\n", " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", "\t- brian\n", "\n", "
\n", @@ -1876,7 +1872,7 @@ "\n", "\n", " \n", - " (probability 0.989, score 3.945)\n", + " (probability 0.989, score 3.955)\n", "\n", "top features\n", " \n", @@ -1896,9 +1892,9 @@ " \n", "\n", - " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + " from: brian@ucsd.edu (brian kantor)\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", "\t- brian\n", "\n", "
\n", @@ -1988,9 +1984,9 @@ "\n", " \n", " \n", - "\n", - " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + " from: brian@ucsd.edu (brian kantor)\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", - "\t- brian\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", + "\t- brian\n", "\n", "
\n", "\n", @@ -2106,9 +2102,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "twenty_train = fetch_20newsgroups(\n", @@ -2142,9 +2136,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2152,14 +2144,14 @@ "text": [ " precision recall f1-score support\n", "\n", - " alt.atheism 0.83 0.78 0.80 319\n", - " comp.graphics 0.82 0.96 0.88 389\n", - " sci.med 0.89 0.80 0.84 396\n", - "soc.religion.christian 0.88 0.86 0.87 398\n", + " alt.atheism 0.84 0.77 0.81 319\n", + " comp.graphics 0.83 0.95 0.89 389\n", + " sci.med 0.90 0.79 0.84 396\n", + "soc.religion.christian 0.86 0.90 0.88 398\n", "\n", - " avg / total 0.85 0.85 0.85 1502\n", + " avg / total 0.86 0.86 0.86 1502\n", "\n", - "accuracy: 0.852\n" + "accuracy: 0.858\n" ] } ], @@ -2181,9 +2173,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2236,7 +2226,7 @@ "\n", "\n", " \n", - " (probability 0.732, score 0.031)\n", + " (probability 0.711, score 0.031)\n", "\n", "top features\n", " \n", @@ -2289,14 +2279,14 @@ "\n", "\n", "\n", - " as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", + "either they pass, or they have to be broken up with sound, or they have\n", "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and 
the childbirth hurt less.\n", "
\n", "\n", "\n", @@ -2366,9 +2356,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2399,9 +2387,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2584,9 +2570,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2619,9 +2603,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2674,7 +2656,7 @@ "\n", "\n", " \n", - " (probability 0.987, score 1.585)\n", + " (probability 0.987, score 1.584)\n", "\n", "top features\n", " \n", @@ -2710,7 +2692,7 @@ " \n", "\n", - " as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "
\n", "\n", "\n", @@ -2800,9 +2782,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2833,9 +2813,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2941,14 +2919,14 @@ "\n", "\n", "\n", - " as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", "either they pass, or they have to be broken up with sound, or they have\n", "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "
\n", "\n", "\n", @@ -3008,7 +2986,226 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This starts to look good! \n", + "This starts to look good!\n", + "\n", + "Another thing we might notice if we look carefully is that in the word \"x-ray\",\n", + "a clearly medical term, only the \"ray\" part is highlighted, and it's treated as a negative feature.\n", + "The reason is that the default scikit-learn tokenizer uses the ``r\"(?u)\\b\\w\\w+\\b\"`` pattern,\n", + "which discards \"x\" and \"-\". We can fix that by passing a custom ``token_pattern`` argument that\n", + "will parse \"x-ray\" as a single token.\n", + "Another option would be to allow one-character tokens and\n", + "add bigrams - you can try that too!\n", + "Here is the pipeline with a custom ``token_pattern``:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " alt.atheism 0.92 0.77 0.84 319\n", + " comp.graphics 0.83 0.97 0.90 389\n", + " sci.med 0.95 0.89 0.92 396\n", + "soc.religion.christian 0.89 0.91 0.90 398\n", + "\n", + " avg / total 0.90 0.89 0.89 1502\n", + "\n", + "accuracy: 0.892\n" + ] + } + ], + "source": [ + "vec = TfidfVectorizer(stop_words='english', token_pattern=r\"(?u)\\b\\w[\\w\\-]+\\b\")\n", + "clf = LogisticRegressionCV()\n", + "pipe = make_pipeline(vec, clf)\n", + "pipe.fit(twenty_train.data, twenty_train.target)\n", + "\n", + "print_report(pipe)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " y=sci.med\n", + 
" \n", + "\n", + "\n", + " \n", + " (probability 0.958, score 1.883)\n", + "\n", + "top features\n", + "
\n", + " \n", + "\n", + " Contribution?\n", + " | \n", + " \n", + "Feature | \n", + " \n", + "
---|---|
\n", + " +5.461\n", + " | \n", + "\n", + " Highlighted in text (sum)\n", + " | \n", + " \n", + "
\n", + " -3.578\n", + " | \n", + "\n", + " <BIAS>\n", + " | \n", + " \n", + "
\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", + "\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", + "\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", + "
\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "