From 809cc4329009be543a4f83eea675b5e68a9a60c1 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 27 Apr 2017 22:54:53 +0300 Subject: [PATCH] A small pipeline tweak: tokenization (x-ray) "x-ray" was tokenized as "ray": fix that by changing default tokenizer. --- ...t-learn text classification pipeline.ipynb | 845 +++++++++++------- 1 file changed, 509 insertions(+), 336 deletions(-) diff --git a/notebooks/Debugging scikit-learn text classification pipeline.ipynb b/notebooks/Debugging scikit-learn text classification pipeline.ipynb index 2baa6f8b..6e520ecc 100644 --- a/notebooks/Debugging scikit-learn text classification pipeline.ipynb +++ b/notebooks/Debugging scikit-learn text classification pipeline.ipynb @@ -49,9 +49,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", @@ -81,9 +79,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -94,7 +90,7 @@ " alt.atheism 0.93 0.80 0.86 319\n", " comp.graphics 0.87 0.96 0.91 389\n", " sci.med 0.94 0.81 0.87 396\n", - "soc.religion.christian 0.85 0.98 0.91 398\n", + "soc.religion.christian 0.85 0.97 0.91 398\n", "\n", " avg / total 0.90 0.89 0.89 1502\n", "\n", @@ -126,10 +122,16 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/kostia/shub/memex/eli5/venv/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", + " \"This module will be removed in 0.20.\", DeprecationWarning)\n" + ] + }, { "data": { "text/html": [ @@ -233,29 +235,29 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " +1.991\n", + " +1.669\n", " \n", " \n", - " x21167\n", + " x19218\n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.925\n", + " +1.663\n", " \n", " \n", - " x19218\n", + " x21167\n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.834\n", + " +1.565\n", " \n", " \n", " x5714\n", @@ -263,9 +265,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.813\n", + " +1.512\n", " \n", " \n", " x23677\n", @@ -273,9 +275,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.697\n", + " +1.422\n", " \n", " \n", " x15511\n", @@ -283,9 +285,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.696\n", + " +1.404\n", " \n", " \n", " x26415\n", @@ -293,9 +295,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.617\n", + " +1.350\n", " \n", " \n", " x6440\n", @@ -303,9 +305,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.594\n", + " +1.321\n", " \n", " \n", " x26412\n", @@ -314,24 +316,24 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 10174 more positive …\n", + " … 9897 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 25605 more negative …\n", + " … 25882 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " -1.686\n", + " -1.409\n", " \n", " \n", " x28473\n", @@ -341,7 +343,7 @@ " \n", " \n", " \n", - " -10.453\n", + " -9.757\n", " \n", " \n", " <BIAS>\n", @@ -378,7 +380,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " +1.702\n", " \n", @@ -388,7 +390,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.825\n", " \n", @@ -398,7 +400,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.798\n", " \n", @@ -408,7 +410,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.786\n", " \n", @@ -418,7 +420,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.779\n", " \n", @@ -428,7 +430,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.773\n", " \n", @@ -438,7 +440,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.729\n", " \n", @@ -448,7 +450,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.724\n", " \n", @@ -458,7 +460,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.702\n", " \n", @@ -469,22 +471,22 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 11710 more positive …\n", + " … 11436 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 24069 more negative …\n", + " … 24343 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " -1.379\n", " \n", @@ -523,7 +525,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " +2.016\n", " \n", @@ -533,9 +535,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.951\n", + " +1.952\n", " \n", " \n", " x12026\n", @@ -543,9 +545,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.758\n", + " +1.760\n", " \n", " \n", " x17854\n", @@ -553,7 +555,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +1.697\n", " \n", @@ -563,7 +565,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +1.655\n", " \n", @@ -573,7 +575,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +1.522\n", " \n", @@ -583,9 +585,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.518\n", + " +1.517\n", " \n", " \n", " x16328\n", @@ -594,22 +596,22 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 15007 more positive …\n", + " … 13401 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 20772 more negative …\n", + " … 22378 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " -1.764\n", " \n", @@ -619,9 +621,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -2.171\n", + " -2.170\n", " \n", " \n", " x15699\n", @@ -629,9 +631,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -5.013\n", + " -4.997\n", " \n", " \n", " <BIAS>\n", @@ -668,7 +670,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " +1.193\n", " \n", @@ -678,9 +680,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.030\n", + " +1.029\n", " \n", " \n", " x8609\n", @@ -688,9 +690,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.021\n", + " +1.020\n", " \n", " \n", " x8559\n", @@ -698,9 +700,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +0.946\n", + " +0.945\n", " \n", " \n", " x8798\n", @@ -708,9 +710,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +0.899\n", + " +0.898\n", " \n", " \n", " x8544\n", @@ -718,9 +720,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +0.797\n", + " +0.796\n", " \n", " \n", " x8553\n", @@ -729,24 +731,24 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 11122 more positive …\n", + " … 11998 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 24657 more negative …\n", + " … 23781 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " -0.852\n", + " -0.851\n", " \n", " \n", " x15699\n", @@ -754,9 +756,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -0.894\n", + " -0.892\n", " \n", " \n", " x25663\n", @@ -764,7 +766,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " -1.181\n", " \n", @@ -774,9 +776,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -1.243\n", + " -1.242\n", " \n", " \n", " x16881\n", @@ -875,9 +877,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# eli5.show_weights(clf, \n", @@ -895,9 +895,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1002,29 +1000,29 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " +1.991\n", + " +1.669\n", " \n", " \n", - " mathew\n", + " keith\n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.925\n", + " +1.663\n", " \n", " \n", - " keith\n", + " mathew\n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.834\n", + " +1.565\n", " \n", " \n", " atheism\n", @@ -1032,9 +1030,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.813\n", + " +1.512\n", " \n", " \n", " okcforum\n", @@ -1042,9 +1040,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.697\n", + " +1.422\n", " \n", " \n", " go\n", @@ -1052,9 +1050,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.696\n", + " +1.404\n", " \n", " \n", " psuvm\n", @@ -1062,9 +1060,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.617\n", + " +1.350\n", " \n", " \n", " believing\n", @@ -1072,9 +1070,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.594\n", + " +1.321\n", " \n", " \n", " psu\n", @@ -1083,24 +1081,24 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 10174 more positive …\n", + " … 9897 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 25605 more negative …\n", + " … 25882 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " -1.686\n", + " -1.409\n", " \n", " \n", " rutgers\n", @@ -1110,7 +1108,7 @@ " \n", " \n", " \n", - " -10.453\n", + " -9.757\n", " \n", " \n", " <BIAS>\n", @@ -1147,7 +1145,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " +1.702\n", " \n", @@ -1157,7 +1155,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.825\n", " \n", @@ -1167,7 +1165,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.798\n", " \n", @@ -1177,7 +1175,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.786\n", " \n", @@ -1187,7 +1185,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.779\n", " \n", @@ -1197,7 +1195,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.773\n", " \n", @@ -1207,7 +1205,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.729\n", " \n", @@ -1217,7 +1215,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.724\n", " \n", @@ -1227,7 +1225,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +0.702\n", " \n", @@ -1238,22 +1236,22 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 11710 more positive …\n", + " … 11436 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 24069 more negative …\n", + " … 24343 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " -1.379\n", " \n", @@ -1292,7 +1290,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " +2.016\n", " \n", @@ -1302,9 +1300,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.951\n", + " +1.952\n", " \n", " \n", " doctor\n", @@ -1312,9 +1310,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.758\n", + " +1.760\n", " \n", " \n", " information\n", @@ -1322,7 +1320,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +1.697\n", " \n", @@ -1332,7 +1330,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +1.655\n", " \n", @@ -1342,7 +1340,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " +1.522\n", " \n", @@ -1352,9 +1350,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.518\n", + " +1.517\n", " \n", " \n", " health\n", @@ -1363,22 +1361,22 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 15007 more positive …\n", + " … 13401 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 20772 more negative …\n", + " … 22378 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " -1.764\n", " \n", @@ -1388,9 +1386,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -2.171\n", + " -2.170\n", " \n", " \n", " graphics\n", @@ -1398,9 +1396,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -5.013\n", + " -4.997\n", " \n", " \n", " <BIAS>\n", @@ -1437,7 +1435,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " +1.193\n", " \n", @@ -1447,9 +1445,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.030\n", + " +1.029\n", " \n", " \n", " church\n", @@ -1457,9 +1455,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +1.021\n", + " +1.020\n", " \n", " \n", " christians\n", @@ -1467,9 +1465,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +0.946\n", + " +0.945\n", " \n", " \n", " clh\n", @@ -1477,9 +1475,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +0.899\n", + " +0.898\n", " \n", " \n", " christ\n", @@ -1487,9 +1485,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " +0.797\n", + " +0.796\n", " \n", " \n", " christian\n", @@ -1498,24 +1496,24 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " … 11122 more positive …\n", + " … 11998 more positive …\n", " \n", " \n", " \n", "\n", " \n", - " \n", + " \n", " \n", - " … 24657 more negative …\n", + " … 23781 more negative …\n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " -0.852\n", + " -0.851\n", " \n", " \n", " graphics\n", @@ -1523,9 +1521,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -0.894\n", + " -0.892\n", " \n", " \n", " posting\n", @@ -1533,7 +1531,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " -1.181\n", " \n", @@ -1543,9 +1541,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -1.243\n", + " -1.242\n", " \n", " \n", " host\n", @@ -1646,9 +1644,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1700,7 +1696,7 @@ "\n", "\n", " \n", - " (probability 0.000, score -8.709)\n", + " (probability 0.001, score -7.485)\n", "\n", "top features\n", "

\n", @@ -1720,9 +1716,9 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " +1.743\n", + " +2.272\n", " \n", " \n", " Highlighted in text (sum)\n", @@ -1736,7 +1732,7 @@ " \n", " \n", " \n", - " -10.453\n", + " -9.757\n", " \n", " \n", " <BIAS>\n", @@ -1753,24 +1749,24 @@ "\n", "\n", "

\n", - " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + " from: brian@ucsd.edu (brian kantor)\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", - "\t- brian\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", + "\t- brian\n", "\n", "

\n", "\n", @@ -1812,7 +1808,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " -1.379\n", " \n", @@ -1822,7 +1818,7 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", " -3.213\n", " \n", @@ -1842,22 +1838,22 @@ "\n", "

\n", " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", "\t- brian\n", "\n", "

\n", @@ -1876,7 +1872,7 @@ "\n", "\n", " \n", - " (probability 0.989, score 3.945)\n", + " (probability 0.989, score 3.955)\n", "\n", "top features\n", "

\n", @@ -1896,9 +1892,9 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " +8.958\n", + " +8.951\n", " \n", " \n", " Highlighted in text (sum)\n", @@ -1910,9 +1906,9 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " -5.013\n", + " -4.997\n", " \n", " \n", " <BIAS>\n", @@ -1929,23 +1925,23 @@ "\n", "\n", "

\n", - " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + " from: brian@ucsd.edu (brian kantor)\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", "\t- brian\n", "\n", "

\n", @@ -1988,9 +1984,9 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", - " -0.258\n", + " -0.264\n", " \n", " \n", " <BIAS>\n", @@ -1998,9 +1994,9 @@ " \n", "\n", " \n", - " \n", + " \n", " \n", - " -6.899\n", + " -6.893\n", " \n", " \n", " Highlighted in text (sum)\n", @@ -2017,24 +2013,24 @@ "\n", "\n", "

\n", - " from: brian@ucsd.edu (brian kantor)\n", - "subject: re: help for kidney stones ..............\n", - "organization: the avant-garde of the now, ltd.\n", - "lines: 12\n", - "nntp-posting-host: ucsd.edu\n", + " from: brian@ucsd.edu (brian kantor)\n", + "subject: re: help for kidney stones ..............\n", + "organization: the avant-garde of the now, ltd.\n", + "lines: 12\n", + "nntp-posting-host: ucsd.edu\n", "\n", - "as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + "as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "\n", - "demerol worked, although i nearly got arrested on my way home when i barfed\n", - "all over the police car parked just outside the er.\n", - "\t- brian\n", + "demerol worked, although i nearly got arrested on my way home when i barfed\n", + "all over the police car parked just outside the er.\n", + "\t- brian\n", "\n", "

\n", "\n", @@ -2106,9 +2102,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "twenty_train = fetch_20newsgroups(\n", @@ -2142,9 +2136,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2152,14 +2144,14 @@ "text": [ " precision recall f1-score support\n", "\n", - " alt.atheism 0.83 0.78 0.80 319\n", - " comp.graphics 0.82 0.96 0.88 389\n", - " sci.med 0.89 0.80 0.84 396\n", - "soc.religion.christian 0.88 0.86 0.87 398\n", + " alt.atheism 0.84 0.77 0.81 319\n", + " comp.graphics 0.83 0.95 0.89 389\n", + " sci.med 0.90 0.79 0.84 396\n", + "soc.religion.christian 0.86 0.90 0.88 398\n", "\n", - " avg / total 0.85 0.85 0.85 1502\n", + " avg / total 0.86 0.86 0.86 1502\n", "\n", - "accuracy: 0.852\n" + "accuracy: 0.858\n" ] } ], @@ -2181,9 +2173,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2236,7 +2226,7 @@ "\n", "\n", " \n", - " (probability 0.732, score 0.031)\n", + " (probability 0.711, score 0.031)\n", "\n", "top features\n", "

\n", @@ -2289,14 +2279,14 @@ "\n", "\n", "

\n", - " as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", + "either they pass, or they have to be broken up with sound, or they have\n", "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "

\n", "\n", "\n", @@ -2366,9 +2356,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2399,9 +2387,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2584,9 +2570,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2619,9 +2603,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2674,7 +2656,7 @@ "\n", "\n", " \n", - " (probability 0.987, score 1.585)\n", + " (probability 0.987, score 1.584)\n", "\n", "top features\n", "

\n", @@ -2710,7 +2692,7 @@ " \n", " \n", " \n", - " -5.203\n", + " -5.204\n", " \n", " \n", " <BIAS>\n", @@ -2727,14 +2709,14 @@ "\n", "\n", "

\n", - " as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", - "either they pass, or they have to be broken up with sound, or they have\n", - "to be extracted surgically.\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "

\n", "\n", "\n", @@ -2800,9 +2782,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2833,9 +2813,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -2941,14 +2919,14 @@ "\n", "\n", "

\n", - " as i recall from my bout with kidney stones, there isn't any\n", - "medication that can do anything about them except relieve the pain.\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", "\n", "either they pass, or they have to be broken up with sound, or they have\n", "to be extracted surgically.\n", "\n", - "when i was in, the x-ray tech happened to mention that she'd had kidney\n", - "stones and children, and the childbirth hurt less.\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", "

\n", "\n", "\n", @@ -3008,7 +2986,226 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This starts to look good! \n", + "This starts to look good!\n", + "\n", + "Another thing we might notice if we look carefully is that in the word \"x-ray\",\n", + "a clearly medical term, only \"ray\" part is highlighted, and it's treated as a negative feature.\n", + "The reason for that is the default scikit-learn tokenizer uses ``r\"(?u)\\b\\w\\w+\\b\"`` pattern\n", + "which discards \"x\" and \"-\". We can fix that by passing a custom ``token_pattern`` argument which\n", + "will parse \"x-ray\" as a single token.\n", + "Another option would be to allow one-character tokens and\n", + "add bigrams - you can try that too!\n", + "Here is the pipeline with a custom ``token_pattern``:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " alt.atheism 0.92 0.77 0.84 319\n", + " comp.graphics 0.83 0.97 0.90 389\n", + " sci.med 0.95 0.89 0.92 396\n", + "soc.religion.christian 0.89 0.91 0.90 398\n", + "\n", + " avg / total 0.90 0.89 0.89 1502\n", + "\n", + "accuracy: 0.892\n" + ] + } + ], + "source": [ + "vec = TfidfVectorizer(stop_words='english', token_pattern=r\"(?u)\\b\\w[\\w\\-]+\\b\")\n", + "clf = LogisticRegressionCV()\n", + "pipe = make_pipeline(vec, clf)\n", + "pipe.fit(twenty_train.data, twenty_train.target)\n", + "\n", + "print_report(pipe)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=sci.med\n", + " \n", + "\n", + "\n", + " \n", + " (probability 0.958, score 1.883)\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Contribution?\n", + " Feature
\n", + " +5.461\n", + " \n", + " Highlighted in text (sum)\n", + "
\n", + " -3.578\n", + " \n", + " <BIAS>\n", + "
\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "

\n", + " as i recall from my bout with kidney stones, there isn't any\n", + "medication that can do anything about them except relieve the pain.\n", + "\n", + "either they pass, or they have to be broken up with sound, or they have\n", + "to be extracted surgically.\n", + "\n", + "when i was in, the x-ray tech happened to mention that she'd had kidney\n", + "stones and children, and the childbirth hurt less.\n", + "

\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eli5.show_prediction(clf, twenty_test.data[0], vec=vec, \n", + " target_names=twenty_test.target_names,\n", + " targets=['sci.med'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now \"x-ray\" is a positive feature, exactly as we wanted! Average accuracy did not improve though - perhaps cases like this are too rare.\n", "\n", "\n", "## 4. Char-based pipeline\n", @@ -3019,9 +3216,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3053,9 +3248,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3449,9 +3642,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3482,9 +3673,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -3881,9 +4070,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -3925,9 +4112,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -4106,9 +4291,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -4855,9 +5038,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from eli5.sklearn import InvertableHashingVectorizer\n", @@ -4867,9 +5048,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "ivec = InvertableHashingVectorizer(vec)\n", @@ -4881,9 +5060,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -6032,9 +6209,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -6065,9 +6240,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -6263,7 +6436,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.5.3" } }, "nbformat": 4,