Merge pull request #375 from vespa-engine/tgm/deprecate-evaluation-module

Deprecate evaluation module
thigm85 authored Sep 22, 2022
2 parents 9fb0201 + 8c15e08 commit f466795
Showing 9 changed files with 137 additions and 130 deletions.
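What the change amounts to: batch evaluation moves from a method on vespa.application.Vespa to a free function in the learntorank package, which takes the application as an explicit argument. Below is a minimal migration sketch, assuming a deployed Vespa instance named app plus the labeled_data, eval_metrics, and query_model objects defined in the notebooks changed by this commit; only the arguments visible in this diff are included, and the notebook cells pass a few more that are truncated here.

# Before: deprecated by this commit; still runs, but now emits a DeprecationWarning.
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank

eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
evaluation = app.evaluate(
    labeled_data=labeled_data,
    eval_metrics=eval_metrics,
    query_model=query_model,
)

# After: the replacement the deprecation message points to.
from learntorank.evaluation import MatchRatio, Recall, ReciprocalRank, evaluate

eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
evaluation = evaluate(
    app=app,  # the Vespa instance is now passed in explicitly
    labeled_data=labeled_data,
    eval_metrics=eval_metrics,
    query_model=query_model,
)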
29 changes: 18 additions & 11 deletions docs/sphinx/source/evaluation.ipynb
@@ -85,7 +85,7 @@
"metadata": {},
"outputs": [],
"source": [
"from vespa.evaluation import MatchRatio, Recall, ReciprocalRank\n",
"from learntorank.evaluation import MatchRatio, Recall, ReciprocalRank\n",
"\n",
"eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]"
]
@@ -190,7 +190,10 @@
}
],
"source": [
"evaluation = app.evaluate(\n",
"from learntorank.evaluation import evaluate\n",
"\n",
"evaluation = evaluate(\n",
" app=app,\n",
" labeled_data = labeled_data,\n",
" eval_metrics = eval_metrics, \n",
" query_model = query_model, \n",
@@ -275,10 +278,12 @@
],
"source": [
"from pandas import concat, DataFrame\n",
"from learntorank.evaluation import evaluate_query\n",
"\n",
"evaluation = []\n",
"for query_data in labeled_data:\n",
" query_evaluation = app.evaluate_query(\n",
" query_evaluation = evaluate_query(\n",
" app=app,\n",
" eval_metrics = eval_metrics, \n",
" query_model = query_model, \n",
" query_id = query_data[\"query_id\"], \n",
@@ -309,7 +314,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -322,13 +327,14 @@
" 'reciprocal_rank_10': 1.0}"
]
},
"execution_count": 9,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query_evaluation = app.evaluate_query(\n",
"query_evaluation = evaluate_query(\n",
" app=app,\n",
" eval_metrics = eval_metrics, \n",
" query_model = query_model, \n",
" query_id = 0, \n",
@@ -350,7 +356,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -363,13 +369,14 @@
" 'reciprocal_rank_10': 1.0}"
]
},
"execution_count": 10,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query_evaluation = app.evaluate_query(\n",
"query_evaluation = evaluate_query(\n",
" app=app,\n",
" eval_metrics = eval_metrics, \n",
" query_model = query_model, \n",
" query_id = 0, \n",
@@ -399,9 +406,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
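Per-query evaluation follows the same pattern: evaluate_query is now a free function that receives the application instead of being a method on it, and it still returns a plain dict of metrics for one query. Here is a sketch of the loop the updated notebook cell uses, under the same assumptions as the sketch above; keyword arguments truncated in the diff are omitted.

from pandas import DataFrame
from learntorank.evaluation import evaluate_query

# Collect one metrics dict per labeled query, then build a DataFrame.
evaluation = []
for query_data in labeled_data:
    query_evaluation = evaluate_query(
        app=app,
        eval_metrics=eval_metrics,
        query_model=query_model,
        query_id=query_data["query_id"],
        # further keyword arguments from the notebook cell are omitted here
    )
    evaluation.append(query_evaluation)

evaluation_df = DataFrame.from_records(evaluation)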
3 changes: 2 additions & 1 deletion docs/sphinx/source/notebook_requirements.txt
@@ -1 +1,2 @@
plotly
plotly
learntorank
131 changes: 67 additions & 64 deletions docs/sphinx/source/use_cases/cord19/cord19_connect_evaluate.ipynb
@@ -294,7 +294,7 @@
"metadata": {},
"outputs": [],
"source": [
"from vespa.evaluation import MatchRatio, Recall, ReciprocalRank, NormalizedDiscountedCumulativeGain\n",
"from learntorank.evaluation import MatchRatio, Recall, ReciprocalRank, NormalizedDiscountedCumulativeGain\n",
"\n",
"eval_metrics = [\n",
" MatchRatio(), \n",
@@ -371,40 +371,40 @@
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">match_ratio</th>\n",
" <th>mean</th>\n",
" <td>0.412386</td>\n",
" <td>0.412386</td>\n",
" <td>0.411789</td>\n",
" <td>0.411789</td>\n",
" </tr>\n",
" <tr>\n",
" <th>median</th>\n",
" <td>0.282816</td>\n",
" <td>0.282816</td>\n",
" <td>0.282227</td>\n",
" <td>0.282227</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.238306</td>\n",
" <td>0.238306</td>\n",
" <td>0.238502</td>\n",
" <td>0.238502</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">recall_10</th>\n",
" <th>mean</th>\n",
" <td>0.007978</td>\n",
" <td>0.005489</td>\n",
" <td>0.007753</td>\n",
" <td>0.005522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>median</th>\n",
" <td>0.005827</td>\n",
" <td>0.006152</td>\n",
" <td>0.004092</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.007009</td>\n",
" <td>0.005449</td>\n",
" <td>0.006355</td>\n",
" <td>0.005451</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">reciprocal_rank_10</th>\n",
" <th>mean</th>\n",
" <td>0.597238</td>\n",
" <td>0.564913</td>\n",
" <td>0.592024</td>\n",
" <td>0.561579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>median</th>\n",
@@ -413,43 +413,43 @@
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.406171</td>\n",
" <td>0.400010</td>\n",
" <td>0.390161</td>\n",
" <td>0.401255</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">ndcg_10</th>\n",
" <th>mean</th>\n",
" <td>0.645486</td>\n",
" <td>0.604916</td>\n",
" <td>0.354018</td>\n",
" <td>0.275940</td>\n",
" </tr>\n",
" <tr>\n",
" <th>median</th>\n",
" <td>0.690601</td>\n",
" <td>0.649283</td>\n",
" <td>0.366791</td>\n",
" <td>0.253619</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.290917</td>\n",
" <td>0.308170</td>\n",
" <td>0.221325</td>\n",
" <td>0.202369</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"model or_bm25 or_default\n",
"match_ratio mean 0.412386 0.412386\n",
" median 0.282816 0.282816\n",
" std 0.238306 0.238306\n",
"recall_10 mean 0.007978 0.005489\n",
" median 0.005827 0.004092\n",
" std 0.007009 0.005449\n",
"reciprocal_rank_10 mean 0.597238 0.564913\n",
"match_ratio mean 0.411789 0.411789\n",
" median 0.282227 0.282227\n",
" std 0.238502 0.238502\n",
"recall_10 mean 0.007753 0.005522\n",
" median 0.006152 0.004092\n",
" std 0.006355 0.005451\n",
"reciprocal_rank_10 mean 0.592024 0.561579\n",
" median 0.500000 0.500000\n",
" std 0.406171 0.400010\n",
"ndcg_10 mean 0.645486 0.604916\n",
" median 0.690601 0.649283\n",
" std 0.290917 0.308170"
" std 0.390161 0.401255\n",
"ndcg_10 mean 0.354018 0.275940\n",
" median 0.366791 0.253619\n",
" std 0.221325 0.202369"
]
},
"execution_count": 9,
@@ -458,7 +458,9 @@
}
],
"source": [
"evaluations = app.evaluate(\n",
"from learntorank.evaluation import evaluate\n",
"evaluations = evaluate(\n",
" app=app,\n",
" labeled_data = labeled_data,\n",
" eval_metrics = eval_metrics,\n",
" query_model = query_models,\n",
@@ -514,58 +516,58 @@
" <th>0</th>\n",
" <td>or_default</td>\n",
" <td>1</td>\n",
" <td>0.231523</td>\n",
" <td>0.230847</td>\n",
" <td>0.008584</td>\n",
" <td>1.000000</td>\n",
" <td>0.868373</td>\n",
" <td>0.519431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>or_bm25</td>\n",
" <td>1</td>\n",
" <td>0.231523</td>\n",
" <td>0.004292</td>\n",
" <td>0.142857</td>\n",
" <td>0.483639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>or_default</td>\n",
" <td>2</td>\n",
" <td>0.755509</td>\n",
" <td>0.755230</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>or_bm25</td>\n",
" <td>2</td>\n",
" <td>0.755509</td>\n",
" <td>0.002985</td>\n",
" <td>0.250000</td>\n",
" <td>0.430677</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <th>2</th>\n",
" <td>or_default</td>\n",
" <td>3</td>\n",
" <td>0.265400</td>\n",
" <td>0.264601</td>\n",
" <td>0.001534</td>\n",
" <td>0.142857</td>\n",
" <td>0.036682</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>or_default</td>\n",
" <td>4</td>\n",
" <td>0.843341</td>\n",
" <td>0.001764</td>\n",
" <td>0.333333</td>\n",
" <td>0.110046</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>or_default</td>\n",
" <td>5</td>\n",
" <td>0.901317</td>\n",
" <td>0.003096</td>\n",
" <td>0.250000</td>\n",
" <td>0.258330</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" model query_id match_ratio recall_10 reciprocal_rank_10 ndcg_10\n",
"0 or_default 1 0.231523 0.008584 1.000000 0.868373\n",
"1 or_bm25 1 0.231523 0.004292 0.142857 0.483639\n",
"2 or_default 2 0.755509 0.000000 0.000000 0.000000\n",
"3 or_bm25 2 0.755509 0.002985 0.250000 0.430677\n",
"4 or_default 3 0.265400 0.001534 0.142857 0.333333"
"0 or_default 1 0.230847 0.008584 1.000000 0.519431\n",
"1 or_default 2 0.755230 0.000000 0.000000 0.000000\n",
"2 or_default 3 0.264601 0.001534 0.142857 0.036682\n",
"3 or_default 4 0.843341 0.001764 0.333333 0.110046\n",
"4 or_default 5 0.901317 0.003096 0.250000 0.258330"
]
},
"execution_count": 10,
@@ -574,7 +576,8 @@
}
],
"source": [
"evaluations = app.evaluate(\n",
"evaluations = evaluate(\n",
" app=app,\n",
" labeled_data = labeled_data,\n",
" eval_metrics = eval_metrics,\n",
" query_model = query_models,\n",
@@ -602,7 +605,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.9.13"
}
},
"nbformat": 4,
4 changes: 2 additions & 2 deletions screwdriver.yaml
@@ -54,7 +54,7 @@ jobs:
- install-python: |
dnf install -y python38-pip
python3 -m pip install --upgrade pip
python3 -m pip install pytest
python3 -m pip install pytest learntorank
python3 -m pip install -e .[full]
- run-integration-running-instance: |
pytest vespa/test_integration_running_instance.py -s -v
@@ -128,7 +128,7 @@ jobs:
- install-python: |
dnf install -y python38-pip
python3 -m pip install --upgrade pip
python3 -m pip install pytest
python3 -m pip install pytest learntorank
python3 -m pip install -e .[full]
- run-integration-cloud: |
pytest vespa/test_integration_vespa_cloud.py -s -v
13 changes: 12 additions & 1 deletion vespa/application.py
@@ -8,6 +8,7 @@
import concurrent.futures
from collections import Counter
from typing import Optional, Dict, Tuple, List, IO, Union
import warnings

import requests
from pandas import DataFrame
@@ -1225,7 +1226,11 @@ def evaluate_query(
:param kwargs: Extra keyword arguments to be included in the Vespa Query.
:return: Dict containing query_id and metrics according to the selected evaluation metrics.
"""

warnings.warn(
"vespa.application.Vespa.evaluate_query is deprecated, "
"use learntorank.evaluation.evaluate_query from the learntorank library instead.",
DeprecationWarning,
)
query_results = self.query(query=query, query_model=query_model, **kwargs)
evaluation = {"model": query_model.name, "query_id": query_id}
for evaluator in eval_metrics:
@@ -1291,6 +1296,12 @@ def evaluate(
:param kwargs: Extra keyword arguments to be included in the Vespa Query.
:return: DataFrame containing query_id and metrics according to the selected evaluation metrics.
"""
warnings.warn(
"vespa.application.Vespa.evaluate is deprecated, "
"use learntorank.evaluation.evaluate from the learntorank library instead.",
DeprecationWarning,
)

if isinstance(labeled_data, DataFrame):
labeled_data = parse_labeled_data(df=labeled_data)

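Both legacy methods keep their original behaviour after warning, so existing code keeps working for now. Because Python hides DeprecationWarning by default in most contexts, a caller migrating off the old API may want to escalate these specific warnings temporarily; a small sketch using only the standard warnings module, with the message pattern taken from the text added in this diff:

import warnings

# Turn the new deprecation warnings into errors so any remaining call to
# Vespa.evaluate or Vespa.evaluate_query fails fast and is easy to locate.
warnings.filterwarnings(
    "error",
    message=r"vespa\.application\.Vespa\.evaluate",  # matches both messages
    category=DeprecationWarning,
)

Once all call sites use learntorank.evaluation.evaluate and evaluate_query, the filter can simply be removed.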