Merge pull request #382 from vespa-engine/tgm/body-query-only

Move query module to learntorank
vespa-engine · Oct 9, 2022 · 7328de2 · 7328de2
2 parents ec80c6a + 75c3e18
commit 7328de2
Show file tree

Hide file tree

Showing 14 changed files with 514 additions and 2,127 deletions.
diff --git a/docs/sphinx/source/collect-training-data.ipynb b/docs/sphinx/source/collect-training-data.ipynb
diff --git a/docs/sphinx/source/exchange-data-with-app.ipynb b/docs/sphinx/source/exchange-data-with-app.ipynb
@@ -12,21 +12,117 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Waiting for configuration server.\n",
-      "Waiting for configuration server.\n",
-      "Waiting for configuration server.\n",
-      "Waiting for configuration server.\n",
-      "Waiting for application status.\n",
-      "Waiting for application status.\n",
+      "Waiting for configuration server, 0/300 seconds...\n",
+      "Waiting for configuration server, 5/300 seconds...\n",
+      "Waiting for configuration server, 10/300 seconds...\n",
+      "Waiting for application status, 0/300 seconds...\n",
+      "Waiting for application status, 5/300 seconds...\n",
+      "Waiting for application status, 10/300 seconds...\n",
+      "Waiting for application status, 15/300 seconds...\n",
+      "Waiting for application status, 20/300 seconds...\n",
+      "Waiting for application status, 25/300 seconds...\n",
+      "Waiting for application status, 30/300 seconds...\n",
       "Finished deployment.\n"
      ]
     }
    ],
    "source": [
     "# this is a hidden cell. It will not show on the documentation HTML.\n",
     "import os\n",
+    "from vespa.package import (\n",
+    "    HNSW,\n",
+    "    Document,\n",
+    "    Field,\n",
+    "    Schema,\n",
+    "    FieldSet,\n",
+    "#    SecondPhaseRanking,\n",
+    "    RankProfile,\n",
+    "    ApplicationPackage,\n",
+    "    QueryProfile,\n",
+    "    QueryProfileType,\n",
+    "    QueryTypeField\n",
+    ")\n",
+    "\n",
     "from vespa.deployment import VespaDocker\n",
-    "from vespa.gallery import QuestionAnswering\n",
+    "\n",
+    "class QuestionAnswering(ApplicationPackage):\n",
+    "    def __init__(self, name: str = \"qa\"):\n",
+    "        context_document = Document(\n",
+    "            fields=[\n",
+    "                Field(\n",
+    "                    name=\"questions\",\n",
+    "                    type=\"array<int>\",\n",
+    "                    indexing=[\"summary\", \"attribute\"],\n",
+    "                ),\n",
+    "                Field(name=\"dataset\", type=\"string\", indexing=[\"summary\", \"attribute\"]),\n",
+    "                Field(name=\"context_id\", type=\"int\", indexing=[\"summary\", \"attribute\"]),\n",
+    "                Field(\n",
+    "                    name=\"text\",\n",
+    "                    type=\"string\",\n",
+    "                    indexing=[\"summary\", \"index\"],\n",
+    "                    index=\"enable-bm25\",\n",
+    "                ),\n",
+    "            ]\n",
+    "        )\n",
+    "        context_schema = Schema(\n",
+    "            name=\"context\",\n",
+    "            document=context_document,\n",
+    "            fieldsets=[FieldSet(name=\"default\", fields=[\"text\"])],\n",
+    "            rank_profiles=[\n",
+    "                RankProfile(name=\"bm25\", inherits=\"default\", first_phase=\"bm25(text)\"),\n",
+    "                RankProfile(\n",
+    "                    name=\"nativeRank\",\n",
+    "                    inherits=\"default\",\n",
+    "                    first_phase=\"nativeRank(text)\",\n",
+    "                ),\n",
+    "            ],\n",
+    "        )\n",
+    "        sentence_document = Document(\n",
+    "            inherits=\"context\",\n",
+    "            fields=[\n",
+    "                Field(\n",
+    "                    name=\"sentence_embedding\",\n",
+    "                    type=\"tensor<float>(x[512])\",\n",
+    "                    indexing=[\"attribute\", \"index\"],\n",
+    "                    ann=HNSW(\n",
+    "                        distance_metric=\"euclidean\",\n",
+    "                        max_links_per_node=16,\n",
+    "                        neighbors_to_explore_at_insert=500,\n",
+    "                    ),\n",
+    "                )\n",
+    "            ],\n",
+    "        )\n",
+    "        sentence_schema = Schema(\n",
+    "            name=\"sentence\",\n",
+    "            document=sentence_document,\n",
+    "            fieldsets=[FieldSet(name=\"default\", fields=[\"text\"])],\n",
+    "            rank_profiles=[\n",
+    "                RankProfile(\n",
+    "                    name=\"semantic-similarity\",\n",
+    "                    inherits=\"default\",\n",
+    "                    first_phase=\"closeness(sentence_embedding)\",\n",
+    "                ),\n",
+    "                RankProfile(name=\"bm25\", inherits=\"default\", first_phase=\"bm25(text)\"),\n",
+    "                RankProfile(\n",
+    "                    name=\"bm25-semantic-similarity\",\n",
+    "                    inherits=\"default\",\n",
+    "                    first_phase=\"bm25(text) + closeness(sentence_embedding)\",\n",
+    "                ),\n",
+    "            ],\n",
+    "        )\n",
+    "        super().__init__(\n",
+    "            name=name,\n",
+    "            schema=[context_schema, sentence_schema],\n",
+    "            query_profile=QueryProfile(),\n",
+    "            query_profile_type=QueryProfileType(\n",
+    "                fields=[\n",
+    "                    QueryTypeField(\n",
+    "                        name=\"ranking.features.query(query_embedding)\",\n",
+    "                        type=\"tensor<float>(x[512])\",\n",
+    "                    )\n",
+    "                ]\n",
+    "            ),\n",
+    "        )\n",
     "\n",
     "app_package = QuestionAnswering()\n",
     "vespa_docker = VespaDocker()\n",
@@ -147,7 +243,16 @@
    "execution_count": 4,
    "id": "meaning-jamaica",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successful documents fed: 100/100.\n",
+      "Batch progress: 1/1.\n"
+     ]
+    }
+   ],
    "source": [
     "response = app.feed_batch(schema=\"sentence\", batch=batch_feed)"
    ]
@@ -648,7 +753,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,

diff --git a/docs/sphinx/source/query-model.ipynb b/docs/sphinx/source/query-model.ipynb
@@ -112,21 +112,21 @@
        "      <th>0</th>\n",
        "      <td>0</td>\n",
        "      <td>id:covid-19:doc::142863</td>\n",
-       "      <td>11.334371</td>\n",
+       "      <td>11.824458</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>0</td>\n",
        "      <td>id:covid-19:doc::187156</td>\n",
-       "      <td>11.318515</td>\n",
+       "      <td>11.818079</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>0</td>\n",
        "      <td>id:covid-19:doc::31328</td>\n",
-       "      <td>11.288960</td>\n",
+       "      <td>11.288179</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -135,9 +135,9 @@
       ],
       "text/plain": [
        "   qid                   doc_id      score  rank\n",
-       "0    0  id:covid-19:doc::142863  11.334371     0\n",
-       "1    0  id:covid-19:doc::187156  11.318515     1\n",
-       "2    0   id:covid-19:doc::31328  11.288960     2"
+       "0    0  id:covid-19:doc::142863  11.824458     0\n",
+       "1    0  id:covid-19:doc::187156  11.818079     1\n",
+       "2    0   id:covid-19:doc::31328  11.288179     2"
       ]
      },
      "execution_count": 4,
@@ -146,7 +146,13 @@
     }
    ],
    "source": [
-    "standard_result = app.query(query=\"this is a test\", query_model=standard_query_model)\n",
+    "from learntorank.query import send_query\n",
+    "\n",
+    "standard_result = send_query(\n",
+    "    app=app, \n",
+    "    query=\"this is a test\", \n",
+    "    query_model=standard_query_model\n",
+    ")\n",
     "standard_result.get_hits().head(3)"
    ]
   },
@@ -187,21 +193,21 @@
        "      <th>0</th>\n",
        "      <td>0</td>\n",
        "      <td>id:covid-19:doc::142863</td>\n",
-       "      <td>11.334371</td>\n",
+       "      <td>11.824458</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>0</td>\n",
        "      <td>id:covid-19:doc::187156</td>\n",
-       "      <td>11.318515</td>\n",
+       "      <td>11.818079</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>0</td>\n",
        "      <td>id:covid-19:doc::31328</td>\n",
-       "      <td>11.288960</td>\n",
+       "      <td>11.288179</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -210,9 +216,9 @@
       ],
       "text/plain": [
        "   qid                   doc_id      score  rank\n",
-       "0    0  id:covid-19:doc::142863  11.334371     0\n",
-       "1    0  id:covid-19:doc::187156  11.318515     1\n",
-       "2    0   id:covid-19:doc::31328  11.288960     2"
+       "0    0  id:covid-19:doc::142863  11.824458     0\n",
+       "1    0  id:covid-19:doc::187156  11.818079     1\n",
+       "2    0   id:covid-19:doc::31328  11.288179     2"
       ]
      },
      "execution_count": 5,
@@ -221,7 +227,11 @@
     }
    ],
    "source": [
-    "flexible_result = app.query(query=\"this is a test\", query_model=flexible_query_model)\n",
+    "flexible_result = send_query(\n",
+    "    app=app, \n",
+    "    query=\"this is a test\", \n",
+    "    query_model=flexible_query_model\n",
+    ")\n",
     "flexible_result.get_hits().head(3)"
    ]
   }

diff --git a/docs/sphinx/source/query.ipynb b/docs/sphinx/source/query.ipynb
@@ -66,7 +66,7 @@
     {
      "data": {
       "text/plain": [
-       "9865"
+       "9758"
       ]
      },
      "execution_count": 4,
@@ -120,7 +120,7 @@
     {
      "data": {
       "text/plain": [
-       "['8n6eybze', '2lwzhqer', '8n6eybze', '8art2tyj', 'xej338lo']"
+       "['8n6eybze', '2lwzhqer', '8art2tyj', 'oud5ioks', 'ifanumo8']"
       ]
      },
      "execution_count": 6,
@@ -152,9 +152,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from learntorank.query import QueryModel, OR, Ranking\n",
+    "from learntorank.query import QueryModel, OR, Ranking, send_query\n",
     "\n",
-    "results = app.query(\n",
+    "results = send_query(\n",
+    "    app=app,\n",
     "    query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
     "    query_model = QueryModel(\n",
     "        match_phase=OR(), \n",
@@ -224,8 +225,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = app.query(query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
-    "          query_model=query_model)"
+    "results = send_query(\n",
+    "    app=app,\n",
+    "    query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
+    "    query_model=query_model\n",
+    ")"
    ]
   },
   {
@@ -236,7 +240,7 @@
     {
      "data": {
       "text/plain": [
-       "1520"
+       "1513"
       ]
      },
      "execution_count": 11,
@@ -270,7 +274,7 @@
     {
      "data": {
       "text/plain": [
-       "[144384, 269386, 144385]"
+       "[144384, 269386, 280365]"
       ]
      },
      "execution_count": 12,
@@ -296,9 +300,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_with_recall = app.query(query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
-    "                    query_model=query_model,\n",
-    "                    recall = (\"id\", top_ids[1:3]))"
+    "results_with_recall = send_query(\n",
+    "    app=app,\n",
+    "    query=\"Is remdesivir an effective treatment for COVID-19?\", \n",
+    "    query_model=query_model,\n",
+    "    recall = (\"id\", top_ids[1:3])\n",
+    ")"
    ]
   },
   {
@@ -316,7 +323,7 @@
     {
      "data": {
       "text/plain": [
-       "[269386, 144385]"
+       "[269386, 280365]"
       ]
      },
      "execution_count": 14,