nextstrain · jameshadfield · Apr 14, 2024 · Apr 15, 2024
diff --git a/env/production/config.json b/env/production/config.json
@@ -110,5 +110,5 @@
   "OIDC_GROUPS_CLAIM": "cognito:groups",
   "SESSION_COOKIE_DOMAIN": "nextstrain.org",
   "GROUPS_DATA_FILE": "groups.json",
-  "RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz"
+  "RESOURCE_INDEX": "./index.files.json.gz"
 }
diff --git a/env/testing/config.json b/env/testing/config.json
@@ -108,5 +108,5 @@
   "OIDC_USERNAME_CLAIM": "cognito:username",
   "OIDC_GROUPS_CLAIM": "cognito:groups",
   "GROUPS_DATA_FILE": "groups.json",
-  "RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz"
+  "RESOURCE_INDEX": "./index.files.json.gz"
 }
diff --git a/index.files.json.gz b/index.files.json.gz
diff --git a/resourceIndexer/coreStagingS3.js b/resourceIndexer/coreStagingS3.js
@@ -30,41 +30,61 @@ function categoriseCoreObjects(item, staging) {
     || key.startsWith('datasets_')
   ) return false;
 
-  // On the core bucket, directory-like hierarchies are used for intermediate
-  // files. These intermediate files may include files which auspice can
-  // display, but nextstrain.org cannot map URLs to directory-like hierarchies.
-  // There are other resourceTypes here we may consider in the future -- e.g.
-  // model output JSONs
-  if (key.includes("/")) {
-    if (staging===true) return false;
-    if (key.startsWith('files/')) {
-      if (
-        key.includes('/archive/')
-        || key.includes('/test/')
-        || key.includes('/workflows/')
-        || key.includes('/branch/')
-        || key.includes('/trial/')
-        || key.includes('/test-data/')
-        || key.includes('jen_test/')
-        || key.match(/\/nextclade-full-run-[\d-]+--UTC\//)
-        || key.match(/\/\d{4}-\d{2}-\d{2}_results.json/) // forecasts-ncov
-        || key.endsWith('.png')                          // forecasts-ncov
-      ) {
-        return false;
-      }
-      item.resourceType = 'intermediate';
-      /* The ID is used for grouping. For a nextstrain.org dataset this would be
-      combined with the source to form a nextstrain URL, however that's not
-      applicable here. Instead we use the filepath information without the
-      leading 'files/' and without the (trailing) filename so that different
-      files in the same directory structure get grouped together. For instance,
-      files/ncov/open/x.json -> ncov/open */
-      item.resourcePath = key.split('/').slice(1, -1).join('/')
-      return item;
+  /* Intermediate files in the core bucket are many and varied, however we expect them
+  to follow the format specified in <https://docs.nextstrain.org/en/latest/reference/data-files.html>
+  At the moment we only consider "workflows", i.e. files in `/files/workflows/*`
+  as there are no "datasets" ("build files"?) intermediates.
+  The file name schema is:
+    /files
+      /workflows
+        {/workflow-repo}                (matching github.com/nextstrain{/workflow-repo})
+          {/arbitrary-structure*}
+            /metadata.tsv.zst (etc)
+            /sequences.fasta.zst (etc)
+  For the current listing we filter out any files where "/arbitrary-structure*" matches
+  some hardcoded list in an attempt to filter out test runs which we don't want to surface.
+
+  We also include /files/ncov which predates the above structure design.
+
+  The reported resource ID does not include the "/files/workflows" prefix.
+
+  Redirects aren't considered when constructing the ID, so (e.g.) "monkeypox" and "mpox" are independent.
+  */
+  const intermediateExcludePatterns = [
+    /\/branch\//,
+    /\/test\//,
+    /\/trial\//,
+    /\/trials\//,
+    /\/nextclade-full-run[\d-]+--UTC\//, /* We could detail versions via the datestamped filename if desired */
+    /\/\d{4}-\d{2}-\d{2}_results\.json/, // forecasts-ncov
+    /\.png$/, // forecasts-ncov
+  ]
+  if ((key.startsWith("files/workflows/") || key.startsWith("files/ncov/")) && staging===false) {
+    for (const pattern of intermediateExcludePatterns) {
+      if (key.match(pattern)) return false;
     }
-    return false;
+    item.resourceType = 'intermediate';
+    /* The ID is used for grouping. For a nextstrain.org dataset this would be
+    combined with the source to form a nextstrain URL, however that's not
+    applicable here. Instead we use the filepath information without the
+    leading 'files/' (or 'files/workflows/') and without the (trailing) filename so that different
+    files in the same directory structure get grouped together. For instance:
+    * files/ncov/open/100k/metadata.tsv.xz -> ncov/open/100k
+    * files/workflows/zika/sequences.fasta.zst -> zika
+    */
+    item.resourcePath = key
+      .replace(/^files\/ncov\//, "ncov/")
+      .replace(/^files\/workflows\//, "")
+      .replace(/\/[^\/]+$/, '')
+    return item;
   }
 
+  /* All other files with a directory-like structure, including those on the
+  staging bucket, are ignored. Note that this removes files which don't conform
+  to the structure described above, as well as some files on the staging bucket.
+  */
+  if (key.includes("/")) return false;  
+
   // Some filenames have a double underscore (presumably by mistake)
   if (key.includes('__')) return false;
 

diff --git a/src/endpoints/listResources.js b/src/endpoints/listResources.js
@@ -11,7 +11,7 @@ import { contentTypesProvided } from '../negotiate.js';
 const listResourcesJson = async (req, res) => {
   /* API currently only handles a limited range of sources / resource types.
   ListResources will throw a HTTP error if they do not exist */
-  const resourceType = 'dataset';
+  const resourceType = req.params.resourceType;
   const sourceName = req.params.sourceName;
   const resources = new ListResources([sourceName], [resourceType]);
   const data = {

diff --git a/src/resourceIndex.js b/src/resourceIndex.js
@@ -132,6 +132,9 @@ async function updateResourceVersions() {
  * ListResources is intended to respond to resource listing queries. The current
  * implementation only handles a single source Id and single resource type, but
  * this will be extended as needed.
+ * 
+ * There's definitely an inheritance structure here, but I haven't spent time to
+ * really draw it out. So instead of polymorphism we use conditionals.
  */
 class ListResources {
   constructor(sourceIds, resourceTypes) {
@@ -148,63 +151,87 @@ class ListResources {
     this.resourceType = resourceTypes[0];
   }
 
-  coreDatasetFilter([name, ]) {
-    /* Consult the manifest to and restrict our listed resources to those whose
-    _first words_ appear as a top-level key the manifest. Subsequent words
-    aren't checked, so datasets may be returned which aren't explicitly defined
-    in the manifest.
-
-    This is very similar to restricting based on the routing rules (e.g. using
-    `coreBuildPaths`) however the manifest is a subset of those and is used here
-    as the listed resources should be those for which we have added the pathogen
-    name to the manifest.
-    */
-    if (!this._coreDatasetFirstWords) {
-      this._coreDatasetFirstWords = new Set(
-        global?.availableDatasets?.core?.map((path) => path.split("/")[0]) || []
-      );
-    }
-    return this._coreDatasetFirstWords.has(name.split("/")[0])
+  filterFn() {
+
+    // TODO XXX
+    const _coreDatasetFirstWords = new Set(
+      global?.availableDatasets?.core?.map((path) => path.split("/")[0]) || []
+    );
+
+    const fn = ({
+      dataset: {
+        core([name, ]) {
+          /* Consult the manifest to and restrict our listed resources to those whose
+          _first words_ appear as a top-level key the manifest. Subsequent words
+          aren't checked, so datasets may be returned which aren't explicitly defined
+          in the manifest.
+
+          This is very similar to restricting based on the routing rules (e.g. using
+          `coreBuildPaths`) however the manifest is a subset of those and is used here
+          as the listed resources should be those for which we have added the pathogen
+          name to the manifest.
+          */
+          return _coreDatasetFirstWords.has(name.split("/")[0])
+        },
+        staging() {return true;},
+      },
+      intermediate: {
+        core() {return true;},
+      },
+    })[this.resourceType][this.sourceId];
+    if (fn!==undefined) return fn;
+    throw new InternalServerError(`Source "${this.sourceId}" + resource type "${this.resourceType} does not have a corresponding filter function`);
   }
 
-  pathPrefixBySource(name) {
+  pathPrefix() {
     /**
      * We separate out the "source part" from the "pathParts" part in our
      * routing logic, creating corresponding Source and Resource objects. Here
      * we go in the other direction. We could link the two approaches in the
      * future if it's felt this duplication is too brittle.
+     * Only the dataset/core and dataset/staging combinations define a prefix.
+     * @returns {string | undefined} undefined for every other combination
      */
-    switch (name) {
-      case "core":
-        return ""
-      case "staging":
-        return "staging/"
-      default:
-        throw new InternalServerError(`Source "${name}" does not have a corresponding prefix`)
-    }
+    const prefix = ({
+      dataset: {
+        core() {return "";},
+        staging() {return "staging/";},
+      },
+    })?.[this.resourceType]?.[this.sourceId]?.();
+    return prefix;
   }
 
+  pathVersions(_resources) {
+    const fn = ({
+      dataset() {
+        return Object.entries(_resources).map(([name, data]) => {
+          return [name, data.versions.map((v) => v.date)];
+        })
+      },
+      intermediate() {
+        return Object.entries(_resources).map(([name, data]) => {
+          return [name, Object.fromEntries((data.versions).map(({date, fileUrls}) => [date, fileUrls]))] // FIXME XXX
+        });
+      },
+    })[this.resourceType];
+    if (!fn) throw new InternalServerError(`Resource type "${this.resourceType} does not have a path version extractor`);
+    return fn();
+  }
 
   get data() {
     const _resources = resources?.[this.sourceId]?.[this.resourceType];
     if (!_resources) {
       throw new NotFound(`No resources exist for the provided source-id / resource-type`);
     }
-    if (this.resourceType !== 'dataset') {
-      throw new InternalServerError(`Resource listing is currently only implemented for datasets`);
-    }
-    const pathVersions = Object.fromEntries(
-      Object.entries(_resources).map(([name, data]) => {
-        return [name, data.versions.map((v) => v.date)];
-      })
-      .filter((d) => this.sourceId==='core' ? this.coreDatasetFilter(d) : true)
-    )
-    const d = {}
-    d[this.resourceType] = {}
+    const d = {};
+    d[this.resourceType] = {};
     d[this.resourceType][this.sourceId] = {
-      pathVersions,
-      pathPrefix: this.pathPrefixBySource(this.sourceId)
-    }
+      pathPrefix: this.pathPrefix(),
+      pathVersions: Object.fromEntries(
+        this.pathVersions(_resources)
+          .filter(this.filterFn())
+      )
+    };
     return d;
   }
 }

diff --git a/src/routing/listResources.js b/src/routing/listResources.js
@@ -15,6 +15,6 @@ import {listResources} from '../endpoints/index.js';
  * for some discussion about route name choices.
  */
 export function setup(app) {
-  app.routeAsync("/list-resources/:sourceName")
+  app.routeAsync("/list-resources/:sourceName/:resourceType")
     .getAsync(listResources.listResources);
 }
diff --git a/static-site/pages/pathogens/files.jsx b/static-site/pages/pathogens/files.jsx
@@ -0,0 +1,3 @@
+import dynamic from 'next/dynamic'
+const Index = dynamic(() => import("../../src/sections/core-files"), {ssr: false})
+export default Index;
diff --git a/static-site/src/components/ListResources/IndividualResource.jsx b/static-site/src/components/ListResources/IndividualResource.jsx
@@ -37,7 +37,23 @@ export const ResourceLink = styled.a`
   text-decoration: none !important;
 `;
 
+export const ResourceName = styled.span`
+  font-size: ${resourceFontSize}px;
+  font-family: monospace;
+  cursor: pointer;
+  white-space: pre; /* don't collapse back-to-back spaces */
+  color: ${(props) => props.$hovered ? LINK_HOVER_COLOR : LINK_COLOR} !important;
+  text-decoration: none !important;
+`;
+
 function Name({displayName, $hovered, href, topOfColumn}) {
+  if (!href) {
+    return (
+      <ResourceName $hovered={$hovered}>
+        {'• '}{($hovered||topOfColumn) ? displayName.hovered : displayName.default}
+      </ResourceName>
+    )
+  }
   return (
     <ResourceLink href={href} target="_blank" rel="noreferrer" $hovered={$hovered}>
       {'• '}{($hovered||topOfColumn) ? displayName.hovered : displayName.default}
@@ -106,24 +122,43 @@ export const IndividualResource = ({data, isMobile}) => {
     }
   }, []);
 
+  let summaryText;
+  if (data.versioned && !isMobile) {
+    summaryText = `${data.updateCadence.summary} (n=${data.nVersions})`;
+    if (data.fileCounts) {
+      const {min, max} = data.fileCounts;
+      if (min===max) {
+        summaryText += ` (${data.fileCounts.min} files)`
+      } else {
+        summaryText += ` (${data.fileCounts.min} - ${data.fileCounts.max} files)`
+      }
+    }
+  }
+
   return (
     <Container ref={ref}>
 
       <FlexRow>
 
         <TooltipWrapper description={`Last known update on ${data.lastUpdated}`}>
-          <ResourceLinkWrapper onShiftClick={() => setModal(data)}>
-            <Name displayName={data.displayName} href={data.url} topOfColumn={topOfColumn}/>
-          </ResourceLinkWrapper>
+          { data.url ? (
+            <ResourceLinkWrapper onShiftClick={() => setModal(data)}>
+              <Name displayName={data.displayName} href={data.url} topOfColumn={topOfColumn}/>
+            </ResourceLinkWrapper>
+          ) : (
+            <ResourceLinkWrapper onClick={() => setModal(data)}>
+              <Name displayName={data.displayName} topOfColumn={topOfColumn}/>
+            </ResourceLinkWrapper>
+          )}
         </TooltipWrapper>
 
-        {data.versioned && !isMobile && (
+        {summaryText && (
           <TooltipWrapper description={data.updateCadence.description +
             `<br/>Last known update on ${data.lastUpdated}` +
             `<br/>${data.nVersions} snapshots of this dataset available (click to see them)`}>
             <IconContainer
               Icon={MdHistory}
-              text={`${data.updateCadence.summary} (n=${data.nVersions})`}
+              text={summaryText}
               handleClick={() => setModal(data)}
             />
           </TooltipWrapper>
@@ -140,17 +175,21 @@ export const IndividualResource = ({data, isMobile}) => {
  * Wrapper component which monitors for mouse-over events and injects a
  * `hovered: boolean` prop into the child.
  */
-export const ResourceLinkWrapper = ({children, onShiftClick}) => {
+export const ResourceLinkWrapper = ({children, onClick, onShiftClick}) => {
   const [hovered, setHovered] = useState(false);
-  const onClick = (e) => {
-    if (e.shiftKey) {
+  const _onClick = (e) => {
+    if (e.shiftKey && onShiftClick) {
       onShiftClick();
       e.preventDefault(); // child elements (e.g. <a>) shouldn't receive the click
     }
+    if (onClick) {
+      onClick();
+      e.preventDefault(); // child elements (e.g. <a>) shouldn't receive the click
+    }
   };
   return (
     <div>
-      <div onMouseOver={() => setHovered(true)} onMouseOut={() => setHovered(false)} onClick={onClick}>
+      <div onMouseOver={() => setHovered(true)} onMouseOut={() => setHovered(false)} onClick={_onClick}>
         {React.cloneElement(children, { $hovered: hovered })}
       </div>
     </div>