Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP resource listing for core data files #828

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion env/production/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,5 +110,5 @@
"OIDC_GROUPS_CLAIM": "cognito:groups",
"SESSION_COOKIE_DOMAIN": "nextstrain.org",
"GROUPS_DATA_FILE": "groups.json",
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz"
"RESOURCE_INDEX": "./index.files.json.gz"
}
2 changes: 1 addition & 1 deletion env/testing/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,5 @@
"OIDC_USERNAME_CLAIM": "cognito:username",
"OIDC_GROUPS_CLAIM": "cognito:groups",
"GROUPS_DATA_FILE": "groups.json",
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz"
"RESOURCE_INDEX": "./index.files.json.gz"
}
Binary file added index.files.json.gz
Binary file not shown.
84 changes: 52 additions & 32 deletions resourceIndexer/coreStagingS3.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,41 +30,61 @@ function categoriseCoreObjects(item, staging) {
|| key.startsWith('datasets_')
) return false;

// On the core bucket, directory-like hierarchies are used for intermediate
// files. These intermediate files may include files which auspice can
// display, but nextstrain.org cannot map URLs to directory-like hierarchies.
// There are other resourceTypes here we may consider in the future -- e.g.
// model output JSONs
if (key.includes("/")) {
if (staging===true) return false;
if (key.startsWith('files/')) {
if (
key.includes('/archive/')
|| key.includes('/test/')
|| key.includes('/workflows/')
|| key.includes('/branch/')
|| key.includes('/trial/')
|| key.includes('/test-data/')
|| key.includes('jen_test/')
|| key.match(/\/nextclade-full-run-[\d-]+--UTC\//)
|| key.match(/\/\d{4}-\d{2}-\d{2}_results.json/) // forecasts-ncov
|| key.endsWith('.png') // forecasts-ncov
) {
return false;
}
item.resourceType = 'intermediate';
/* The ID is used for grouping. For a nextstrain.org dataset this would be
combined with the source to form a nextstrain URL, however that's not
applicable here. Instead we use the filepath information without the
leading 'files/' and without the (trailing) filename so that different
files in the same directory structure get grouped together. For instance,
files/ncov/open/x.json -> ncov/open */
item.resourcePath = key.split('/').slice(1, -1).join('/')
return item;
/* Intermediate files in the core bucket are many and varied, however we expect them
to follow the format specified in <https://docs.nextstrain.org/en/latest/reference/data-files.html>
At the moment we only consider "workflows", i.e. files in `/files/workflows/*`
as there are no "datasets" ("build files"?) intermediates.
The file name schema is:
/files
/workflows
{/workflow-repo} (matching github.com/nextstrain{/workflow-repo})
{/arbitrary-structure*}
/metadata.tsv.zst (etc)
/sequences.fasta.zst (etc)
For the current listing we filter out any files where "/arbitrary-structure*" matches
some hardcoded list in an attempt to filter out test runs which we don't want to surface.

We also include /files/ncov which predates the above structure design.

The reported resource ID does not include the "/files/workflows" prefix.

Redirects aren't considered when constructing the ID, so (e.g.) "monkeypox" and "mpox" are independent.
*/
const intermediateExcludePatterns = [
/\/branch\//,
/\/test\//,
/\/trial\//,
/\/trials\//,
/\/nextclade-full-run[\d-]+--UTC\//, /* We could detail versions via the datestamped filename if desired */
/\/\d{4}-\d{2}-\d{2}_results\.json/, // forecasts-ncov
/\.png$/, // forecasts-ncov
]
if ((key.startsWith("files/workflows/") || key.startsWith("files/ncov/")) && staging===false) {
for (const pattern of intermediateExcludePatterns) {
if (key.match(pattern)) return false;
}
return false;
item.resourceType = 'intermediate';
/* The ID is used for grouping. For a nextstrain.org dataset this would be
combined with the source to form a nextstrain URL, however that's not
applicable here. Instead we use the filepath information without the
leading 'files/' and without the (trailing) filename so that different
files in the same directory structure get grouped together. For instance:
* files/ncov/open/100k/metadata.tsv.xz -> ncov/open/100k
* files/workflows/zika/sequences.fasta.zst -> zika
*/
item.resourcePath = key
.replace(/^files\/ncov\//, "ncov/")
.replace(/^files\/workflows\//, "")
.replace(/\/[^\/]+$/, '')
return item;
}

/* All other files with a directory-like structure, including those on the
staging bucket, are ignored. Note that this removes files which don't conform
to the structure described above, as well as some files on the staging bucket.
*/
if (key.includes("/")) return false;

// Some filenames have a double underscore (presumably by mistake)
if (key.includes('__')) return false;

Expand Down
2 changes: 1 addition & 1 deletion src/endpoints/listResources.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { contentTypesProvided } from '../negotiate.js';
const listResourcesJson = async (req, res) => {
/* API currently only handles a limited range of sources / resource types.
ListResources will throw a HTTP error if they do not exist */
const resourceType = 'dataset';
const resourceType = req.params.resourceType;
const sourceName = req.params.sourceName;
const resources = new ListResources([sourceName], [resourceType]);
const data = {
Expand Down
107 changes: 67 additions & 40 deletions src/resourceIndex.js
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ async function updateResourceVersions() {
* ListResources is intended to respond to resource listing queries. The current
* implementation only handles a single source Id and single resource type, but
* this will be extended as needed.
*
* There's definitely an inheritance structure here, but I haven't spent time to
* really draw it out. So instead of polymorphism we use conditionals.
*/
class ListResources {
constructor(sourceIds, resourceTypes) {
Expand All @@ -148,63 +151,87 @@ class ListResources {
this.resourceType = resourceTypes[0];
}

coreDatasetFilter([name, ]) {
/* Consult the manifest and restrict our listed resources to those whose
_first words_ appear as a top-level key in the manifest. Subsequent words
aren't checked, so datasets may be returned which aren't explicitly defined
in the manifest.

This is very similar to restricting based on the routing rules (e.g. using
`coreBuildPaths`) however the manifest is a subset of those and is used here
as the listed resources should be those for which we have added the pathogen
name to the manifest.
*/
if (!this._coreDatasetFirstWords) {
this._coreDatasetFirstWords = new Set(
global?.availableDatasets?.core?.map((path) => path.split("/")[0]) || []
);
}
return this._coreDatasetFirstWords.has(name.split("/")[0])
filterFn() {

// TODO XXX
const _coreDatasetFirstWords = new Set(
global?.availableDatasets?.core?.map((path) => path.split("/")[0]) || []
);

const fn = ({
dataset: {
core([name, ]) {
/* Consult the manifest to and restrict our listed resources to those whose
_first words_ appear as a top-level key the manifest. Subsequent words
aren't checked, so datasets may be returned which aren't explicitly defined
in the manifest.

This is very similar to restricting based on the routing rules (e.g. using
`coreBuildPaths`) however the manifest is a subset of those and is used here
as the listed resources should be those for which we have added the pathogen
name to the manifest.
*/
return _coreDatasetFirstWords.has(name.split("/")[0])
},
staging() {return true;},
},
intermediate: {
core() {return true;},
},
})[this.resourceType][this.sourceId];
if (fn!==undefined) return fn;
throw new InternalServerError(`Source "${this.sourceId}" + resource type "${this.resourceType} does not have a corresponding filter function`);
}

pathPrefixBySource(name) {
pathPrefix() {
/**
* We separate out the "source part" from the "pathParts" part in our
* routing logic, creating corresponding Source and Resource objects. Here
* we go in the other direction. We could link the two approaches in the
* future if it's felt this duplication is too brittle.
*
* Returns string | undefined
*/
switch (name) {
case "core":
return ""
case "staging":
return "staging/"
default:
throw new InternalServerError(`Source "${name}" does not have a corresponding prefix`)
}
const prefix = ({
dataset: {
core() {return "";},
staging() {return "staging/";},
},
})?.[this.resourceType]?.[this.sourceId]?.();
return prefix;
}

pathVersions(_resources) {
const fn = ({
dataset() {
return Object.entries(_resources).map(([name, data]) => {
return [name, data.versions.map((v) => v.date)];
})
},
intermediate() {
return Object.entries(_resources).map(([name, data]) => {
return [name, Object.fromEntries((data.versions).map(({date, fileUrls}) => [date, fileUrls]))] // FIXME XXX
});
},
})[this.resourceType];
if (!fn) throw new InternalServerError(`Resource type "${this.resourceType} does not have a path version extractor`);
return fn();
}

get data() {
const _resources = resources?.[this.sourceId]?.[this.resourceType];
if (!_resources) {
throw new NotFound(`No resources exist for the provided source-id / resource-type`);
}
if (this.resourceType !== 'dataset') {
throw new InternalServerError(`Resource listing is currently only implemented for datasets`);
}
const pathVersions = Object.fromEntries(
Object.entries(_resources).map(([name, data]) => {
return [name, data.versions.map((v) => v.date)];
})
.filter((d) => this.sourceId==='core' ? this.coreDatasetFilter(d) : true)
)
const d = {}
d[this.resourceType] = {}
const d = {};
d[this.resourceType] = {};
d[this.resourceType][this.sourceId] = {
pathVersions,
pathPrefix: this.pathPrefixBySource(this.sourceId)
}
pathPrefix: this.pathPrefix(),
pathVersions: Object.fromEntries(
this.pathVersions(_resources)
.filter(this.filterFn())
)
};
return d;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/routing/listResources.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ import {listResources} from '../endpoints/index.js';
* for some discussion about route name choices.
*/
export function setup(app) {
app.routeAsync("/list-resources/:sourceName")
app.routeAsync("/list-resources/:sourceName/:resourceType")
.getAsync(listResources.listResources);
}
3 changes: 3 additions & 0 deletions static-site/pages/pathogens/files.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import dynamic from 'next/dynamic'
// Page component for this route: the core-files section is loaded through
// next/dynamic with `ssr: false`, i.e. it is not server-side rendered
// (see the Next.js dynamic import documentation).
const Index = dynamic(() => import("../../src/sections/core-files"), {ssr: false})
export default Index;
57 changes: 48 additions & 9 deletions static-site/src/components/ListResources/IndividualResource.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,23 @@ export const ResourceLink = styled.a`
text-decoration: none !important;
`;

/**
 * Monospace <span> used to display a resource name when there is no href to
 * link to (see `Name`). The text colour switches between LINK_COLOR and
 * LINK_HOVER_COLOR according to the `$hovered` prop.
 */
export const ResourceName = styled.span`
font-size: ${resourceFontSize}px;
font-family: monospace;
cursor: pointer;
white-space: pre; /* don't collapse back-to-back spaces */
color: ${(props) => props.$hovered ? LINK_HOVER_COLOR : LINK_COLOR} !important;
text-decoration: none !important;
`;

function Name({displayName, $hovered, href, topOfColumn}) {
if (!href) {
return (
<ResourceName $hovered={$hovered}>
{'• '}{($hovered||topOfColumn) ? displayName.hovered : displayName.default}
</ResourceName>
)
}
return (
<ResourceLink href={href} target="_blank" rel="noreferrer" $hovered={$hovered}>
{'• '}{($hovered||topOfColumn) ? displayName.hovered : displayName.default}
Expand Down Expand Up @@ -106,24 +122,43 @@ export const IndividualResource = ({data, isMobile}) => {
}
}, []);

let summaryText;
if (data.versioned && !isMobile) {
summaryText = `${data.updateCadence.summary} (n=${data.nVersions})`;
if (data.fileCounts) {
const {min, max} = data.fileCounts;
if (min===max) {
summaryText += ` (${data.fileCounts.min} files)`
} else {
summaryText += ` (${data.fileCounts.min} - ${data.fileCounts.max} files)`
}
}
}

return (
<Container ref={ref}>

<FlexRow>

<TooltipWrapper description={`Last known update on ${data.lastUpdated}`}>
<ResourceLinkWrapper onShiftClick={() => setModal(data)}>
<Name displayName={data.displayName} href={data.url} topOfColumn={topOfColumn}/>
</ResourceLinkWrapper>
{ data.url ? (
<ResourceLinkWrapper onShiftClick={() => setModal(data)}>
<Name displayName={data.displayName} href={data.url} topOfColumn={topOfColumn}/>
</ResourceLinkWrapper>
) : (
<ResourceLinkWrapper onClick={() => setModal(data)}>
<Name displayName={data.displayName} topOfColumn={topOfColumn}/>
</ResourceLinkWrapper>
)}
</TooltipWrapper>

{data.versioned && !isMobile && (
{summaryText && (
<TooltipWrapper description={data.updateCadence.description +
`<br/>Last known update on ${data.lastUpdated}` +
`<br/>${data.nVersions} snapshots of this dataset available (click to see them)`}>
<IconContainer
Icon={MdHistory}
text={`${data.updateCadence.summary} (n=${data.nVersions})`}
text={summaryText}
handleClick={() => setModal(data)}
/>
</TooltipWrapper>
Expand All @@ -140,17 +175,21 @@ export const IndividualResource = ({data, isMobile}) => {
* Wrapper component which monitors for mouse-over events and injects a
* `hovered: boolean` prop into the child.
*/
export const ResourceLinkWrapper = ({children, onShiftClick}) => {
export const ResourceLinkWrapper = ({children, onClick, onShiftClick}) => {
const [hovered, setHovered] = useState(false);
const onClick = (e) => {
if (e.shiftKey) {
const _onClick = (e) => {
if (e.shiftKey && onShiftClick) {
onShiftClick();
e.preventDefault(); // child elements (e.g. <a>) shouldn't receive the click
}
if (onClick) {
onClick();
e.preventDefault(); // child elements (e.g. <a>) shouldn't receive the click
}
};
return (
<div>
<div onMouseOver={() => setHovered(true)} onMouseOut={() => setHovered(false)} onClick={onClick}>
<div onMouseOver={() => setHovered(true)} onMouseOut={() => setHovered(false)} onClick={_onClick}>
{React.cloneElement(children, { $hovered: hovered })}
</div>
</div>
Expand Down
Loading