-
Notifications
You must be signed in to change notification settings - Fork 49
/
main.js
121 lines (106 loc) · 5.05 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import { ArgumentParser } from 'argparse';
import fs from 'fs';
import { coreS3Data, stagingS3Data } from "./coreStagingS3.js";
import zlib from 'zlib';
import { promisify } from 'util';
import { ResourceIndexerError } from './errors.js';
// Promise-returning wrapper around zlib.gzip so callers can simply `await` the compressed Buffer.
const gzip = promisify(zlib.gzip);
/**
* We define a number of collections which each represent some listing of
* nextstrain resources. The actual details are deferred to the provided
* collection objects - e.g. they may represent a GitHub repo listing or an S3
* inventory. Each of these collections provides functions which allow items
* (files) across collections to be collected into a master list of resources
* using three identifiers: source, resourceType and resourcePath. The intention
* is for source to parallel the information in the corresponding Source
* (sub-)class and resourcePath to parallel the information in the Resource
* (sub-)class.
*
* Currently only sources {core, staging} and resource types {dataset,
* intermediate} are part of the index.
*
* As an example, the core WNV/NA (nextstrain.org/WNV/NA) dataset is indexed
* like so:
*
* core → dataset → WNV/NA → versions → [
* {date: "2021-04-08", fileUrls: {main: ...}},
* {date: "2019-08-30", fileUrls: {meta: ..., tree: ...}}
* ]
*
*/
// Every collection that is indexed; main() walks this list in order.
const COLLECTIONS = [coreS3Data, stagingS3Data];
/**
 * Declare the command line interface of the indexer and parse the current
 * process arguments with it.
 *
 * Arguments are registered in a fixed order from a table of
 * [flag, options] pairs; the `--collections` choices are derived from the
 * names of the entries in COLLECTIONS.
 *
 * @returns {object} whatever the parser's parseArgs() returns
 */
function parseArgs() {
  const description = `
Fetch file lists from a number of provided collections (e.g. S3 inventories) and collect them into
resources. Resources are organised in a hierarchical fashion via source → resourceType → resourcePath.
Each resource contains a list of available versions, where applicable.
The output JSON is intended for consumption by the nextstrain.org server.
For more verbose logging set a 'DEBUG=nextstrain:*' env variable.
`;
  const cli = new ArgumentParser({description});

  // One row per command line argument, in the order they are registered.
  const argumentTable = [
    ["--local", {
      action: 'storeTrue',
      help: 'Access a local copy of S3 inventories within ./devData/. See docstring of fetchInventoryLocal() for expected filenames.',
    }],
    ["--collections", {
      metavar: "<name>",
      type: "string",
      nargs: '+',
      choices: COLLECTIONS.map((c) => c.name),
      help: "Only fetch data from a subset of collections. Source names are those defined in COLLECTIONS",
    }],
    ["--resourceTypes", {
      metavar: "<name>",
      type: "string",
      nargs: '+',
      choices: ['dataset', 'intermediate'],
      help: "Only index data matching specified resource types",
    }],
    ["--save-inventories", {
      action: 'storeTrue',
      help: "Save the fetched inventories + manifest files to ./devData so that future invocations can use --local",
    }],
    ["--output", {metavar: "<json>", required: true}],
    ["--indent", {action: 'storeTrue', help: 'Indent the output JSON'}],
    ["--gzip", {action: 'storeTrue', help: 'GZip the output JSON'}],
  ];

  for (const [flag, options] of argumentTable) {
    cli.addArgument(flag, options);
  }

  return cli.parseArgs();
}
// Script entry point: parse the CLI arguments, run the indexer and report any
// failure. The message is always printed; a trace is added only for errors
// that are not ResourceIndexerError instances. Failures set exit code 2.
main(parseArgs()).catch(function reportFailure(error) {
  console.error(error.message);
  const isIndexerError = error instanceof ResourceIndexerError;
  if (!isIndexerError) {
    console.trace(error);
  }
  process.exitCode = 2;
});
/**
 * Build the resource index and write it to `args.output` as JSON.
 *
 * For each entry of COLLECTIONS (optionally restricted by `args.collections`)
 * the collected items are passed through the collection's `categorise`,
 * grouped by source → resourceType → resourcePath and turned into a resource
 * via the collection's `createResource`. Resources without any versions are
 * left out of the index.
 *
 * All intermediate and output dictionaries are null-prototype objects, so
 * keys that happen to match Object.prototype members (e.g. a resourcePath of
 * "constructor" or "toString") are indexed like any other key instead of
 * colliding with inherited properties.
 *
 * @param {object} args - parsed command line arguments
 * @param {boolean} [args.local] - forwarded to `collection.collect` as `local`
 * @param {boolean} [args.save_inventories] - forwarded to `collection.collect` as `save`
 * @param {string[]} [args.collections] - only process collections with these names
 * @param {string[]} [args.resourceTypes] - only index items with these resource types
 * @param {string} args.output - path of the file to write
 * @param {boolean} [args.indent] - indent the JSON with two spaces
 * @param {boolean} [args.gzip] - gzip the JSON before writing
 * @returns {Promise<void>}
 * @throws {ResourceIndexerError} when both `local` and `save_inventories` are set
 */
async function main(args) {
  if (args.local && args.save_inventories) {
    throw new ResourceIndexerError('Arguments --local and --save-inventories cannot be used together.');
  }

  const newDict = () => Object.create(null);
  // Return the container stored under `key`, creating it on first use.
  const childOf = (dict, key, makeChild) => {
    if (!(key in dict)) dict[key] = makeChild();
    return dict[key];
  };

  const resources = newDict();
  const restrictResourceTypes = args.resourceTypes ? new Set(args.resourceTypes) : false;

  for (const collection of COLLECTIONS) {
    if (args.collections && !args.collections.includes(collection.name)) {
      continue;
    }

    const collected = await collection.collect({local: args.local, save: args.save_inventories});
    const groupedObjects = collected
      // categorise is deliberately passed as-is (unbound, receiving map's
      // item/index/array arguments) to keep the existing calling convention.
      .map(collection.categorise)
      // A falsy categorisation means the item is not part of the index.
      .filter((item) => !!item)
      .filter((item) => restrictResourceTypes ? restrictResourceTypes.has(item.resourceType) : true)
      // Collect together all items ("files") based on their assigned resourceType & resourcePath
      .reduce((store, item) => {
        const {resourceType, resourcePath, source} = item;
        const byType = childOf(store, source, newDict);
        const byPath = childOf(byType, resourceType, newDict);
        childOf(byPath, resourcePath, () => []).push(item);
        return store;
      }, newDict());

    for (const source of Object.keys(groupedObjects)) {
      for (const resourceType of Object.keys(groupedObjects[source])) {
        for (const [resourcePath, items] of Object.entries(groupedObjects[source][resourceType])) {
          const resource = collection.createResource(resourceType, resourcePath, items);
          if (resource.versions.length === 0) continue;
          const byType = childOf(resources, source, newDict);
          childOf(byType, resourceType, newDict)[resourcePath] = resource;
        }
      }
    }
  }

  let output = JSON.stringify(resources, null, args.indent ? 2 : null);
  if (args.gzip) {
    output = await gzip(output);
  }
  fs.writeFileSync(args.output, output);
}