[do not merge] full of hacks, for testing purposes only #34615

Closed
Changes from all commits (44 commits):
- 7f59b33 starting point (pieh, Jan 20, 2022)
- d22f00e Get docker running (imjoshin, Jan 20, 2022)
- cdb008e Add yarn commands for docker (imjoshin, Jan 20, 2022)
- 435885a set initial mem limit, create lot of pages (pieh, Jan 21, 2022)
- cbb7315 Set up yarn commands for serve (imjoshin, Jan 21, 2022)
- 6793007 Merge branch 'memory-test-site' of github.com:gatsbyjs/gatsby into me… (imjoshin, Jan 21, 2022)
- 7b99cc8 Get debug commands working in docker (imjoshin, Jan 21, 2022)
- d6127b1 Add README (imjoshin, Jan 21, 2022)
- bde2d8c starting point (pieh, Jan 20, 2022)
- 3f8f47a Get docker running (imjoshin, Jan 20, 2022)
- 007ae95 Add yarn commands for docker (imjoshin, Jan 20, 2022)
- 48fa848 Set up yarn commands for serve (imjoshin, Jan 21, 2022)
- e3f83ea set initial mem limit, create lot of pages (pieh, Jan 21, 2022)
- c5f812f Get debug commands working in docker (imjoshin, Jan 21, 2022)
- bada30c Add README (imjoshin, Jan 21, 2022)
- 1d0c662 Add docker:stats to view status easier (imjoshin, Jan 21, 2022)
- bdb8c97 Merge branch 'memory-test-site' of github.com:gatsbyjs/gatsby into me… (imjoshin, Jan 24, 2022)
- 27f7171 Merge branch 'master' into memory-test-site (imjoshin, Jan 25, 2022)
- 7eff1b1 optimistic bump (pieh, Jan 24, 2022)
- 91aca6d .clear -> clearSync (pieh, Jan 24, 2022)
- abc1c35 tmp: patch lmdb so we can bundle it for engines (pieh, Jan 25, 2022)
- 9b010e6 adjust cache-resilience to import from lmdb (pieh, Jan 25, 2022)
- f42b723 drop unused parts of site (pieh, Jan 25, 2022)
- ae35ac0 show progress on node creation (pieh, Jan 25, 2022)
- 72ca8bc use GATSBY_PARALLEL_QUERY_CHUNK_SIZE if provided (pieh, Jan 25, 2022)
- 0c8c2ac don't use gatsby-dev inspired version of gatsby (pieh, Jan 25, 2022)
- bf3a1e5 mark nodes as dirty on each build (pieh, Jan 25, 2022)
- e037a6c consistently use same amount of workers 3 (+1 main process) (pieh, Jan 25, 2022)
- 60d0236 add TEMPLATES env var to control which queries to run (pieh, Jan 25, 2022)
- e5492b1 some additions to README (pieh, Jan 26, 2022)
- d470672 drop more unused things (pieh, Jan 26, 2022)
- 56f3c93 tmp: use ids instead of full nodes for fast filters (pieh, Jan 26, 2022)
- a23d809 Merge branch 'chore/upgrade/lmdb' into make-fast-filter-eq-not-retain… (pieh, Jan 26, 2022)
- 8862832 fix Node.js (pieh, Jan 25, 2022)
- e772878 fixup! fix Node.js (pieh, Jan 26, 2022)
- 1df57ef add template using materialization (pieh, Jan 26, 2022)
- 6e1b472 some materialization logs (pieh, Jan 26, 2022)
- c3e0f84 Remove a ton of calls to getNode (imjoshin, Jan 26, 2022)
- ff7cc94 Remove a ton of calls to getNode (imjoshin, Jan 26, 2022)
- d4f4c2d add collection query with sorting and skip/limit (pieh, Jan 27, 2022)
- 16c00c7 make IGatsbyNodeIdentifiers unique (pieh, Jan 27, 2022)
- 5f8b657 Move converting to actual ndoes outside of fast filters (imjoshin, Jan 27, 2022)
- 296d853 adjust fast filters tests (pieh, Jan 28, 2022)
- 38f1db8 low perf but should work (pieh, Jan 28, 2022)
23 changes: 23 additions & 0 deletions benchmarks/memory/.dockerignore
@@ -0,0 +1,23 @@
**/.classpath
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/*.*proj.user
**/*.dbmdl
**/*.jfm
**/charts
**/docker-compose*
**/compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
README.md
14 changes: 14 additions & 0 deletions benchmarks/memory/Dockerfile
@@ -0,0 +1,14 @@
FROM node:14-buster
ENV NODE_ENV=production
ENV CI=1
ENV GATSBY_CPU_COUNT=4
RUN apt-get update -y && apt-get upgrade -y && apt-get install git curl npm -y
RUN npm i -g gatsby-cli gatsby-dev-cli
WORKDIR /usr/src/app
RUN echo "\n\necho \"Welcome to the Gatsby Memory benchmark container!\\n - /usr/src/gatsby : Your local gatsby repo\\n - /usr/src/app : The memory benchmark gatsby site\\n\"" > /root/.bashrc

# set up gatsby-dev
RUN gatsby-dev --set-path-to-repo /usr/src/gatsby

# keep the process running
ENTRYPOINT ["tail", "-f", "/dev/null"]
97 changes: 97 additions & 0 deletions benchmarks/memory/README.md
@@ -0,0 +1,97 @@
# Gatsby Memory Benchmark

The goal of this benchmark is to test Gatsby's memory usage and look for potential optimizations.

## The Docker Container

The docker container used in these tests sets up a Debian instance with node 14 installed (as well as npm/yarn/etc).
It has ports 9000 (for hosting gatsby) and 9229 (for debugging) exposed.

Within the container, two paths from your local filesystem are mounted:

- /usr/src/gatsby : Your local gatsby repo
- /usr/src/site : The memory benchmark gatsby site
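
The exact flags live in the `docker:*` scripts in package.json (not shown in this section of the diff); the following is a sketch of the kind of invocation `yarn docker:start` presumably wraps, using the 2GB limit mentioned under Setup below (image and container names are illustrative assumptions):

```shell
# hypothetical sketch; the real command lives in the docker:* scripts
docker run -d \
  --name gatsby-memory-benchmark \
  --memory=2g \
  -p 9000:9000 -p 9229:9229 \
  -v /path/to/your/gatsby:/usr/src/gatsby \
  -v "$(pwd)":/usr/src/site \
  gatsby-memory-benchmark
```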

## Commands

### Docker

These commands are used for interfacing with docker and have built-in utilities for managing the docker container.

#### yarn docker:build

Builds the container used for testing.

#### yarn docker:start

Starts the container built by `yarn docker:build`.

#### yarn docker:connect

Connects to the container started by `yarn docker:start`.

#### yarn docker:start-and-connect

A shorthand for start + connect.

#### yarn docker:stop

Stops the container used for testing.

### Gatsby

These commands are used for interfacing with gatsby.

#### yarn gatsby:build

Simply an alias to `yarn gatsby build`.

#### yarn gatsby:serve

Starts `gatsby serve` on port 9000 and sets the host properly to work inside docker.

#### yarn gatsby:develop

Starts `gatsby develop` on port 9000 and sets the host properly to work inside docker.

#### yarn gatsby:build:debug

Runs `gatsby build` with `inspect-brk` set to start the [debugging process](https://www.gatsbyjs.com/docs/debugging-the-build-process/) on port 9229.

#### yarn gatsby:develop:debug

Runs `gatsby develop` with `inspect-brk` set to start the [debugging process](https://www.gatsbyjs.com/docs/debugging-the-build-process/) on port 9229.
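
Putting the commands together, a typical session might look like this (a sketch; it assumes the `docker:*` scripts run from this benchmark directory on the host, and the `gatsby:*` scripts run inside the container):

```shell
# on the host, in benchmarks/memory
yarn docker:build
yarn docker:start-and-connect

# now inside the container
yarn gatsby:build:debug   # then attach a debugger to port 9229

# back on the host, once done
yarn docker:stop
```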

## Setup

Currently we can reproduce builds crashing with our default settings:

- Docker container running with a 2GB memory limit
- 300 nodes x ~2MB each = ~600MB of "just" node data in each process (the number of nodes can be controlled with the `NUM_NODES` env var)
- 3 workers + the main process (`GATSBY_CPU_COUNT` is set to 4 in the docker image, but you can specify a different value with the env var)
- `eq_field` template using fast filters (a single `eq`, specifically)

The goal is to make the `eq_field` template not cause crashes, then add the next template (with a different operator) that causes crashes, and repeat until all queries can be handled within the set memory limits.
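
For example, a build that isolates the first template under explicit settings might look like this (a sketch; `NUM_NODES` and `TEMPLATES` are read in `gatsby-node.js` below, and `GATSBY_CPU_COUNT` is set to 4 in the Dockerfile):

```shell
# run inside the container; these values match the defaults described above
NUM_NODES=300 GATSBY_CPU_COUNT=4 TEMPLATES=eq_field yarn gatsby:build
```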

### Workflow

While the `gatsby-dev` command is available inside docker, in my testing it doesn't seem to pick up file changes when run there. A workflow that seems to work reliably:

When starting work on this benchmark:

- start `yarn watch` (possibly with `--scope`) in the monorepo
- start `gatsby-dev` outside of docker in the benchmark directory (just like with a regular site)
- `yarn docker:connect` to get inside docker
- `npm rebuild` to rebuild binaries inside docker

And repeat as many times as you want:

- make changes to `gatsby` source code as you normally would
- run `yarn build` inside docker
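
As a rough shell transcript, the loop above looks like this (a sketch of the listed steps, not an exact session):

```shell
# one-time setup
yarn watch            # in the monorepo root, possibly with --scope
gatsby-dev            # on the host, in benchmarks/memory
yarn docker:connect   # drop into the container
npm rebuild           # rebuild native binaries for the container

# repeat for every change to gatsby source
yarn build            # inside the container
```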

## Testing

TODO

- How to configure memory limits
- Where to look
3 changes: 3 additions & 0 deletions benchmarks/memory/gatsby-config.js
@@ -0,0 +1,3 @@
module.exports = {
plugins: [],
}
226 changes: 226 additions & 0 deletions benchmarks/memory/gatsby-node.js
@@ -0,0 +1,226 @@
const { cpuCoreCount } = require(`gatsby-core-utils`)

const NUM_NODES = parseInt(process.env.NUM_NODES || 300, 10)

const NUM_KEYS_IN_LARGE_SIZE_OBJ = 1024

exports.sourceNodes = async ({ actions, reporter }) => {
  const contentDigest = Date.now().toString() // make each sourcing mark everything as dirty

  const activity = reporter.createProgress(`Creating test nodes`, NUM_NODES)
  activity.start()

  for (let i = 0; i < NUM_NODES; i++) {
    const largeSizeObj = {}
    for (let j = 1; j <= NUM_KEYS_IN_LARGE_SIZE_OBJ; j++) {
      largeSizeObj[`key_${j}`] = `x`.repeat(1024)
    }

    // each node is ~2MB: 1024 keys x 1KB each in largeSizeObj, plus a 1MB largeSizeString
    const node = {
      id: `memory-${i}`,
      idClone: `memory-${i}`,
      fooBar: [`foo`, `bar`, `baz`, `foobar`][i % 4],
      number1: i,
      number2: NUM_NODES - i,
      number3: i % 20,
      largeSizeObj,
      largeSizeString: `x`.repeat(1024 * 1024),
      internal: {
        contentDigest,
        type: `Test`,
      },
    }

    actions.createNode(node)

    if (i % 100 === 99) {
      activity.tick(100)
      // yield to the event loop so progress output can flush
      await new Promise(resolve => setImmediate(resolve))
    }
  }

  activity.tick(NUM_NODES % 100)

  await new Promise(resolve => setTimeout(resolve, 100))

  activity.end()
}

exports.createSchemaCustomization = ({ actions, schema }) => {
  actions.createTypes([
    schema.buildObjectType({
      name: `TestLargeSizeObj`,
      fields: Object.fromEntries(
        new Array(NUM_KEYS_IN_LARGE_SIZE_OBJ)
          .fill(`String`)
          .map((value, index) => [`key_${index + 1}`, value])
      ),
    }),
    schema.buildObjectType({
      name: `Test`,
      fields: {
        idClone: `String`,
        fooBar: `String`,
        number1: `Int`,
        number2: `Int`,
        number3: `Int`,
        largeSizeString: `String`,
        largeSizeObj: `TestLargeSizeObj`,
        idCloneWithResolver: {
          type: `String`,
          resolve: source => {
            return source.idClone
          },
        },
      },
      interfaces: [`Node`],
      extensions: {
        infer: false,
      },
    }),
  ])
}

const printedMessages = new Set()
exports.createResolvers = ({ createResolvers }) => {
  createResolvers({
    Query: {
      workerInfo: {
        type: `String`,
        args: {
          label: `String!`,
        },
        resolve: (_, args) => {
          const msg = `${args.label} on ${
            process.env.GATSBY_WORKER_ID
              ? `worker #${process.env.GATSBY_WORKER_ID}`
              : `main`
          }`
          // only log each unique message once per process
          if (!printedMessages.has(msg)) {
            printedMessages.add(msg)
            console.log(msg)
          }
          return msg
        },
      },
    },
  })
}

const WORKER_BATCH_SIZE =
  Number(process.env.GATSBY_PARALLEL_QUERY_CHUNK_SIZE) || 50

let enabledTemplates = new Set()
exports.onPreBootstrap = () => {
  const availableTemplates = new Set([
    `eq_id`, // this should skip node-model and fast filters completely and should be very cheap already
    `eq_field`, // this needs fast filters for the eq operator on a non-id field
    `eq_field_with_resolver`, // this needs fast filters for the eq operator on a non-id field + materialization
    `ne_field_collection_sort_skip_limit`, // collection query to check the code path applying sorting and skip/limit
  ])
  enabledTemplates = new Set(
    process.env.TEMPLATES
      ? process.env.TEMPLATES.split(`,`).filter(template =>
          availableTemplates.has(template)
        )
      : availableTemplates
  )

  console.info(`Enabled templates`, enabledTemplates)
}

exports.createPages = async ({ actions, graphql }) => {
  const numWorkers = Math.max(1, cpuCoreCount() - 1)

  // we do want ALL available workers to execute each query type
  const minNumOfPagesToSaturateAllWorkers = WORKER_BATCH_SIZE * numWorkers

  const { data } = await graphql(`
    {
      allTest {
        nodes {
          id
          idClone
        }
      }
    }
  `)

  // we might need to "duplicate" pages if node count is less than the number of needed pages
  const repeatCount = Math.max(
    1,
    Math.ceil(minNumOfPagesToSaturateAllWorkers / data.allTest.nodes.length)
  )

  function createEnoughToSaturate(template, cb) {
    if (!enabledTemplates.has(template)) {
      return
    }
    console.log(`Creating pages with template "${template}"`)
    let counter = 0
    for (let i = 0; i < repeatCount; i++) {
      let j = 0
      for (const node of data.allTest.nodes) {
        const { context } = cb(node, j)

        actions.createPage({
          path: `/${template}/${counter++}`,
          component: require.resolve(`./src/templates/${template}`),
          context,
        })

        if (counter >= minNumOfPagesToSaturateAllWorkers) {
          break
        }

        j++
      }
    }
  }

  // fast path (eq: { id: x })
  createEnoughToSaturate(`eq_id`, node => {
    return {
      context: {
        id: node.id,
      },
    }
  })

  // (eq: { idClone: x })
  createEnoughToSaturate(`eq_field`, node => {
    return {
      context: {
        id: node.id,
      },
    }
  })

  // (eq: { idCloneWithResolver: x })
  createEnoughToSaturate(`eq_field_with_resolver`, node => {
    return {
      context: {
        id: node.id,
      },
    }
  })

  // allTest(
  //   filter: { idClone: { ne: $id } }
  //   sort: { fields: [number3], order: [ASC] }
  //   limit: 10
  //   skip: $skip
  // )
  createEnoughToSaturate(
    `ne_field_collection_sort_skip_limit`,
    (node, index) => {
      return {
        context: {
          id: node.id,
          // limit is set to 10, so cap skip to keep queries for the last nodes returning 10 items
          skip: Math.min(index, NUM_NODES - 10),
        },
      }
    }
  )
}
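
The `src/templates/*` components that `createPages` resolves are not part of this section of the diff. A minimal sketch of what `src/templates/eq_field.js` could look like, assuming the usual Gatsby page-query shape (hypothetical, for illustration only):

```js
// src/templates/eq_field.js (hypothetical sketch, not part of this diff)
import React from "react"
import { graphql } from "gatsby"

// Render as little as possible; the interesting work is the page query below.
export default function EqFieldTemplate({ data }) {
  return <pre>{data.test && data.test.id}</pre>
}

// `$id` comes from the page context set in createEnoughToSaturate;
// filtering on the non-id field `idClone` exercises the `eq` fast filter.
export const query = graphql`
  query ($id: String!) {
    test(idClone: { eq: $id }) {
      id
      idClone
    }
  }
`
```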