From 86ea4c39301b19710137b7cea7791a3e7640e2ad Mon Sep 17 00:00:00 2001
From: Daniel Holth <dholth@anaconda.com>
Date: Tue, 29 Aug 2023 11:23:16 -0400
Subject: [PATCH] document schema (#113)

* document schema
---
 docs/database.md | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 docs/index.md    |  1 +
 2 files changed, 93 insertions(+)
 create mode 100644 docs/database.md
diff --git a/docs/database.md b/docs/database.md
new file mode 100644
index 0000000..ddbfe05
--- /dev/null
+++ b/docs/database.md
@@ -0,0 +1,92 @@
+# Database schema
+
+Standalone conda-index uses a per-subdir sqlite database to track package
+metadata, unlike the older version, which used millions of tiny `.json` files.
+The new strategy is much faster because we don't have to pay for many individual
+`stat()` or `open()` calls.
+
+The whole schema looks like this:
+
+```sql
+<subdir>/.cache % sqlite3 cache.db
+SQLite version 3.41.2 2023-03-22 11:56:21
+Enter ".help" for usage hints.
+sqlite> .schema
+CREATE TABLE about (path TEXT PRIMARY KEY, about BLOB);
+CREATE TABLE index_json (path TEXT PRIMARY KEY, index_json BLOB);
+CREATE TABLE recipe (path TEXT PRIMARY KEY, recipe BLOB);
+CREATE TABLE recipe_log (path TEXT PRIMARY KEY, recipe_log BLOB);
+CREATE TABLE run_exports (path TEXT PRIMARY KEY, run_exports BLOB);
+CREATE TABLE post_install (path TEXT PRIMARY KEY, post_install BLOB);
+CREATE TABLE icon (path TEXT PRIMARY KEY, icon_png BLOB);
+CREATE TABLE stat (
+                stage TEXT NOT NULL DEFAULT 'indexed',
+                path TEXT NOT NULL,
+                mtime NUMBER,
+                size INTEGER,
+                sha256 TEXT,
+                md5 TEXT,
+                last_modified TEXT,
+                etag TEXT
+            );
+CREATE UNIQUE INDEX idx_stat ON stat (path, stage);
+CREATE INDEX idx_stat_stage ON stat (stage, path);
+```
+
+```sql
+sqlite> select stage, path from stat where path like 'libcurl%';
+fs|libcurl-7.84.0-hc6d1d07_0.conda
+fs|libcurl-7.86.0-h0f1d93c_0.conda
+fs|libcurl-7.87.0-h0f1d93c_0.conda
+fs|libcurl-7.88.1-h0f1d93c_0.conda
+fs|libcurl-7.88.1-h9049daf_0.conda
+indexed|libcurl-7.84.0-hc6d1d07_0.conda
+indexed|libcurl-7.86.0-h0f1d93c_0.conda
+indexed|libcurl-7.87.0-h0f1d93c_0.conda
+indexed|libcurl-7.88.1-h0f1d93c_0.conda
+indexed|libcurl-7.88.1-h9049daf_0.conda
+```
+
+Most of these tables store json-format metadata extracted from each package.
+
+```sql
+select * from index_json where path = 'libcurl-7.88.1-h9049daf_0.conda';
+libcurl-7.88.1-h9049daf_0.conda|{"build":"h9049daf_0","build_number":0,"depends":["krb5 >=1.20.1,<1.21.0a0","libnghttp2 >=1.51.0,<2.0a0","libssh2 >=1.10.0,<2.0a0","libzlib >=1.2.13,<1.3.0a0","openssl >=3.0.8,<4.0a0"],"license":"curl","license_family":"MIT","name":"libcurl","subdir":"osx-arm64","timestamp":1676918523934,"version":"7.88.1","md5":"c86bbee944bb640609670ce722fba9a4","sha256":"37b8d58c05386ac55d1d8e196c90b92b0a63f3f1fe2fa916bf5ed3e1656d8e14","size":321706}
+```
+
+To track whether a package is indexed in the cache or not, conda-index uses a
+table named `stat`. The main point of this table is to assign a stage value to
+each artifact filename: usually `'fs'`, which is called the `upstream` stage, or
+`'indexed'`. `'fs'` means that the artifact is now available in the set of
+packages (assumed by default to be the local filesystem). `'indexed'` means that
+the entry already exists in the database (same filename, same timestamp, same
+hash), and its package metadata has been extracted to the `index_json` etc.
+tables. Paths in `'fs'` but not in `'indexed'` need to be unpacked to have their
+metadata added to the database. Paths in `'indexed'` but not in `'fs'` will be
+ignored and left out of `repodata.json`.
+
+First, conda-index adds all files in a subdir to the `upstream` stage. This
+involves a `listdir()` and `stat()` for each file in the index. The default
+`upstream` stage is named `fs`, but this step is designed to be overridden by
+subclassing `CondaIndexCache()` and replacing the `save_fs_state()` and
+`changed_packages()` methods. By overriding `CondaIndexCache()` it is possible
+to index without calling `stat()` on each package, or without even having all
+packages stored on the indexing machine.
+
+Next, conda-index looks for all `changed_packages()`: paths in the `upstream`
+(`fs`) stage that don't exist in, or have a different modification time than
+those in, the `indexed` stage.
+
+Finally, a join between the `upstream` stage, usually `'fs'`, and the
+`index_json` table yields a basic `repodata_from_packages.json` without any
+repodata patches.
+
+```sql
+SELECT path, index_json FROM stat JOIN index_json USING (path) WHERE stat.stage = :upstream_stage
+```
+
+The steps to create `repodata.json`, including any repodata patches, and to
+create `current_repodata.json` with only the latest versions of each package,
+are similar to pre-sqlite3 conda-index.
+
+The other cached metadata tables are used to create `channeldata.json`.
diff --git a/docs/index.md b/docs/index.md
index d15947e..f281189 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -8,6 +8,7 @@
 :maxdepth: 2
 cli
 modules
+database
 ```
 
 # Indices and tables