From f05ece64dc8f0382c2f4549036b1afe789807e2d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 5 Jun 2023 22:19:51 -0500 Subject: [PATCH 1/9] Add reason why the archive bot is joining the room Using the join `reason` added in MSC2367 Related to adding some better profile information, https://github.com/matrix-org/matrix-public-archive/issues/257#issuecomment-1570795922 --- docs/faq.md | 2 ++ server/lib/matrix-utils/ensure-room-joined.js | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/docs/faq.md b/docs/faq.md index fb5a8275..ba446b15 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -17,6 +17,8 @@ And with the introduction of the jump to date API via [MSC3030](https://github.com/matrix-org/matrix-spec-proposals/pull/3030), we could show messages from any given date and day-by-day navigation. +## Why did the archive bot join my room? + ## How do I opt out and keep my room from being indexed by search engines? All public Matrix rooms are accessible to view in the Matrix Public Archive. But only diff --git a/server/lib/matrix-utils/ensure-room-joined.js b/server/lib/matrix-utils/ensure-room-joined.js index fc92536a..aa7af3aa 100644 --- a/server/lib/matrix-utils/ensure-room-joined.js +++ b/server/lib/matrix-utils/ensure-room-joined.js @@ -43,6 +43,14 @@ async function ensureRoomJoined( method: 'POST', accessToken, abortSignal, + body: { + reason: + `Joining room to check history visibility. ` + + `If your room is public with shared or world readable history visibility, ` + + `it will be accessible at archive.matrix.org. 
` + + `See the FAQ for more details: ` + + `https://github.com/matrix-org/matrix-public-archive/blob/main/docs/faq.md#why-did-the-archive-bot-join-my-room`, + }, }); assert( joinData.room_id, From d7bf67645c2f2b76af7240cdf8665b591883e998 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 5 Jun 2023 22:44:01 -0500 Subject: [PATCH 2/9] Flesh out docs --- docs/faq.md | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index ba446b15..46d11261 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -19,19 +19,27 @@ messages from any given date and day-by-day navigation. ## Why did the archive bot join my room? -## How do I opt out and keep my room from being indexed by search engines? +Only public Matrix rooms with `shared` or `world_readable` history visibility are +accessible in the Matrix Public Archive. -All public Matrix rooms are accessible to view in the Matrix Public Archive. But only -rooms with history visibility set to `world_readable` are indexable by search engines. +But the archive bot (`@archive:matrix.org`) will join any public room because it doesn't +know the history visibility without first joining. Any room without `world_readable` or +`shared` [history +visibility](https://spec.matrix.org/v1.6/client-server-api/#room-history-visibility) +will lead a `403 Forbidden`. And if the public room is in the room directory, it will be +listed in the archive but will still lead to a `403 Forbidden` in that case. -Also see https://github.com/matrix-org/matrix-public-archive/issues/47 to track better -opt out controls. +The Matrix Public Archive doesn't hold onto any data (it's +stateless) and requests the messages from the homeserver every time. The +[archive.matrix.org](https://archive.matrix.org/) instance has some caching in place, 5 +minutes for the current day, and 2 days for past content. 
-For [archive.matrix.org](https://archive.matrix.org/), you can ban the -`@archive:matrix.org` user if you don't want your room content to be shown in the -archive at all. +The Matrix Public Archive only allows rooms with `world_readable` history visibility to +be indexed by search engines. See the [opt +out](#how-do-i-opt-out-and-keep-my-room-from-being-indexed-by-search-engines) topic +below for more details. -## Why does the archive user join rooms instead of browsing them as a guest? +### Why does the archive user join rooms instead of browsing them as a guest? Guests require `m.room.guest_access` to access a room. Most public rooms do not allow guests because even the `public_chat` preset when creating a room does not allow guest @@ -39,11 +47,22 @@ access. Not being able to view most public rooms is the major blocker on being a use guest access. The idea is if I can view the messages from a Matrix client as a random user, I should also be able to see the messages in the archive. -Keep in mind that only rooms with history visibility set to `world_readable` are -indexable by search engines. The Matrix Public Archive doesn't hold onto any data (it's -stateless) and requests the messages from the homeserver every time. The -[archive.matrix.org](https://archive.matrix.org/) instance has some caching in place, 5 -minutes for the current day, and 2 days for past content. +Guest access is also a much different ask than read-only access since guests can also +send messages in the room which isn't always desirable. The archive bot is read-only and +does not send messages. + +## How do I opt out and keep my room from being indexed by search engines? + +Only public Matrix rooms with `shared` or `world_readable` history visibility are +accessible to view in the Matrix Public Archive. But only rooms with history visibility +set to `world_readable` are indexable by search engines. 
+ +Also see https://github.com/matrix-org/matrix-public-archive/issues/47 to track better +opt out controls. + +As a workaround for [archive.matrix.org](https://archive.matrix.org/) today, you can ban +the `@archive:matrix.org` user if you don't want your room content to be shown in the +archive at all. ## Technical details From 62daca3defa25c6c24db12d5bd26446f34b1c0b3 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 5 Jun 2023 23:09:27 -0500 Subject: [PATCH 3/9] Move link to first reference --- docs/faq.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 46d11261..0f457ebc 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -19,15 +19,15 @@ messages from any given date and day-by-day navigation. ## Why did the archive bot join my room? -Only public Matrix rooms with `shared` or `world_readable` history visibility are +Only public Matrix rooms with `shared` or `world_readable` [history +visibility](https://spec.matrix.org/v1.6/client-server-api/#room-history-visibility) are accessible in the Matrix Public Archive. But the archive bot (`@archive:matrix.org`) will join any public room because it doesn't know the history visibility without first joining. Any room without `world_readable` or -`shared` [history -visibility](https://spec.matrix.org/v1.6/client-server-api/#room-history-visibility) -will lead a `403 Forbidden`. And if the public room is in the room directory, it will be -listed in the archive but will still lead to a `403 Forbidden` in that case. +`shared` history visibility will lead to a `403 Forbidden`. And if the public room is in +the room directory, it will be listed in the archive but will still lead to a `403 +Forbidden` in that case. The Matrix Public Archive doesn't hold onto any data (it's stateless) and requests the messages from the homeserver every time. 
The From 10e1f422668c327a2c7ea72bf7065015c25b6502 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 6 Jun 2023 11:37:20 -0500 Subject: [PATCH 4/9] Use latest spec See https://github.com/matrix-org/matrix-public-archive/pull/262#discussion_r1219131196 --- docs/faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/faq.md b/docs/faq.md index 0f457ebc..60267216 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -20,7 +20,7 @@ messages from any given date and day-by-day navigation. ## Why did the archive bot join my room? Only public Matrix rooms with `shared` or `world_readable` [history -visibility](https://spec.matrix.org/v1.6/client-server-api/#room-history-visibility) are +visibility](https://spec.matrix.org/v1.7/client-server-api/#room-history-visibility) are accessible in the Matrix Public Archive. But the archive bot (`@archive:matrix.org`) will join any public room because it doesn't From 3d4096a58262cdc5e9d3dd0654a0ad306b7d6ee1 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 6 Jun 2023 11:48:35 -0500 Subject: [PATCH 5/9] Use full archive URL for room in the reason See https://github.com/matrix-org/matrix-public-archive/pull/262#discussion_r1219137857 --- server/lib/matrix-utils/ensure-room-joined.js | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/server/lib/matrix-utils/ensure-room-joined.js b/server/lib/matrix-utils/ensure-room-joined.js index aa7af3aa..09826f43 100644 --- a/server/lib/matrix-utils/ensure-room-joined.js +++ b/server/lib/matrix-utils/ensure-room-joined.js @@ -3,14 +3,19 @@ const assert = require('assert'); const urlJoin = require('url-join'); +const StatusError = require('../errors/status-error'); const { fetchEndpointAsJson } = require('../fetch-endpoint'); const getServerNameFromMatrixRoomIdOrAlias = require('./get-server-name-from-matrix-room-id-or-alias'); +const MatrixPublicArchiveURLCreator = require('matrix-public-archive-shared/lib/url-creator'); const config = 
require('../config'); -const StatusError = require('../errors/status-error'); +const basePath = config.get('basePath'); +assert(basePath); const matrixServerUrl = config.get('matrixServerUrl'); assert(matrixServerUrl); +const matrixPublicArchiveURLCreator = new MatrixPublicArchiveURLCreator(basePath); + async function ensureRoomJoined( accessToken, roomIdOrAlias, @@ -47,7 +52,12 @@ async function ensureRoomJoined( reason: `Joining room to check history visibility. ` + `If your room is public with shared or world readable history visibility, ` + - `it will be accessible at archive.matrix.org. ` + + `it will be accessible at ${matrixPublicArchiveURLCreator.archiveUrlForRoom( + roomIdOrAlias + // We don't need to include the `viaServers` option here because the archive + // will already be joined to the room from this request itself and we don't + // need to make the URL any longer/noisier than it needs to be. + )}. ` + `See the FAQ for more details: ` + `https://github.com/matrix-org/matrix-public-archive/blob/main/docs/faq.md#why-did-the-archive-bot-join-my-room`, }, From 2a4a650c779700284cab13ab31f73ed2524e21d9 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 6 Jun 2023 11:56:15 -0500 Subject: [PATCH 6/9] Explain API values to options you might see in the UI See https://github.com/matrix-org/matrix-public-archive/pull/262#discussion_r1219139064 --- docs/faq.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/faq.md b/docs/faq.md index 60267216..b9e79aab 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -21,7 +21,10 @@ messages from any given date and day-by-day navigation. Only public Matrix rooms with `shared` or `world_readable` [history visibility](https://spec.matrix.org/v1.7/client-server-api/#room-history-visibility) are -accessible in the Matrix Public Archive. +accessible in the Matrix Public Archive. 
In some clients like Element, the `shared` +option equates to "Members only (since the point in time of selecting this option)" and +`world_readable` to "Anyone" under the **room settings** -> **Security & Privacy** -> +**Who can read history?**. But the archive bot (`@archive:matrix.org`) will join any public room because it doesn't know the history visibility without first joining. Any room without `world_readable` or From 81d182598def10294c070feabb1a4621b9e71cef Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 7 Jun 2023 12:13:50 -0500 Subject: [PATCH 7/9] Link to the `latest` version of the spec Co-authored-by: Aminda Suomalainen --- docs/faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/faq.md b/docs/faq.md index b9e79aab..3c5fda0b 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -20,7 +20,7 @@ messages from any given date and day-by-day navigation. ## Why did the archive bot join my room? Only public Matrix rooms with `shared` or `world_readable` [history -visibility](https://spec.matrix.org/v1.7/client-server-api/#room-history-visibility) are +visibility](https://spec.matrix.org/latest/client-server-api/#room-history-visibility) are accessible in the Matrix Public Archive. 
In some clients like Element, the `shared` option equates to "Members only (since the point in time of selecting this option)" and `world_readable` to "Anyone" under the **room settings** -> **Security & Privacy** -> From 53019dc37898cc39a16614d88f8fe0fdfe33b7f9 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 7 Jun 2023 12:45:17 -0500 Subject: [PATCH 8/9] Fix tests having extra join event because the join event content didn't match --- test/e2e-tests.js | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/e2e-tests.js b/test/e2e-tests.js index 9f21fdfd..64bceb67 100644 --- a/test/e2e-tests.js +++ b/test/e2e-tests.js @@ -14,6 +14,7 @@ const chalk = require('chalk'); const RethrownError = require('../server/lib/errors/rethrown-error'); const MatrixPublicArchiveURLCreator = require('matrix-public-archive-shared/lib/url-creator'); const { fetchEndpointAsText, fetchEndpointAsJson } = require('../server/lib/fetch-endpoint'); +const ensureRoomJoined = require('../server/lib/matrix-utils/ensure-room-joined'); const config = require('../server/lib/config'); const { MS_LOOKUP, @@ -999,9 +1000,13 @@ describe('matrix-public-archive', () => { // avoid problems jumping to the latest activity since we can't control the // timestamp of the membership event. 
const archiveAppServiceUserClient = await getTestClientForAs(); - await joinRoom({ - client: archiveAppServiceUserClient, - roomId: roomId, + // We use `ensureRoomJoined` instead of `joinRoom` because we're joining + // the archive user here and want the same join `reason` to avoid a new + // state event being created (`joinRoom` -> `{ displayname, membership }` + // whereas `ensureRoomJoined` -> `{ reason, displayname, membership }`) + await ensureRoomJoined({ + accessToken: archiveAppServiceUserClient.accessToken, + roomIdOrAlias: roomId, }); // Just spread things out a bit so the event times are more obvious From f466b9abb521efc61229c6c10cee5b6b82ddb793 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 7 Jun 2023 13:16:01 -0500 Subject: [PATCH 9/9] Fix wrong usage of `ensureRoomJoined()` --- test/e2e-tests.js | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/e2e-tests.js b/test/e2e-tests.js index 64bceb67..f7b4bc2b 100644 --- a/test/e2e-tests.js +++ b/test/e2e-tests.js @@ -1004,10 +1004,7 @@ describe('matrix-public-archive', () => { // the archive user here and want the same join `reason` to avoid a new // state event being created (`joinRoom` -> `{ displayname, membership }` // whereas `ensureRoomJoined` -> `{ reason, displayname, membership }`) - await ensureRoomJoined({ - accessToken: archiveAppServiceUserClient.accessToken, - roomIdOrAlias: roomId, - }); + await ensureRoomJoined(archiveAppServiceUserClient.accessToken, roomId); // Just spread things out a bit so the event times are more obvious // and stand out from each other while debugging and so we just have