Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Unified search query syntax using the full-text search capabilities of the underlying DB #11635

Merged
merged 59 commits into from
Oct 25, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
d49af47
use websearch_to_tsquery
novocaine Dec 22, 2021
fea2848
Support fallback using plainto_tsquery
novocaine Dec 22, 2021
ccb0d6c
cleanup
novocaine Dec 23, 2021
409afd6
add tests for search_rooms
novocaine Dec 23, 2021
ac002cb
deflake
novocaine Dec 23, 2021
bbbb651
improve docstring
novocaine Dec 23, 2021
02defb2
pass tsquery_func to _find_highlights_in_postgres
novocaine Dec 23, 2021
2eef5ed
Add tests for sqlite
novocaine Dec 23, 2021
28dc642
Don't preprocess sqllite queries
novocaine Dec 23, 2021
8ac3ea1
Also test the size of "results", as its generated by a different quer…
novocaine Dec 23, 2021
5198cb2
fix comment
novocaine Dec 23, 2021
b3c7e0d
Use plainto_tsquery instead of crafting the query ourselves
novocaine Dec 23, 2021
10f181a
black
novocaine Dec 23, 2021
905f572
Merge branch 'develop' of github.com:matrix-org/synapse into use-webs…
novocaine Dec 23, 2021
05a0cdd
Add feature file
novocaine Dec 23, 2021
b0417e9
Use common cases for all tests
novocaine Dec 23, 2021
04c394b
Merge branch 'develop' of github.com:novocaine/synapse into use-webse…
novocaine Jan 10, 2022
4cfb506
Merge branch 'develop' into use-websearch_to_tsquery-for-fts
novocaine May 26, 2022
d30a211
fix for removal of get_datastore()
novocaine May 26, 2022
68cae26
isort
novocaine May 26, 2022
5dbb2c0
add migration
novocaine May 26, 2022
64dd357
comment
novocaine May 26, 2022
20ab98e
black
novocaine May 26, 2022
e5aa916
isort
novocaine May 26, 2022
7941d8a
flake8
novocaine May 26, 2022
f7362f1
import List for python 3.7
novocaine May 26, 2022
f1769f9
create a background job, and don't do anything if the tokenizer is al…
novocaine May 26, 2022
de07c83
When creating a new db, create it with tokenize=porter in the first p…
novocaine May 26, 2022
1170e07
black
novocaine May 27, 2022
b987203
Revert change to 25/fts.py
novocaine May 27, 2022
6ef1ef8
fix json import
novocaine May 27, 2022
63c4270
slightly neater quote formatting
novocaine May 27, 2022
0ddf6b1
fix missing space
novocaine May 30, 2022
9d77bc4
Add a parser to produce uniform results on all DBs
novocaine May 30, 2022
b8b2e28
address flake8
novocaine May 30, 2022
8506b21
give mypy a hint
novocaine May 30, 2022
e279be5
Fix phrase handling of "word"
novocaine May 30, 2022
6a7cf49
Add comment
novocaine May 30, 2022
92ad70a
Merge branch 'develop' into use-websearch_to_tsquery-for-fts
novocaine May 31, 2022
a5f298b
document negation via -
novocaine May 31, 2022
d6ed19e
Merge remote-tracking branch 'origin/develop' into use-websearch_to_t…
clokep Oct 17, 2022
c544409
Move the database schema to the updated directory.
clokep Oct 17, 2022
be742f1
Use a deque.
clokep Oct 17, 2022
52700b0
Add basic tests for _tokenize_query.
clokep Oct 17, 2022
5d9d183
Handle edge-cases.
clokep Oct 17, 2022
219321b
temp
clokep Oct 18, 2022
9166035
Fix phrase handling.
clokep Oct 18, 2022
a7fd4f6
Handle not with a space after.
clokep Oct 18, 2022
6751f5f
Use the int version number to check if the feature is supported.
clokep Oct 18, 2022
6e6ebd9
Lint
clokep Oct 18, 2022
3272819
Simplify schema delta.
clokep Oct 18, 2022
4993e1c
Add docstring.
clokep Oct 18, 2022
9b7bb08
Remove support for parens.
clokep Oct 24, 2022
13f5306
Fix edge cases with double quotes.
clokep Oct 24, 2022
62f6f18
Lint.
clokep Oct 24, 2022
abc56ad
Merge remote-tracking branch 'origin/develop' into use-websearch_to_t…
clokep Oct 24, 2022
b3755ae
Remove backwards compat code for Postgres < 11 since (almost) EOL.
clokep Oct 25, 2022
5e5fc8d
Simplify phrase handling.
clokep Oct 25, 2022
223580a
Fix tests on postgres 10.
clokep Oct 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/11635.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow use of postgres and sqllite full-text search operators in search queries.
74 changes: 45 additions & 29 deletions synapse/storage/databases/main/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import logging
import re
from collections import namedtuple
from typing import TYPE_CHECKING, Collection, Iterable, List, Optional, Set
from typing import TYPE_CHECKING, Collection, Iterable, List, Optional, Set, Tuple

from synapse.api.errors import SynapseError
from synapse.events import EventBase
Expand Down Expand Up @@ -389,8 +389,6 @@ async def search_msgs(self, room_ids, search_term, keys):
"""
clauses = []

search_query = _parse_query(self.database_engine, search_term)

args = []

# Make sure we don't explode because the person is in too many rooms.
Expand All @@ -412,20 +410,25 @@ async def search_msgs(self, room_ids, search_term, keys):
count_clauses = clauses

if isinstance(self.database_engine, PostgresEngine):
search_query, tsquery_func = _parse_query_for_pgsql(
search_term, self.database_engine
)
sql = (
"SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank,"
f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank,"
" room_id, event_id"
" FROM event_search"
" WHERE vector @@ to_tsquery('english', ?)"
f" WHERE vector @@ {tsquery_func}('english', ?)"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we move these to multiline strings while we're here 😇

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was planning to do a follow-up PR to update the entire module to multi-line strings. Would that be acceptable?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure!

)
args = [search_query, search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE vector @@ to_tsquery('english', ?)"
f" WHERE vector @@ {tsquery_func}('english', ?)"
)
count_args = [search_query] + count_args
elif isinstance(self.database_engine, Sqlite3Engine):
search_query = _parse_query_for_sqlite(search_term)

sql = (
"SELECT rank(matchinfo(event_search)) as rank, room_id, event_id"
" FROM event_search"
Expand All @@ -437,7 +440,7 @@ async def search_msgs(self, room_ids, search_term, keys):
"SELECT room_id, count(*) as count FROM event_search"
" WHERE value MATCH ?"
)
count_args = [search_term] + count_args
Copy link
Contributor Author

@novocaine novocaine Dec 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it was a bug to pass search_term here rather than search_query..?

count_args = [search_query] + count_args
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
Expand Down Expand Up @@ -469,7 +472,9 @@ async def search_msgs(self, room_ids, search_term, keys):

highlights = None
if isinstance(self.database_engine, PostgresEngine):
highlights = await self._find_highlights_in_postgres(search_query, events)
highlights = await self._find_highlights_in_postgres(
search_query, events, tsquery_func
)

count_sql += " GROUP BY room_id"

Expand Down Expand Up @@ -511,8 +516,6 @@ async def search_rooms(
"""
clauses = []

search_query = _parse_query(self.database_engine, search_term)

args = []

# Make sure we don't explode because the person is in too many rooms.
Expand Down Expand Up @@ -550,20 +553,24 @@ async def search_rooms(
args.extend([origin_server_ts, origin_server_ts, stream])

if isinstance(self.database_engine, PostgresEngine):
search_query, tsquery_func = _parse_query_for_pgsql(
search_term, self.database_engine
)
sql = (
"SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank,"
f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank,"
" origin_server_ts, stream_ordering, room_id, event_id"
" FROM event_search"
" WHERE vector @@ to_tsquery('english', ?) AND "
f" WHERE vector @@ {tsquery_func}('english', ?) AND "
)
args = [search_query, search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE vector @@ to_tsquery('english', ?) AND "
f" WHERE vector @@ {tsquery_func}('english', ?) AND "
)
count_args = [search_query] + count_args
elif isinstance(self.database_engine, Sqlite3Engine):

# We use CROSS JOIN here to ensure we use the right indexes.
# https://sqlite.org/optoverview.html#crossjoin
#
Expand All @@ -582,13 +589,14 @@ async def search_rooms(
" CROSS JOIN events USING (event_id)"
" WHERE "
)
search_query = _parse_query_for_sqlite(search_term)
args = [search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE value MATCH ? AND "
)
count_args = [search_term] + count_args
count_args = [search_query] + count_args
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
Expand Down Expand Up @@ -627,7 +635,9 @@ async def search_rooms(

highlights = None
if isinstance(self.database_engine, PostgresEngine):
highlights = await self._find_highlights_in_postgres(search_query, events)
highlights = await self._find_highlights_in_postgres(
search_query, events, tsquery_func
)

count_sql += " GROUP BY room_id"

Expand All @@ -653,7 +663,7 @@ async def search_rooms(
}

async def _find_highlights_in_postgres(
self, search_query: str, events: List[EventBase]
self, search_query: str, events: List[EventBase], tsquery_func: str
) -> Set[str]:
"""Given a list of events and a search term, return a list of words
that match from the content of the event.
Expand All @@ -664,6 +674,7 @@ async def _find_highlights_in_postgres(
Args:
search_query
events: A list of events
tsquery_func: The tsquery_* function to use when making queries

Returns:
A set of strings.
Expand Down Expand Up @@ -696,7 +707,7 @@ def f(txn):
while stop_sel in value:
stop_sel += ">"

query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % (
query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % (
_to_postgres_options(
{
"StartSel": start_sel,
Expand Down Expand Up @@ -727,20 +738,25 @@ def _to_postgres_options(options_dict):
return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)


def _parse_query(database_engine, search_term):
def _parse_query_for_sqlite(search_term: str) -> str:
"""Takes a plain unicode string from the user and converts it into a form
that can be passed to database.
We use this so that we can add prefix matching, which isn't something
that is supported by default.
that can be passed to sqllite's matchinfo().
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is currently a no-op, but we will probably want to add stuff here to handle certain cases e.g. #3024

"""
return search_term


# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
def _parse_query_for_pgsql(search_term: str, engine: PostgresEngine) -> Tuple[str, str]:
"""Selects a tsquery_* func to use and transforms the search_term into syntax appropriate for it.

Args:
search_term: A user supplied search query.
engine: The database engine.

Returns:
A tuple of (parsed search_term, tsquery func to use).
"""

if isinstance(database_engine, PostgresEngine):
return " & ".join(result + ":*" for result in results)
elif isinstance(database_engine, Sqlite3Engine):
return " & ".join(result + "*" for result in results)
if engine.supports_websearch_to_tsquery:
clokep marked this conversation as resolved.
Show resolved Hide resolved
return search_term, "websearch_to_tsquery"
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
return search_term, "plainto_tsquery"
4 changes: 4 additions & 0 deletions synapse/storage/engines/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ def supports_returning(self) -> bool:
"""Do we support the `RETURNING` clause in insert/update/delete?"""
return True

@property
def supports_websearch_to_tsquery(self) -> bool:
return int(self.server_version.split(".")[0]) >= 11

def is_deadlock(self, error):
if isinstance(error, self.module.DatabaseError):
# https://www.postgresql.org/docs/current/static/errcodes-appendix.html
Expand Down
131 changes: 131 additions & 0 deletions tests/storage/test_room_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple
from unittest.case import SkipTest
from unittest.mock import PropertyMock, patch

import synapse.rest.admin
from synapse.rest.client import login, room
from synapse.storage.databases.main import DataStore
from synapse.storage.engines import PostgresEngine
from synapse.storage.engines.sqlite import Sqlite3Engine

from tests.unittest import HomeserverTestCase

Expand Down Expand Up @@ -72,3 +78,128 @@ def test_null_byte(self):
)
if isinstance(store.database_engine, PostgresEngine):
self.assertIn("alice", result.get("highlights"))


class MessageSearchTest(HomeserverTestCase):
servlets = [
synapse.rest.admin.register_servlets_for_client_rest_resource,
login.register_servlets,
room.register_servlets,
]

PHRASE = "the quick brown fox jumps over the lazy dog"

def setUp(self):
super().setUp()

# Register a user and create a room, create some messages
self.register_user("alice", "password")
self.access_token = self.login("alice", "password")
self.room_id = self.helper.create_room_as("alice", tok=self.access_token)

# Send the phrase as a message and check it was created
response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token)
self.assertIn("event_id", response)

def _check_test_cases(
self, store: DataStore, cases: list[Tuple[str, bool]]
) -> None:
# Run all the test cases versus search_msgs
for query, has_results in cases:
result = self.get_success(
store.search_msgs([self.room_id], query, ["content.body"])
)
self.assertEquals(result["count"], 1 if has_results else 0, query)
self.assertEquals(len(result["results"]), 1 if has_results else 0, query)

# Run them again versus search_rooms
for query, has_results in cases:
result = self.get_success(
store.search_rooms([self.room_id], query, ["content.body"], 10)
)
self.assertEquals(result["count"], 1 if has_results else 0, query)
self.assertEquals(len(result["results"]), 1 if has_results else 0, query)

def test_postgres_web_search_for_phrase(self):
"""
Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery.
This test is skipped unless the postgres instance supports websearch_to_tsquery.
"""

store = self.hs.get_datastore()
if not isinstance(store.database_engine, PostgresEngine):
raise SkipTest("Test only applies when postgres is used as the database")

if not store.database_engine.supports_websearch_to_tsquery:
raise SkipTest(
"Test only applies when postgres supporting websearch_to_tsquery is used as the database"
)

cases = [
("brown", True),
("quick brown", True),
("brown quick", True),
('"brown quick"', False),
('"jumps over"', True),
('"quick fox"', False),
("furphy OR fox", True),
("nope OR doublenope", False),
("-fox", False),
("-nope", True),
]

self._check_test_cases(store, cases)

def test_postgres_non_web_search_for_phrase(self):
"""
Test postgres searching for phrases without using web search, which is used when websearch_to_tsquery isn't
supported by the current postgres version.
"""

store = self.hs.get_datastore()
if not isinstance(store.database_engine, PostgresEngine):
raise SkipTest("Test only applies when postgres is used as the database")

cases = [
("nope", False),
("brown", True),
("quick brown", True),
("brown quick", True),
("brown nope", False),
(
"furphy OR fox",
False,
), # syntax not supported, OR will be ignored as it'll be between &
('"jumps over"', True), # syntax not supported, quotes are ignored
("-nope", False), # syntax not supported, - will be ignored
]

# Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path.
with patch(
"synapse.storage.engines.postgres.PostgresEngine.supports_websearch_to_tsquery",
new_callable=PropertyMock,
) as supports_websearch_to_tsquery:
supports_websearch_to_tsquery.return_value = False
self._check_test_cases(store, cases)

def test_sqlite_search(self):
"""
Test sqlite searching for phrases.
"""
store = self.hs.get_datastore()
if not isinstance(store.database_engine, Sqlite3Engine):
raise SkipTest("Test only applies when sqlite is used as the database")

cases = [
("nope", False),
("brown", True),
("quick brown", True),
("brown quick", True),
("brown nope", False),
("furphy OR fox", True), # sqllite supports OR
('"jumps over"', True),
('"quick fox"', False), # syntax supports quotes
("fox NOT nope", True), # sqllite supports NOT
]

self._check_test_cases(store, cases)