Add escapexmlfragment parameter for usecontent=orig.

usecontent=orig cuts a fragment from the original input data (instead of reconstructing a version from the forward index). If this is not properly escaped, it can cause issues. The escapexmlfragment parameter will enclose the snippet in a CDATA section. The default value is false for API v4, but will become true starting with API v5. Squashed commit of the following: commit 8f95719a7a1a983da1896321a7ff4da251627f31 Author: Jan Niestadt <jan.niestadt@ivdnt.org> Date: Fri Apr 5 13:18:42 2024 +0200 Rename parameter to escapexmlfragment. commit 0b7ae92efa5dea96f4f28284b762ee55daed71df Author: Jan Niestadt <jan.niestadt@ivdnt.org> Date: Fri Apr 5 13:13:59 2024 +0200 Allow XML fragments from usecontent=orig to be escaped as CDATA. In API v4 this feature must be requested explicitly using xmlfragmentcdata=true. Starting from API v5 (e.g. the /corpora/... URLs), this feature is enabled by default and must be explicitly disabled if desired (for backward compatibility).
INL · Jul 1, 2024 · d99ac39 · d99ac39
1 parent ca081a4
commit d99ac39
Show file tree

Hide file tree

Showing 11 changed files with 94 additions and 34 deletions.
diff --git a/server/src/main/java/nl/inl/blacklab/server/BlackLabServer.java b/server/src/main/java/nl/inl/blacklab/server/BlackLabServer.java
@@ -60,6 +60,12 @@ public class BlackLabServer extends HttpServlet {
 
     private static final String CONFIG_FILE_NAME = "blacklab-server";
 
+    /** Pretty-print the response? */
+    public static final String PARAM_PRETTYPRINT = "prettyprint";
+
+    /** Escape XML fragments from the document as CDATA (true), or include them as part of the XML structure (false)? */
+    public static final String PARAM_ESCAPE_XML_FRAGMENT = "escapexmlfragment";
+
     /** Manages all our searches */
     private SearchManager searchManager;
 
@@ -205,7 +211,7 @@ private void handleRequest(HttpServletRequest request, HttpServletResponse respo
         try {
             ensureSearchManagerAvailable();
         } catch (BlackLabRuntimeException | BlsException e) {
-            boolean prettyPrint = ServletUtil.getParameter(request, "prettyprint", true);
+            boolean prettyPrint = ServletUtil.getParameter(request, PARAM_PRETTYPRINT, true);
             String strApiVersion = ServletUtil.getParameter(request, WebserviceParameter.API_VERSION.value(),
                     ApiVersion.CURRENT.toString());
             ApiVersion apiVersion = ApiVersion.fromValue(strApiVersion);
@@ -244,10 +250,16 @@ private void handleRequest(HttpServletRequest request, HttpServletResponse respo
         String rootEl = requestHandler.omitBlackLabResponseRootElement() ? null : ResponseStreamer.BLACKLAB_RESPONSE_ROOT_ELEMENT;
 
         // === Handle the request
-        boolean prettyPrint = ServletUtil.getParameter(request, "prettyprint", userRequest.isDebugMode());
+        boolean prettyPrint = ServletUtil.getParameter(request, PARAM_PRETTYPRINT, userRequest.isDebugMode());
         ApiVersion api = requestHandler.apiCompatibility();
         DataStream ds = DataStreamAbstract.create(outputType, prettyPrint, api);
         ds.setOmitEmptyAnnotations(searchManager.config().getProtocol().isOmitEmptyProperties());
+        if (request.getParameterMap().containsKey(PARAM_ESCAPE_XML_FRAGMENT)) {
+            // We want to override whether XML fragments are output as CDATA or not
+            // (defaults to true for v5, false before)
+            boolean escapeXmlFragment = ServletUtil.getParameter(request, PARAM_ESCAPE_XML_FRAGMENT, true);
+            ds.setEscapeXmlFragment(escapeXmlFragment);
+        }
         ds.startDocument(rootEl);
         ResponseStreamer dstream = ResponseStreamer.get(ds, api);
         DataStream es = DataStreamAbstract.create(outputType, prettyPrint, api);

diff --git a/server/src/main/java/nl/inl/blacklab/server/datastream/DataStreamAbstract.java b/server/src/main/java/nl/inl/blacklab/server/datastream/DataStreamAbstract.java
@@ -148,6 +148,12 @@ public DataStream plain(String value) {
         return print(value);
     }
 
+    /* NOTE: the attrEntry methods that follow mirror the entry methods above.
+     *       Both sets of methods are intended only for entries in maps.
+     *       The attrEntry versions are specifically meant for the case where you're not sure
+     *       your keys are valid XML element names. They will use a different XML serialization using
+     *       an attribute for the key. */
+
     /**
      * Output an XML fragment, either as a string
      * value or as part of the XML structure.

diff --git a/server/src/main/java/nl/inl/blacklab/server/datastream/DataStreamXml.java b/server/src/main/java/nl/inl/blacklab/server/datastream/DataStreamXml.java
@@ -26,16 +26,29 @@ public class DataStreamXml extends DataStreamAbstract {
     /** Should contextList omit empty annotations if possible? */
     protected boolean omitEmptyAnnotations = false;
 
+    /** Should XML fragments from documents be escaped as CDATA? [true in v5, false before] */
+    protected boolean escapeXmlFragment;
+
     private ApiVersion api;
 
     @Override
     public void setOmitEmptyAnnotations(boolean omitEmptyAnnotations) {
         this.omitEmptyAnnotations = omitEmptyAnnotations;
     }
 
+    /** Set whether XML fragments from documents are escaped as CDATA in the XML response.
+     * Overrides the default that the constructor derives from the API version.
+     * @param escapeXmlFragment true to output fragments as CDATA sections, false to output them plain
+     */
+    @Override
+    public void setEscapeXmlFragment(boolean escapeXmlFragment) {
+        this.escapeXmlFragment = escapeXmlFragment;
+    }
+
     public DataStreamXml(boolean prettyPrint, ApiVersion api) {
         super(prettyPrint);
         this.api = api;
+        escapeXmlFragment = api.getMajor() >= 5; // default: escape fragments as CDATA from API v5 onward
     }
 
     public DataStream startOpenEl(String name) {
@@ -472,19 +485,34 @@ public DataStream value(boolean value) {
 
     /**
      * Output an XML fragment, either as a string
-     * value or as part of the XML structure.
+     * value (CDATA) or as part of the XML structure.
      *
      * @param fragment
      * @return data stream
      */
     public DataStream xmlFragment(String fragment) {
-        // Because we're outputting XML, we output the fragment plain (unquoted or -escaped)
-        return plain(fragment);
+        if (escapeXmlFragment) {
+            // Escaping is enabled (the default from API v5): output the fragment wrapped in a CDATA section
+            // (not part of the XML structure, but a string value that may contain XML)
+            return cdata(fragment);
+        } else {
+            // Because we're outputting XML, we output the fragment plain (unquoted or -escaped)
+            // (i.e. it becomes part of the XML structure, not a string value)
+            return plain(fragment);
+        }
     }
 
     @Override
     public String getType() {
         return "xml";
     }
 
+    private static final String CDATA_START = "<![CDATA[";
+    private static final String CDATA_END = "]]>";
+
+    private DataStream cdata(String value) {
+        // A CDATA section cannot contain "]]>", so split there: end the section after "]]" and start a new one before ">"
+        String escaped = value.replace("]]>", "]]" + CDATA_END + CDATA_START + ">");
+        return print(CDATA_START).print(escaped).print(CDATA_END);
+    }
 }
diff --git a/site/docs/development/api-redesign/README.md b/site/docs/development/api-redesign/README.md
@@ -66,9 +66,6 @@ DONE IN /corpora ENDPOINTS (e.g. v5):
 
 DONE API v5:
 - remove `/blacklab-server/CORPUSNAME` endpoints.
-
-
-TODO /corpora ENDPOINTS:
 - XML: When using `usecontent=orig`, don't make the content part of the XML anymore.<br>
   (escape it using CDATA (again, same as in JSON). Also consider just returning both
   the FI concordances as well as the original content (if requested), so the response

diff --git a/site/docs/development/customization/legacy-docindexers.md b/site/docs/development/customization/legacy-docindexers.md
@@ -65,7 +65,7 @@ annotation. You should probably have a forward index for at least the word and p
 
 A note about forward indices and indexing multiple values at a single corpus position: as of right now, the forward index will only store the first value indexed at any position. We would like to expand this so that it is possible to quickly retrieve all values indexed at a corpus position, but that is not the case now.
 
-Note that if you want KWICs or snippets that include annotations without a forward index (as well the rest of the original XML), you can switch to using the original XML to generate KWICs and snippets, at the cost of speed. To do this, pass usecontent=orig to BlackLab Server, or call Hits.settings().setConcordanceType(ConcordanceType.CONTENT_STORE).
+Note that if you want KWICs or snippets that include annotations without a forward index (as well as the rest of the original XML), you can switch to using the original XML to generate KWICs and snippets, at the cost of speed. To do this, pass `usecontent=orig` to BlackLab Server, or call `Hits.settings().setConcordanceType(ConcordanceType.CONTENT_STORE)`.
 
 ## Custom DocIndexers
 

diff --git a/site/docs/server/rest-api/api-versions.md b/site/docs/server/rest-api/api-versions.md
@@ -57,12 +57,14 @@ To prepare for API version 5.0 (which will likely be the default in BlackLab 5.0
   - In addition to `captureGroups`, `matchInfos` will be reported that includes the same information as well as any inline tags and relations matched. You should use this instead of `captureGroups` for future compatibility.
   - `before`/`after` are the new, preferred alternatives to `left`/`right`,e.g. when sorting/grouping on context. Not all languages are LTR, so this makes more sense. Existing endpoints still use `left`/`right` in the response for compatibility, but new endpoints have been updated as well. These properties can now get a number of tokens as an extra parameter, e.g. `before:lemma:i:2`.
   - For grouping on context, `wordleft`/`wordright` have been deprecated. Use `before`/`after` with 1 token instead.
+  - Grouped results also include a `properties` key that gives the values for the individual properties used for grouping. This is an alternative to `identityDisplay`, which clients would sometimes have to parse to display it in their preferred way.
   - `context` is the new name for the `wordsaroundhit` parameter and supports more options (separate before/after, whole sentence, etc.)
 - Pages that list values for fields, tags, etc. now support the `limitvalues` parameter. This parameter defaults to `200`, but can be set higher if you need really long value lists.
 - New endpoints were added for all operations on corpora, at `/corpora/CORPUSNAME/...` (for now alongside existing endpoints `/CORPUSNAME`). These endpoints are available in BlackLab v4 but only "speak" API v5 (see below). You should move to these endpoints for future compatibility.
 - A new endpoint `/parse-pattern` was added that allows you to parse a CorpusQL or JSON query structure pattern without actually executing the search.
 - A new endpoint `.../CORPUSNAME/relations` that will return all the spans ("inline tags") and relations indexed in the corpus.
 - Doc info on results pages and document info page: the new `tokenCounts` array gives token counts for all annotated fields. The first annotated field is the main one, which has the same value as `lengthInTokens`.
+- When using `usecontent=orig`, you can now specify `escapexmlfragment` to control whether XML fragments are escaped as CDATA or not. This defaults to `false` for API v4 and older, but will be `true` for API v5+.
 
 ### Deprecated
 
@@ -108,6 +110,8 @@ These are breaking changes compared to v4.0. Make sure you update your client ac
     - `summary` has been restructured to group related values together. Keys have been renamed for clarity.
     - response keys `left`/`right` have been replaced with `before`/`after` in the `/hits` response.
     - `docInfos` now have a `metadata` subobject instead of mixing metadata with `mayView` and `lengthInTokens`.
+    - When using `usecontent=orig`, the value of `escapexmlfragment` now defaults to `true`, so XML fragments from the document will be escaped as CDATA. Set it to `false` to include them as part of the XML structure instead (the old default).
+
 
 ## API support roadmap