diff --git a/x-pack/plugins/ml/common/api_schemas/json_schema_schema.ts b/x-pack/plugins/ml/common/api_schemas/json_schema_schema.ts
new file mode 100644
index 00000000000000..fec92343beb40c
--- /dev/null
+++ b/x-pack/plugins/ml/common/api_schemas/json_schema_schema.ts
@@ -0,0 +1,24 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { schema, type TypeOf } from '@kbn/config-schema';
+
+export const getJsonSchemaQuerySchema = schema.object({
+  /**
+   * ES API path
+   */
+  path: schema.oneOf([
+    schema.literal('/_ml/anomaly_detectors/{job_id}'),
+    schema.literal('/_ml/datafeeds/{datafeed_id}'),
+  ]),
+  /**
+   * API method
+   */
+  method: schema.string(),
+});
+
+export type SupportedPath = TypeOf<typeof getJsonSchemaQuerySchema>['path'];
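Aside: `schema.oneOf([schema.literal(...)])` gives this endpoint both runtime and compile-time safety. The route rejects unsupported paths at request time, while `TypeOf<typeof getJsonSchemaQuerySchema>['path']` derives the literal union `'/_ml/anomaly_detectors/{job_id}' | '/_ml/datafeeds/{datafeed_id}'` for client code. A minimal sketch of how the exported pieces behave (illustrative only, not part of the patch):

```ts
import { getJsonSchemaQuerySchema, type SupportedPath } from './json_schema_schema';

// Runtime: validate() throws for any path outside the two literals,
// so route handlers can trust the narrowed value.
const query = getJsonSchemaQuerySchema.validate({
  path: '/_ml/anomaly_detectors/{job_id}',
  method: 'put',
});

// Compile time: SupportedPath is the union of the schema.literal() values,
// so an unsupported path is a type error rather than a runtime surprise.
const path: SupportedPath = query.path;
```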
diff --git a/x-pack/plugins/ml/public/application/jobs/jobs_list/components/ml_job_editor/ml_job_editor.tsx b/x-pack/plugins/ml/public/application/jobs/jobs_list/components/ml_job_editor/ml_job_editor.tsx
index f1e4f79f5e7e11..449659b9f5dc5e 100644
--- a/x-pack/plugins/ml/public/application/jobs/jobs_list/components/ml_job_editor/ml_job_editor.tsx
+++ b/x-pack/plugins/ml/public/application/jobs/jobs_list/components/ml_job_editor/ml_job_editor.tsx
@@ -6,14 +6,13 @@
  */
 
 import React, { FC } from 'react';
-import { XJsonMode } from '@kbn/ace';
-
-import { EuiCodeEditor, XJson } from '@kbn/es-ui-shared-plugin/public';
-import type { EuiCodeEditorProps } from '@kbn/es-ui-shared-plugin/public';
+import { monaco, XJsonLang } from '@kbn/monaco';
+import { CodeEditor } from '@kbn/kibana-react-plugin/public';
+import { type EuiCodeEditorProps, XJson } from '@kbn/es-ui-shared-plugin/public';
 
 const { expandLiteralStrings } = XJson;
 
-export const ML_EDITOR_MODE = { TEXT: 'text', JSON: 'json', XJSON: new XJsonMode() };
+export const ML_EDITOR_MODE = { TEXT: 'text', JSON: 'json', XJSON: XJsonLang.ID };
 
 interface MlJobEditorProps {
   value: string;
@@ -25,6 +24,7 @@ interface MlJobEditorProps {
   theme?: string;
   onChange?: EuiCodeEditorProps['onChange'];
   'data-test-subj'?: string;
+  schema?: object;
 }
 export const MLJobEditor: FC<MlJobEditorProps> = ({
   value,
@@ -36,6 +36,7 @@ export const MLJobEditor: FC<MlJobEditorProps> = ({
   theme = 'textmate',
   onChange = () => {},
   'data-test-subj': dataTestSubj,
+  schema,
 }) => {
   if (mode === ML_EDITOR_MODE.XJSON) {
     try {
@@ -47,23 +48,35 @@
   }
   return (
-    {
+      const editorModelUri: string = editor.getModel()?.uri.toString()!;
+      if (schema) {
+        monaco.languages.json.jsonDefaults.setDiagnosticsOptions({
+          validate: true,
+          enableSchemaRequest: false,
+          schemaValidation: 'error',
+          schemas: [
+            ...(monaco.languages.json.jsonDefaults.diagnosticsOptions.schemas ?? []),
+            {
+              uri: editorModelUri,
+              fileMatch: [editorModelUri],
+              schema,
+            },
+          ],
+        });
+      }
+    }}
   />
 );
};
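The editor-mount callback above is where the fetched schema gets attached to the monaco JSON language service. `setDiagnosticsOptions` replaces the whole schema list, so the spread over `diagnosticsOptions.schemas` keeps schemas registered by other editors on the page, and `fileMatch: [editorModelUri]` scopes each schema to exactly one editor model. A standalone sketch of the same registration pattern (assuming only the stock monaco JSON defaults re-exported by `@kbn/monaco`; the de-duplication filter is an extra safeguard, not something the patch does):

```ts
import { monaco } from '@kbn/monaco';

function registerSchemaForEditor(editor: monaco.editor.IStandaloneCodeEditor, schema: object) {
  const modelUri = editor.getModel()?.uri.toString();
  if (!modelUri) return;

  const { jsonDefaults } = monaco.languages.json;
  jsonDefaults.setDiagnosticsOptions({
    validate: true,
    enableSchemaRequest: false, // schemas are supplied inline, never fetched by URI
    schemaValidation: 'error', // report violations as errors instead of warnings
    schemas: [
      // setDiagnosticsOptions overwrites the list, so carry existing entries over,
      // dropping any stale registration for this same model
      ...(jsonDefaults.diagnosticsOptions.schemas ?? []).filter((s) => s.uri !== modelUri),
      // fileMatch ties the schema to this one model; other editors are unaffected
      { uri: modelUri, fileMatch: [modelUri], schema },
    ],
  });
}
```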
diff --git a/x-pack/plugins/ml/public/application/jobs/new_job/pages/components/common/json_editor_flyout/json_editor_flyout.tsx b/x-pack/plugins/ml/public/application/jobs/new_job/pages/components/common/json_editor_flyout/json_editor_flyout.tsx
index b9e77a3a5f2732..5d4cbb1d422de7 100644
--- a/x-pack/plugins/ml/public/application/jobs/new_job/pages/components/common/json_editor_flyout/json_editor_flyout.tsx
+++ b/x-pack/plugins/ml/public/application/jobs/new_job/pages/components/common/json_editor_flyout/json_editor_flyout.tsx
@@ -8,6 +8,7 @@
 import React, { Fragment, FC, useState, useContext, useEffect, useMemo } from 'react';
 import { i18n } from '@kbn/i18n';
 import { FormattedMessage } from '@kbn/i18n-react';
+import { memoize } from 'lodash';
 import {
   EuiFlyout,
   EuiFlyoutFooter,
@@ -21,6 +22,7 @@ import {
   EuiCallOut,
 } from '@elastic/eui';
 import { XJson } from '@kbn/es-ui-shared-plugin/public';
+import { useMlApiContext } from '../../../../../../contexts/kibana';
 import { CombinedJob, Datafeed } from '../../../../../../../../common/types/anomaly_detection_jobs';
 import { ML_EDITOR_MODE, MLJobEditor } from '../../../../../jobs_list/components/ml_job_editor';
 import { isValidJson } from '../../../../../../../../common/util/validation_utils';
@@ -42,18 +44,32 @@ interface Props {
   jobEditorMode: EDITOR_MODE;
   datafeedEditorMode: EDITOR_MODE;
 }
+
+const fetchSchemas = memoize(
+  async (jsonSchemaApi, path: string, method: string) =>
+    jsonSchemaApi.getSchemaDefinition({
+      path,
+      method,
+    }),
+  (jsonSchemaApi, path, method) => path + method
+);
+
 export const JsonEditorFlyout: FC<Props> = ({ isDisabled, jobEditorMode, datafeedEditorMode }) => {
   const { jobCreator, jobCreatorUpdate, jobCreatorUpdated } = useContext(JobCreatorContext);
   const { displayErrorToast } = useToastNotificationService();
   const [showJsonFlyout, setShowJsonFlyout] = useState(false);
   const [showChangedIndicesWarning, setShowChangedIndicesWarning] = useState(false);
+  const { jsonSchema: jsonSchemaApi } = useMlApiContext();
+
   const [jobConfigString, setJobConfigString] = useState(jobCreator.formattedJobJson);
   const [datafeedConfigString, setDatafeedConfigString] = useState(
     jobCreator.formattedDatafeedJson
   );
   const [saveable, setSaveable] = useState(false);
   const [tempCombinedJob, setTempCombinedJob] = useState<CombinedJob | null>(null);
+  const [jobSchema, setJobSchema] = useState<object>();
+  const [datafeedSchema, setDatafeedSchema] = useState<object>();
 
   useEffect(() => {
     setJobConfigString(jobCreator.formattedJobJson);
@@ -78,6 +94,29 @@
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [showJsonFlyout]);
 
+  useEffect(
+    function fetchSchemasOnMount() {
+      fetchSchemas(jsonSchemaApi, '/_ml/anomaly_detectors/{job_id}', 'put')
+        .then((result) => {
+          setJobSchema(result);
+        })
+        .catch((e) => {
+          // eslint-disable-next-line no-console
+          console.error(e);
+        });
+
+      fetchSchemas(jsonSchemaApi, '/_ml/datafeeds/{datafeed_id}', 'put')
+        .then((result) => {
+          setDatafeedSchema(result);
+        })
+        .catch((e) => {
+          // eslint-disable-next-line no-console
+          console.error(e);
+        });
+    },
+    [jsonSchemaApi]
+  );
+
   const editJsonMode =
     jobEditorMode === EDITOR_MODE.EDITABLE || datafeedEditorMode === EDITOR_MODE.EDITABLE;
   const readOnlyMode =
@@ -160,7 +199,7 @@
       <EuiFlyout onClose={() => setShowJsonFlyout(false)} hideCloseButton size={'l'}>
-          {jobEditorMode !== EDITOR_MODE.HIDDEN && (
+          {jobEditorMode !== EDITOR_MODE.HIDDEN ? (
= ({ isDisabled, jobEditorMode, datafee
              })}
              value={jobConfigString}
              heightOffset={showChangedIndicesWarning ? WARNING_CALLOUT_OFFSET : 0}
+              schema={jobSchema}
            />
-          )}
-          {datafeedEditorMode !== EDITOR_MODE.HIDDEN && (
+          ) : null}
+          {datafeedEditorMode !== EDITOR_MODE.HIDDEN ? (
             <>
= ({ isDisabled, jobEditorMode, datafee
                })}
                value={datafeedConfigString}
                heightOffset={showChangedIndicesWarning ? WARNING_CALLOUT_OFFSET : 0}
+                schema={datafeedSchema}
              />
              {datafeedEditorMode === EDITOR_MODE.EDITABLE && (
@@ -191,7 +232,7 @@ export const JsonEditorFlyout: FC<Props> = ({ isDisabled, jobEditorMode, datafee
              )}
-          )}
+          ) : null}
 
           {showChangedIndicesWarning && (
             <>
@@ -274,7 +315,8 @@ const Contents: FC<{
   editJson: boolean;
   onChange(s: string): void;
   heightOffset?: number;
-}> = ({ title, value, editJson, onChange, heightOffset = 0 }) => {
+  schema?: object;
+}> = ({ title, value, editJson, onChange, heightOffset = 0, schema }) => {
   // the ace editor requires a fixed height
   const editorHeight = useMemo(
     () => `${window.innerHeight - 230 - heightOffset}px`,
);
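One nuance in `fetchSchemas` in json_editor_flyout.tsx above: lodash's `memoize` normally keys its cache on the first argument only, which here would be the API object. The second argument is a custom cache-key resolver, so the cache is keyed on `path + method` instead and each endpoint's schema is requested at most once per page load. A reduced sketch of the pattern (the `loadSchema` function and its URL are placeholders, not the plugin's real transport):

```ts
import { memoize } from 'lodash';

// Placeholder transport; the real code goes through the ML API service.
const loadSchema = async (path: string, method: string): Promise<object> => {
  const response = await fetch(
    `/example/json_schema?path=${encodeURIComponent(path)}&method=${method}`
  );
  return response.json();
};

// Without the resolver, memoize would key on `path` alone; with it, the cache
// key is `path + method`, and repeated calls share the first in-flight promise.
const loadSchemaOnce = memoize(loadSchema, (path, method) => path + method);

void loadSchemaOnce('/_ml/anomaly_detectors/{job_id}', 'put'); // issues the request
void loadSchemaOnce('/_ml/anomaly_detectors/{job_id}', 'put'); // served from cache
```

One trade-off worth knowing: `memoize` also caches a rejected promise, so a transient network failure would not be retried until the page reloads.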
diff --git a/x-pack/plugins/ml/public/application/services/ml_api_service/index.ts b/x-pack/plugins/ml/public/application/services/ml_api_service/index.ts
index d55668d6f6e650..c242b0e58832dd 100644
--- a/x-pack/plugins/ml/public/application/services/ml_api_service/index.ts
+++ b/x-pack/plugins/ml/public/application/services/ml_api_service/index.ts
@@ -8,6 +8,7 @@
 import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
 import { Observable } from 'rxjs';
 import type { HttpStart } from '@kbn/core/public';
+import { jsonSchemaProvider } from './json_schema';
 
 import { HttpService } from '../http_service';
 import { annotationsApiProvider } from './annotations';
@@ -732,5 +733,6 @@ export function mlApiServicesProvider(httpService: HttpService) {
     savedObjects: savedObjectsApiProvider(httpService),
     trainedModels: trainedModelsApiProvider(httpService),
     notifications: notificationsProvider(httpService),
+    jsonSchema: jsonSchemaProvider(httpService),
   };
 }
diff --git a/x-pack/plugins/ml/public/application/services/ml_api_service/json_schema.ts b/x-pack/plugins/ml/public/application/services/ml_api_service/json_schema.ts
new file mode 100644
index 00000000000000..aa630cd51d5cd7
--- /dev/null
+++ b/x-pack/plugins/ml/public/application/services/ml_api_service/json_schema.ts
@@ -0,0 +1,31 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { omitBy } from 'lodash';
+import { isDefined } from '@kbn/ml-is-defined';
+import { type SupportedPath } from '../../../../common/api_schemas/json_schema_schema';
+import { HttpService } from '../http_service';
+import { basePath } from '.';
+
+export interface GetSchemaDefinitionParams {
+  path: SupportedPath;
+  method: string;
+}
+
+export function jsonSchemaProvider(httpService: HttpService) {
+  const apiBasePath = basePath();
+
+  return {
+    getSchemaDefinition(params: GetSchemaDefinitionParams) {
+      return httpService.http({
+        path: `${apiBasePath}/json_schema`,
+        method: 'GET',
+        query: omitBy(params, (v) => !isDefined(v)),
+      });
+    },
+  };
+}
diff --git a/x-pack/plugins/ml/server/models/json_schema_service/index.ts b/x-pack/plugins/ml/server/models/json_schema_service/index.ts
new file mode 100644
index 00000000000000..348fb5b0c46eae
--- /dev/null
+++ b/x-pack/plugins/ml/server/models/json_schema_service/index.ts
@@ -0,0 +1,8 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export { JsonSchemaService } from './json_schema_service';
diff --git a/x-pack/plugins/ml/server/models/json_schema_service/json_schema_service.test.ts b/x-pack/plugins/ml/server/models/json_schema_service/json_schema_service.test.ts
new file mode 100644
index 00000000000000..c9c66ee5a554b8
--- /dev/null
+++ b/x-pack/plugins/ml/server/models/json_schema_service/json_schema_service.test.ts
@@ -0,0 +1,612 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { JsonSchemaService } from './json_schema_service';
+
+describe.skip('JsonSchemaService', function () {
+  test('extracts schema definition and applies overrides', async () => {
+    const service = new JsonSchemaService();
+
+    const result = await service.extractSchema('/_ml/anomaly_detectors/{job_id}', 'put');
+
+    expect(result).toEqual({
+      additionalProperties: false,
+      properties: {
+        allow_lazy_open: {
+          description:
+            'Advanced configuration option. Specifies whether this job can open when there is insufficient machine learning node capacity for it to be immediately assigned to a node. By default, if a machine learning node with capacity to run the job cannot immediately be found, the open anomaly detection jobs API returns an error. However, this is also subject to the cluster-wide `xpack.ml.max_lazy_ml_nodes` setting. If this option is set to true, the open anomaly detection jobs API does not return an error and the job waits in the opening state until sufficient machine learning node capacity is available.',
+          type: 'boolean',
+        },
+        analysis_config: {
+          additionalProperties: false,
+          description:
+            'Specifies how to analyze the data. After you create a job, you cannot change the analysis configuration; all the properties are informational.',
+          properties: {
+            bucket_span: {
+              description:
+                'The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`. This value should be either a whole number of days or equate to a\nwhole number of buckets in one day. 
If the anomaly detection job uses a datafeed with aggregations, this value must also be divisible by the interval of the date histogram aggregation.\n* @server_default 5m', + type: 'string', + }, + categorization_analyzer: { + anyOf: [ + { + type: 'string', + }, + { + additionalProperties: false, + properties: { + char_filter: { + description: + 'One or more character filters. In addition to the built-in character filters, other plugins can provide more character filters. If this property is not specified, no character filters are applied prior to categorization. If you are customizing some other aspect of the analyzer and you need to achieve the equivalent of `categorization_filters` (which are not permitted when some other aspect of the analyzer is customized), add them here as pattern replace character filters.', + items: { + anyOf: [ + { + type: 'string', + }, + { + anyOf: [{}, {}, {}, {}, {}], + }, + ], + }, + type: 'array', + }, + filter: { + description: + 'One or more token filters. In addition to the built-in token filters, other plugins can provide more token filters. If this property is not specified, no token filters are applied prior to categorization.', + items: { + anyOf: [ + { + type: 'string', + }, + { + anyOf: [ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + ], + }, + ], + }, + type: 'array', + }, + tokenizer: { + anyOf: [ + { + type: 'string', + }, + { + anyOf: [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}], + }, + ], + description: + 'The name or definition of the tokenizer to use after character filters are applied. This property is compulsory if `categorization_analyzer` is specified as an object. Machine learning provides a tokenizer called `ml_standard` that tokenizes in a way that has been determined to produce good categorization results on a variety of log file formats for logs in English. If you want to use that tokenizer but change the character or token filters, specify "tokenizer": "ml_standard" in your `categorization_analyzer`. Additionally, the `ml_classic` tokenizer is available, which tokenizes in the same way as the non-customizable tokenizer in old versions of the product (before 6.2). `ml_classic` was the default categorization tokenizer in versions 6.2 to 7.13, so if you need categorization identical to the default for jobs created in these versions, specify "tokenizer": "ml_classic" in your `categorization_analyzer`.', + }, + }, + type: 'object', + }, + ], + description: + 'If `categorization_field_name` is specified, you can also define the analyzer that is used to interpret the categorization field. This property cannot be used at the same time as `categorization_filters`. The categorization analyzer specifies how the `categorization_field` is interpreted by the categorization process. The `categorization_analyzer` field can be specified either as a string or as an object. If it is a string, it must refer to a built-in analyzer or one added by another plugin.', + }, + categorization_field_name: { + description: + 'If this property is specified, the values of the specified field will be categorized. 
The resulting categories must be used in a detector by setting `by_field_name`, `over_field_name`, or `partition_field_name` to the keyword `mlcategory`.', + type: 'string', + }, + categorization_filters: { + description: + 'If `categorization_field_name` is specified, you can also define optional filters. This property expects an array of regular expressions. The expressions are used to filter out matching sequences from the categorization field values. You can use this functionality to fine tune the categorization by excluding sequences from consideration when categories are defined. For example, you can exclude SQL statements that appear in your log files. This property cannot be used at the same time as `categorization_analyzer`. If you only want to define simple regular expression filters that are applied prior to tokenization, setting this property is the easiest method. If you also want to customize the tokenizer or post-tokenization filtering, use the `categorization_analyzer` property instead and include the filters as pattern_replace character filters. The effect is exactly the same.', + items: { + type: 'string', + }, + type: 'array', + }, + detectors: { + description: + 'Detector configuration objects specify which data fields a job analyzes. They also specify which analytical functions are used. You can specify multiple detectors for a job. If the detectors array does not contain at least one detector, no analysis can occur and an error is returned.', + items: { + additionalProperties: false, + properties: { + by_field_name: { + description: + 'The field used to split the data. In particular, this property is used for analyzing the splits with respect to their own history. It is used for finding unusual values in the context of the split.', + type: 'string', + }, + custom_rules: { + description: + 'Custom rules enable you to customize the way detectors operate. For example, a rule may dictate conditions under which results should be skipped. Kibana refers to custom rules as job rules.', + items: { + additionalProperties: false, + properties: { + actions: { + description: + 'The set of actions to be triggered when the rule applies. If more than one action is specified the effects of all actions are combined.', + items: { + enum: ['skip_result', 'skip_model_update'], + type: 'string', + }, + type: 'array', + }, + conditions: { + description: + 'An array of numeric conditions when the rule applies. A rule must either have a non-empty scope or at least one condition. Multiple conditions are combined together with a logical AND.', + items: { + additionalProperties: false, + properties: { + applies_to: { + description: + 'Specifies the result property to which the condition applies. If your detector uses `lat_long`, `metric`, `rare`, or `freq_rare` functions, you can only specify conditions that apply to time.', + enum: ['actual', 'typical', 'diff_from_typical', 'time'], + type: 'string', + }, + operator: { + description: + 'Specifies the condition operator. 
The available options are greater than, greater than or equals, less than, and less than or equals.', + enum: ['gt', 'gte', 'lt', 'lte'], + type: 'string', + }, + value: { + description: + 'The value that is compared against the `applies_to` field using the operator.', + type: 'number', + }, + }, + required: ['applies_to', 'operator', 'value'], + type: 'object', + }, + type: 'array', + }, + scope: { + additionalProperties: { + additionalProperties: false, + properties: { + filter_id: { + description: 'The identifier for the filter.', + type: 'string', + }, + filter_type: { + description: + 'If set to `include`, the rule applies for values in the filter. If set to `exclude`, the rule applies for values not in the filter.', + enum: ['include', 'exclude'], + type: 'string', + }, + }, + required: ['filter_id'], + type: 'object', + }, + description: + 'A scope of series where the rule applies. A rule must either have a non-empty scope or at least one condition. By default, the scope includes all series. Scoping is allowed for any of the fields that are also specified in `by_field_name`, `over_field_name`, or `partition_field_name`.', + type: 'object', + }, + }, + type: 'object', + }, + type: 'array', + }, + detector_description: { + description: 'A description of the detector.', + type: 'string', + }, + detector_index: { + description: + 'A unique identifier for the detector. This identifier is based on the order of the detectors in the `analysis_config`, starting at zero. If you specify a value for this property, it is ignored.', + type: 'number', + }, + exclude_frequent: { + description: + 'If set, frequent entities are excluded from influencing the anomaly results. Entities can be considered frequent over time or frequent in a population. If you are working with both over and by fields, you can set `exclude_frequent` to `all` for both fields, or to `by` or `over` for those specific fields.', + enum: ['all', 'none', 'by', 'over'], + type: 'string', + }, + field_name: { + description: + 'The field that the detector uses in the function. If you use an event rate function such as count or rare, do not specify this field. The `field_name` cannot contain double quotes or backslashes.', + type: 'string', + }, + function: { + description: + 'The analysis function that is used. For example, `count`, `rare`, `mean`, `min`, `max`, or `sum`.', + type: 'string', + }, + over_field_name: { + description: + 'The field used to split the data. In particular, this property is used for analyzing the splits with respect to the history of all splits. It is used for finding unusual values in the population of all splits.', + type: 'string', + }, + partition_field_name: { + description: + 'The field used to segment the analysis. When you use this property, you have completely independent baselines for each value of this field.', + type: 'string', + }, + use_null: { + description: + 'Defines whether a new series is used as the null series when there is no value for the by or partition fields.', + type: 'boolean', + }, + }, + required: ['function'], + type: 'object', + }, + type: 'array', + }, + influencers: { + description: + 'A comma separated list of influencer field names. Typically these can be the by, over, or partition fields that are used in the detector configuration. You might also want to use a field name that is not specifically named in a detector, but is available as part of the input data. 
When you use multiple detectors, the use of influencers is recommended as it aggregates results for each influencer entity.', + items: { + type: 'string', + }, + type: 'array', + }, + latency: { + anyOf: [ + { + type: 'string', + }, + { + type: 'number', + }, + ], + description: + 'The size of the window in which to expect data that is out of time order. If you specify a non-zero value, it must be greater than or equal to one second. NOTE: Latency is applicable only when you send data by using the post data API.', + }, + model_prune_window: { + anyOf: [ + { + type: 'string', + }, + { + type: 'number', + }, + ], + description: + 'Advanced configuration option. Affects the pruning of models that have not been updated for the given time duration. The value must be set to a multiple of the `bucket_span`. If set too low, important information may be removed from the model. For jobs created in 8.1 and later, the default value is the greater of `30d` or 20 times `bucket_span`.', + }, + multivariate_by_fields: { + description: + 'This functionality is reserved for internal use. It is not supported for use in customer environments and is not subject to the support SLA of official GA features. If set to `true`, the analysis will automatically find correlations between metrics for a given by field value and report anomalies when those correlations cease to hold. For example, suppose CPU and memory usage on host A is usually highly correlated with the same metrics on host B. Perhaps this correlation occurs because they are running a load-balanced application. If you enable this property, anomalies will be reported when, for example, CPU usage on host A is high and the value of CPU usage on host B is low. That is to say, you’ll see an anomaly when the CPU of host A is unusual given the CPU of host B. To use the `multivariate_by_fields` property, you must also specify `by_field_name` in your detector.', + type: 'boolean', + }, + per_partition_categorization: { + additionalProperties: false, + description: + 'Settings related to how categorization interacts with partition fields.', + properties: { + enabled: { + description: + 'To enable this setting, you must also set the `partition_field_name` property to the same value in every detector that uses the keyword `mlcategory`. Otherwise, job creation fails.', + type: 'boolean', + }, + stop_on_warn: { + description: + 'This setting can be set to true only if per-partition categorization is enabled. If true, both categorization and subsequent anomaly detection stops for partitions where the categorization status changes to warn. This setting makes it viable to have a job where it is expected that categorization works well for some partitions but not others; you do not pay the cost of bad categorization forever in the partitions where it works badly.', + type: 'boolean', + }, + }, + type: 'object', + }, + summary_count_field_name: { + description: + 'If this property is specified, the data that is fed to the job is expected to be pre-summarized. This property value is the name of the field that contains the count of raw data points that have been summarized. The same `summary_count_field_name` applies to all detectors in the job. 
NOTE: The `summary_count_field_name` property cannot be used with the `metric` function.', + type: 'string', + }, + }, + required: ['bucket_span', 'detectors'], + type: 'object', + }, + analysis_limits: { + additionalProperties: false, + description: + 'Limits can be applied for the resources required to hold the mathematical models in memory. These limits are approximate and can be set per job. They do not control the memory used by other processes, for example the Elasticsearch Java processes.', + properties: { + categorization_examples_limit: { + description: + 'The maximum number of examples stored per category in memory and in the results data store. If you increase this value, more examples are available, however it requires that you have more storage available. If you set this value to 0, no examples are stored. NOTE: The `categorization_examples_limit` applies only to analysis that uses categorization.', + type: 'number', + }, + model_memory_limit: { + description: + 'The approximate maximum amount of memory resources that are required for analytical processing. Once this limit is approached, data pruning becomes more aggressive. Upon exceeding this limit, new entities are not modeled. If the `xpack.ml.max_model_memory_limit` setting has a value greater than 0 and less than 1024mb, that value is used instead of the default. The default value is relatively small to ensure that high resource usage is a conscious decision. If you have jobs that are expected to analyze high cardinality fields, you will likely need to use a higher value. If you specify a number instead of a string, the units are assumed to be MiB. Specifying a string is recommended for clarity. If you specify a byte size unit of `b` or `kb` and the number does not equate to a discrete number of megabytes, it is rounded down to the closest MiB. The minimum valid value is 1 MiB. If you specify a value less than 1 MiB, an error occurs. If you specify a value for the `xpack.ml.max_model_memory_limit` setting, an error occurs when you try to create jobs that have `model_memory_limit` values greater than that setting value.', + type: 'string', + }, + }, + type: 'object', + }, + background_persist_interval: { + anyOf: [ + { + type: 'string', + }, + { + type: 'number', + }, + ], + description: + 'Advanced configuration option. The time between each periodic persistence of the model. The default value is a randomized value between 3 to 4 hours, which avoids all jobs persisting at exactly the same time. The smallest allowed value is 1 hour. For very large models (several GB), persistence could take 10-20 minutes, so do not set the `background_persist_interval` value too low.', + }, + custom_settings: { + description: 'Advanced configuration option. Contains custom meta data about the job.', + }, + daily_model_snapshot_retention_after_days: { + description: + 'Advanced configuration option, which affects the automatic removal of old model snapshots for this job. It specifies a period of time (in days) after which only the first snapshot per day is retained. This period is relative to the timestamp of the most recent snapshot for this job. Valid values range from 0 to `model_snapshot_retention_days`.', + type: 'number', + }, + data_description: { + additionalProperties: false, + description: + 'Defines the format of the input data when you send data to the job by using the post data API. Note that when configure a datafeed, these properties are automatically set. 
When data is received via the post data API, it is not stored in Elasticsearch. Only the results for anomaly detection are retained.', + properties: { + field_delimiter: { + type: 'string', + }, + format: { + description: 'Only JSON format is supported at this time.', + type: 'string', + }, + time_field: { + description: 'The name of the field that contains the timestamp.', + type: 'string', + }, + time_format: { + description: + "The time format, which can be `epoch`, `epoch_ms`, or a custom pattern. The value `epoch` refers to UNIX or Epoch time (the number of seconds since 1 Jan 1970). The value `epoch_ms` indicates that time is measured in milliseconds since the epoch. The `epoch` and `epoch_ms` time formats accept either integer or real values. Custom patterns must conform to the Java DateTimeFormatter class. When you use date-time formatting patterns, it is recommended that you provide the full date, time and time zone. For example: `yyyy-MM-dd'T'HH:mm:ssX`. If the pattern that you specify is not sufficient to produce a complete timestamp, job creation fails.", + type: 'string', + }, + }, + type: 'object', + }, + datafeed_config: { + additionalProperties: false, + description: + 'Defines a datafeed for the anomaly detection job. If Elasticsearch security features are enabled, your datafeed remembers which roles the user who created it had at the time of creation and runs the query using those same roles. If you provide secondary authorization headers, those credentials are used instead.', + properties: { + aggregations: { + additionalProperties: {}, + description: + 'If set, the datafeed performs aggregation searches. Support for aggregations is limited and should be used only with low cardinality data.', + type: 'object', + }, + aggs: { + additionalProperties: {}, + description: + 'If set, the datafeed performs aggregation searches. Support for aggregations is limited and should be used only with low cardinality data.', + type: 'object', + }, + chunking_config: { + additionalProperties: false, + description: + 'Datafeeds might be required to search over long time periods, for several months or years. This search is split into time chunks in order to ensure the load on Elasticsearch is managed. Chunking configuration controls how the size of these time chunks are calculated and is an advanced configuration option.', + properties: { + mode: { + description: + 'If the mode is `auto`, the chunk size is dynamically calculated;\nthis is the recommended value when the datafeed does not use aggregations.\nIf the mode is `manual`, chunking is applied according to the specified `time_span`;\nuse this mode when the datafeed uses aggregations. If the mode is `off`, no chunking is applied.', + enum: ['auto', 'manual', 'off'], + type: 'string', + }, + time_span: { + anyOf: [ + { + type: 'string', + }, + { + type: 'number', + }, + ], + description: + 'The time span that each search will be querying. This setting is applicable only when the `mode` is set to `manual`.', + }, + }, + required: ['mode'], + type: 'object', + }, + datafeed_id: { + description: + 'A numerical character string that uniquely identifies the datafeed. This identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start and end with alphanumeric characters. 
The default value is the job identifier.', + type: 'string', + }, + delayed_data_check_config: { + additionalProperties: false, + description: + 'Specifies whether the datafeed checks for missing data and the size of the window. The datafeed can optionally search over indices that have already been read in an effort to determine whether any data has subsequently been added to the index. If missing data is found, it is a good indication that the `query_delay` option is set too low and the data is being indexed after the datafeed has passed that moment in time. This check runs only on real-time datafeeds.', + properties: { + check_window: { + anyOf: [ + { + type: 'string', + }, + { + type: 'number', + }, + ], + description: + 'The window of time that is searched for late data. This window of time ends with the latest finalized bucket.\nIt defaults to null, which causes an appropriate `check_window` to be calculated when the real-time datafeed runs.\nIn particular, the default `check_window` span calculation is based on the maximum of `2h` or `8 * bucket_span`.', + }, + enabled: { + description: + 'Specifies whether the datafeed periodically checks for delayed data.', + type: 'boolean', + }, + }, + required: ['enabled'], + type: 'object', + }, + frequency: { + description: + 'The interval at which scheduled queries are made while the datafeed runs in real time. The default value is either the bucket span for short bucket spans, or, for longer bucket spans, a sensible fraction of the bucket span. For example: `150s`. When `frequency` is shorter than the bucket span, interim results for the last (partial) bucket are written then eventually overwritten by the full bucket results. If the datafeed uses aggregations, this value must be divisible by the interval of the date histogram aggregation.', + type: 'string', + }, + indexes: { + items: { + type: 'string', + }, + type: 'array', + }, + indices: { + description: + 'An array of index names. Wildcards are supported. If any indices are in remote clusters, the machine learning nodes must have the `remote_cluster_client` role.', + items: { + type: 'string', + }, + type: 'array', + }, + indices_options: { + description: 'Specifies index expansion options that are used during search.', + }, + job_id: { + type: 'string', + }, + max_empty_searches: { + description: + 'If a real-time datafeed has never seen any data (including during any initial training period) then it will automatically stop itself and close its associated job after this many real-time searches that return no documents. In other words, it will stop after `frequency` times `max_empty_searches` of real-time operation. If not set then a datafeed with no end time that sees no data will remain started until it is explicitly stopped.', + type: 'number', + }, + query: { + description: + 'The Elasticsearch query domain-specific language (DSL). This value corresponds to the query object in an Elasticsearch search POST body. All the options that are supported by Elasticsearch can be used, as this object is passed verbatim to Elasticsearch.', + }, + query_delay: { + description: + 'The number of seconds behind real time that data is queried. For example, if data from 10:04 a.m. might not be searchable in Elasticsearch until 10:06 a.m., set this property to 120 seconds. The default value is randomly selected between `60s` and `120s`. 
This randomness improves the query performance when there are multiple jobs running on the same node.', + type: 'string', + }, + runtime_mappings: { + additionalProperties: { + anyOf: [ + {}, + { + items: {}, + type: 'array', + }, + ], + }, + description: 'Specifies runtime fields for the datafeed search.', + type: 'object', + }, + script_fields: { + additionalProperties: {}, + description: + 'Specifies scripts that evaluate custom expressions and returns script fields to the datafeed. The detector configuration objects in a job can contain functions that use these script fields.', + type: 'object', + }, + scroll_size: { + description: + 'The size parameter that is used in Elasticsearch searches when the datafeed does not use aggregations. The maximum value is the value of `index.max_result_window`, which is 10,000 by default.', + type: 'number', + }, + }, + required: ['indices', 'query'], + type: 'object', + }, + description: { + description: 'A description of the job.', + type: 'string', + }, + groups: { + description: 'A list of job groups. A job can belong to no groups or many.', + items: { + type: 'string', + }, + type: 'array', + }, + job_id: { + description: 'Identifier for the anomaly detection job.', + type: 'string', + }, + model_plot_config: { + additionalProperties: false, + description: + 'This advanced configuration option stores model information along with the results. It provides a more detailed view into anomaly detection. If you enable model plot it can add considerable overhead to the performance of the system; it is not feasible for jobs with many entities. Model plot provides a simplified and indicative view of the model and its bounds. It does not display complex features such as multivariate correlations or multimodal data. As such, anomalies may occasionally be reported which cannot be seen in the model plot. Model plot config can be configured when the job is created or updated later. It must be disabled if performance issues are experienced.', + properties: { + annotations_enabled: { + description: + 'If true, enables calculation and storage of the model change annotations for each entity that is being analyzed.', + type: 'boolean', + }, + enabled: { + description: + 'If true, enables calculation and storage of the model bounds for each entity that is being analyzed.', + type: 'boolean', + }, + terms: { + description: + 'Limits data collection to this comma separated list of partition or by field values. If terms are not specified or it is an empty string, no filtering is applied. Wildcards are not supported. Only the specified terms can be viewed when using the Single Metric Viewer.', + type: 'string', + }, + }, + type: 'object', + }, + model_snapshot_retention_days: { + description: + 'Advanced configuration option, which affects the automatic removal of old model snapshots for this job. It specifies the maximum period of time (in days) that snapshots are retained. This period is relative to the timestamp of the most recent snapshot for this job. By default, snapshots ten days older than the newest snapshot are deleted.', + type: 'number', + }, + renormalization_window_days: { + description: + 'Advanced configuration option. The period over which adjustments to the score are applied, as new data is seen. The default value is the longer of 30 days or 100 bucket spans.', + type: 'number', + }, + results_index_name: { + description: + 'A text string that affects the name of the machine learning results index. 
By default, the job generates an index named `.ml-anomalies-shared`.',
+          type: 'string',
+        },
+        results_retention_days: {
+          description:
+            'Advanced configuration option. The period of time (in days) that results are retained. Age is calculated relative to the timestamp of the latest bucket result. If this property has a non-null value, once per day at 00:30 (server time), results that are the specified number of days older than the latest bucket result are deleted from Elasticsearch. The default value is null, which means all results are retained. Annotations generated by the system also count as results for retention purposes; they are deleted after the same number of days as results. Annotations added by users are retained forever.',
+          type: 'number',
+        },
+      },
+      required: ['analysis_config', 'data_description'],
+      type: 'object',
+    });
+  });
+});
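The service below resolves an OpenAPI request-body schema into the self-contained document the test above expects: every `$ref` pointing into `components.schemas` is deleted and replaced with a copy of the referenced component, recursively. A compact sketch of that core idea under simplified types (a hypothetical helper, not the class's exact code):

```ts
type JsonNode = { $ref?: string; [key: string]: unknown };

function inlineRefs(node: JsonNode, components: Record<string, JsonNode>): void {
  if (typeof node.$ref === 'string') {
    // '#/components/schemas/Foo' -> 'Foo'
    const refKey = node.$ref.split('/').pop()!;
    delete node.$ref;
    // Merge the referenced component into this node. Components used from several
    // places get duplicated, and a self-referencing component would recurse
    // forever; that is why the class guards on the Ml_Types_ prefix.
    Object.assign(node, components[refKey]);
  }
  for (const value of Object.values(node)) {
    if (Array.isArray(value)) {
      value.forEach((v) => {
        if (v && typeof v === 'object') inlineRefs(v as JsonNode, components);
      });
    } else if (value && typeof value === 'object') {
      inlineRefs(value as JsonNode, components);
    }
  }
}
```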
diff --git a/x-pack/plugins/ml/server/models/json_schema_service/json_schema_service.ts b/x-pack/plugins/ml/server/models/json_schema_service/json_schema_service.ts
new file mode 100644
index 00000000000000..ca6e2a0b18b485
--- /dev/null
+++ b/x-pack/plugins/ml/server/models/json_schema_service/json_schema_service.ts
@@ -0,0 +1,151 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import Fs from 'fs';
+import Path from 'path';
+import { type SupportedPath } from '../../../common/api_schemas/json_schema_schema';
+import { jsonSchemaOverrides } from './schema_overrides';
+import { type PropertyDefinition } from './types';
+
+const supportedEndpoints = [
+  {
+    path: '/_ml/anomaly_detectors/{job_id}' as const,
+    method: 'put',
+  },
+  {
+    path: '/_ml/datafeeds/{datafeed_id}' as const,
+    method: 'put',
+  },
+];
+
+/**
+ * Service that resolves JSON schema definitions for the supported Elasticsearch
+ * API endpoints from the bundled openapi.json specification.
+ */
+export class JsonSchemaService {
+  /**
+   * Dictionary with the schema components
+   * @private
+   */
+  private _schemaComponents: Record<string, object> = {};
+
+  private _requiredComponents: Set<string> = new Set();
+
+  /**
+   * Recursively resolves a properties definition, inlining referenced schema components.
+   * @private
+   */
+  private extractProperties(propertyDef: PropertyDefinition): void {
+    for (const key in propertyDef) {
+      if (propertyDef.hasOwnProperty(key)) {
+        const propValue = propertyDef[key as keyof PropertyDefinition]!;
+
+        if (key === '$ref') {
+          const comp = (propValue as string).split('/');
+          const refKey = comp[comp.length - 1];
+
+          delete propertyDef.$ref;
+
+          // FIXME there is an issue with the maximum call stack size exceeded
+          if (!refKey.startsWith('Ml_Types_')) return;
+
+          const schemaComponent = this._schemaComponents[refKey];
+
+          this._requiredComponents.add(refKey);
+
+          Object.assign(propertyDef, schemaComponent);
+
+          this.extractProperties(propertyDef);
+        }
+
+        if (Array.isArray(propValue)) {
+          propValue.forEach((v) => {
+            if (typeof v === 'object') {
+              this.extractProperties(v);
+            }
+          });
+        } else if (typeof propValue === 'object') {
+          this.extractProperties(propValue as PropertyDefinition);
+        }
+      }
+    }
+  }
+
+  private applyOverrides(path: SupportedPath, schema: PropertyDefinition): PropertyDefinition {
+    const overrides = jsonSchemaOverrides[path];
+    return {
+      ...schema,
+      ...overrides,
+      properties: {
+        ...schema.properties,
+        ...overrides.properties,
+      },
+    };
+  }
+
+  /**
+   * Extracts the resolved schema definition for the requested path and method.
+   * @param path
+   * @param method
+   */
+  public async extractSchema(path: SupportedPath, method: string, schema?: object) {
+    const fileContent =
+      schema ?? JSON.parse(Fs.readFileSync(Path.resolve(__dirname, 'openapi.json'), 'utf8'));
+
+    const definition = fileContent.paths[path][method];
+
+    if (!definition) {
+      throw new Error('Schema definition is not defined');
+    }
+
+    const bodySchema = definition.requestBody.content['application/json'].schema;
+
+    this._schemaComponents = fileContent.components.schemas;
+
+    this.extractProperties(bodySchema);
+
+    return this.applyOverrides(path, bodySchema);
+  }
+
+  /**
+   * Generates the openapi.json file, removing redundant content.
+   * Only used internally via a node command.
+   */
+  public async generateSchemaFile() {
+    const schema = JSON.parse(
+      Fs.readFileSync(Path.resolve(__dirname, 'openapi_source.json'), 'utf8')
+    );
+
+    supportedEndpoints.forEach((e) => {
+      // need to extract schema in order to keep required components
+      this.extractSchema(e.path, e.method, schema);
+    });
+
+    for (const pathName in schema.paths) {
+      if (!schema.paths.hasOwnProperty(pathName)) continue;
+
+      const supportedEndpoint = supportedEndpoints.find((v) => v.path === pathName);
+      if (supportedEndpoint) {
+        for (const methodName in schema.paths[pathName]) {
+          if (methodName !== supportedEndpoint.method) {
+            delete schema.paths[pathName][methodName];
+          }
+        }
+      } else {
+        delete schema.paths[pathName];
+      }
+    }
+
+    const components = schema.components.schemas;
+    for (const componentName in components) {
+      if (!this._requiredComponents.has(componentName)) {
+        delete components[componentName];
+      }
+    }
+
+    Fs.writeFileSync(Path.resolve(__dirname, 'openapi.json'), JSON.stringify(schema, null, 2));
+  }
+}
diff --git a/x-pack/plugins/ml/server/models/json_schema_service/openapi.json b/x-pack/plugins/ml/server/models/json_schema_service/openapi.json
new file mode 100644
index 00000000000000..202e641e9494f9
--- /dev/null
+++ b/x-pack/plugins/ml/server/models/json_schema_service/openapi.json
@@ -0,0 +1,1235 @@
+{
+  "openapi": "3.0.3",
+  "info": {
+    "title": "Elasticsearch specification",
+    "version": "8.3.0"
+  },
+  "paths": {
+    "/_ml/datafeeds/{datafeed_id}": {
+      "put": {
+        "operationId": "ml.put_datafeed",
+        "description": "Instantiates a datafeed.\nDatafeeds retrieve data from Elasticsearch for analysis by an anomaly detection job.\nYou can associate only one datafeed with each anomaly detection job.\nThe datafeed contains a query that runs at a defined interval (`frequency`).\nIf you are concerned about delayed data, you can add a delay (`query_delay`) at each interval.\nWhen Elasticsearch security features are enabled, your datafeed remembers which roles the user who created it had\nat the time of creation and runs the query using those same roles. If you provide secondary authorization headers,\nthose credentials are used instead.\nYou must use Kibana, this API, or the create anomaly detection jobs API to create a datafeed. Do not add a datafeed\ndirectly to the `.ml-config` index. Do not give users `write` privileges on the `.ml-config` index.",
+        "parameters": [
+          {
+            "name": "datafeed_id",
+            "in": "path",
+            "required": true,
+            "schema": {
+              "type": "string"
+            },
+            "description": "A numerical character string that uniquely identifies the datafeed.\nThis identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores.\nIt must start and end with alphanumeric characters."
+ }, + { + "name": "allow_no_indices", + "in": "query", + "schema": { + "type": "boolean" + }, + "description": "If true, wildcard indices expressions that resolve into no concrete indices are ignored. This includes the `_all`\nstring or when no indices are specified." + }, + { + "name": "expand_wildcards", + "in": "query", + "schema": { + "anyOf": [ + { + "type": "string", + "enum": [ + "all", + "open", + "closed", + "hidden", + "none" + ] + }, + { + "type": "array", + "items": { + "type": "string", + "enum": [ + "all", + "open", + "closed", + "hidden", + "none" + ] + } + } + ] + }, + "description": "Type of index that wildcard patterns can match. If the request can target data streams, this argument determines\nwhether wildcard expressions match hidden data streams. Supports comma-separated values." + }, + { + "name": "ignore_throttled", + "in": "query", + "schema": { + "type": "boolean" + }, + "description": "If true, concrete, expanded, or aliased indices are ignored when frozen." + }, + { + "name": "ignore_unavailable", + "in": "query", + "schema": { + "type": "boolean" + }, + "description": "If true, unavailable indices (missing or closed) are ignored." + } + ], + "tags": [ + "ml" + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "aggregations": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_Aggregations_AggregationContainer" + }, + "description": "If set, the datafeed performs aggregation searches.\nSupport for aggregations is limited and should be used only with low cardinality data." + }, + "chunking_config": { + "$ref": "#/components/schemas/Ml_Types_ChunkingConfig", + "description": "Datafeeds might be required to search over long time periods, for several months or years.\nThis search is split into time chunks in order to ensure the load on Elasticsearch is managed.\nChunking configuration controls how the size of these time chunks are calculated;\nit is an advanced configuration option." + }, + "delayed_data_check_config": { + "$ref": "#/components/schemas/Ml_Types_DelayedDataCheckConfig", + "description": "Specifies whether the datafeed checks for missing data and the size of the window.\nThe datafeed can optionally search over indices that have already been read in an effort to determine whether\nany data has subsequently been added to the index. If missing data is found, it is a good indication that the\n`query_delay` is set too low and the data is being indexed after the datafeed has passed that moment in time.\nThis check runs only on real-time datafeeds." + }, + "frequency": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "The interval at which scheduled queries are made while the datafeed runs in real time.\nThe default value is either the bucket span for short bucket spans, or, for longer bucket spans, a sensible\nfraction of the bucket span. When `frequency` is shorter than the bucket span, interim results for the last\n(partial) bucket are written then eventually overwritten by the full bucket results. If the datafeed uses\naggregations, this value must be divisible by the interval of the date histogram aggregation." + }, + "indices": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "An array of index names. Wildcards are supported. 
If any of the indices are in remote clusters, the machine\nlearning nodes must have the `remote_cluster_client` role." + }, + "indexes": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "An array of index names. Wildcards are supported. If any of the indices are in remote clusters, the machine\nlearning nodes must have the `remote_cluster_client` role." + }, + "indices_options": { + "$ref": "#/components/schemas/Types_IndicesOptions", + "description": "Specifies index expansion options that are used during search" + }, + "job_id": { + "type": "string", + "description": "Identifier for the anomaly detection job." + }, + "max_empty_searches": { + "type": "number", + "description": "If a real-time datafeed has never seen any data (including during any initial training period), it automatically\nstops and closes the associated job after this many real-time searches return no documents. In other words,\nit stops after `frequency` times `max_empty_searches` of real-time operation. If not set, a datafeed with no\nend time that sees no data remains started until it is explicitly stopped. By default, it is not set." + }, + "query": { + "$ref": "#/components/schemas/Types_QueryDsl_QueryContainer", + "description": "The Elasticsearch query domain-specific language (DSL). This value corresponds to the query object in an\nElasticsearch search POST body. All the options that are supported by Elasticsearch can be used, as this\nobject is passed verbatim to Elasticsearch." + }, + "query_delay": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "The number of seconds behind real time that data is queried. For example, if data from 10:04 a.m. might\nnot be searchable in Elasticsearch until 10:06 a.m., set this property to 120 seconds. The default\nvalue is randomly selected between `60s` and `120s`. This randomness improves the query performance\nwhen there are multiple jobs running on the same node." + }, + "runtime_mappings": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "$ref": "#/components/schemas/Types_Mapping_RuntimeField" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/Types_Mapping_RuntimeField" + } + } + ] + }, + "description": "Specifies runtime fields for the datafeed search." + }, + "script_fields": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_ScriptField" + }, + "description": "Specifies scripts that evaluate custom expressions and returns script fields to the datafeed.\nThe detector configuration objects in a job can contain functions that use these script fields." + }, + "scroll_size": { + "type": "number", + "description": "The size parameter that is used in Elasticsearch searches when the datafeed does not use aggregations.\nThe maximum value is the value of `index.max_result_window`, which is 10,000 by default." 
+ }, + "headers": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + }, + "additionalProperties": false + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "aggregations": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_Aggregations_AggregationContainer" + } + }, + "chunking_config": { + "$ref": "#/components/schemas/Ml_Types_ChunkingConfig" + }, + "delayed_data_check_config": { + "$ref": "#/components/schemas/Ml_Types_DelayedDataCheckConfig" + }, + "datafeed_id": { + "type": "string" + }, + "frequency": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "indices": { + "type": "array", + "items": { + "type": "string" + } + }, + "job_id": { + "type": "string" + }, + "indices_options": { + "$ref": "#/components/schemas/Types_IndicesOptions" + }, + "max_empty_searches": { + "type": "number" + }, + "query": { + "$ref": "#/components/schemas/Types_QueryDsl_QueryContainer" + }, + "query_delay": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "runtime_mappings": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "$ref": "#/components/schemas/Types_Mapping_RuntimeField" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/Types_Mapping_RuntimeField" + } + } + ] + } + }, + "script_fields": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_ScriptField" + } + }, + "scroll_size": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "aggregations", + "chunking_config", + "datafeed_id", + "frequency", + "indices", + "job_id", + "max_empty_searches", + "query", + "query_delay", + "scroll_size" + ] + } + } + } + } + } + } + }, + "/_ml/anomaly_detectors/{job_id}": { + "put": { + "operationId": "ml.put_job", + "description": "Instantiates an anomaly detection job. If you include a `datafeed_config`, you must have read index privileges on the source index.", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + }, + "description": "The identifier for the anomaly detection job. This identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start and end with alphanumeric characters." + } + ], + "tags": [ + "ml" + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "allow_lazy_open": { + "type": "boolean", + "description": "Advanced configuration option. Specifies whether this job can open when there is insufficient machine learning node capacity for it to be immediately assigned to a node. By default, if a machine learning node with capacity to run the job cannot immediately be found, the open anomaly detection jobs API returns an error. However, this is also subject to the cluster-wide `xpack.ml.max_lazy_ml_nodes` setting. If this option is set to true, the open anomaly detection jobs API does not return an error and the job waits in the opening state until sufficient machine learning node capacity is available." + }, + "analysis_config": { + "$ref": "#/components/schemas/Ml_Types_AnalysisConfig", + "description": "Specifies how to analyze the data. 
After you create a job, you cannot change the analysis configuration; all the properties are informational." + }, + "analysis_limits": { + "$ref": "#/components/schemas/Ml_Types_AnalysisLimits", + "description": "Limits can be applied for the resources required to hold the mathematical models in memory. These limits are approximate and can be set per job. They do not control the memory used by other processes, for example the Elasticsearch Java processes." + }, + "background_persist_interval": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "Advanced configuration option. The time between each periodic persistence of the model. The default value is a randomized value between 3 to 4 hours, which avoids all jobs persisting at exactly the same time. The smallest allowed value is 1 hour. For very large models (several GB), persistence could take 10-20 minutes, so do not set the `background_persist_interval` value too low." + }, + "custom_settings": { + "description": "Advanced configuration option. Contains custom meta data about the job." + }, + "daily_model_snapshot_retention_after_days": { + "type": "number", + "description": "Advanced configuration option, which affects the automatic removal of old model snapshots for this job. It specifies a period of time (in days) after which only the first snapshot per day is retained. This period is relative to the timestamp of the most recent snapshot for this job. Valid values range from 0 to `model_snapshot_retention_days`." + }, + "data_description": { + "$ref": "#/components/schemas/Ml_Types_DataDescription", + "description": "Defines the format of the input data when you send data to the job by using the post data API. Note that when configure a datafeed, these properties are automatically set. When data is received via the post data API, it is not stored in Elasticsearch. Only the results for anomaly detection are retained." + }, + "datafeed_config": { + "$ref": "#/components/schemas/Ml_Types_DatafeedConfig", + "description": "Defines a datafeed for the anomaly detection job. If Elasticsearch security features are enabled, your datafeed remembers which roles the user who created it had at the time of creation and runs the query using those same roles. If you provide secondary authorization headers, those credentials are used instead." + }, + "description": { + "type": "string", + "description": "A description of the job." + }, + "groups": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of job groups. A job can belong to no groups or many." + }, + "model_plot_config": { + "$ref": "#/components/schemas/Ml_Types_ModelPlotConfig", + "description": "This advanced configuration option stores model information along with the results. It provides a more detailed view into anomaly detection. If you enable model plot it can add considerable overhead to the performance of the system; it is not feasible for jobs with many entities. Model plot provides a simplified and indicative view of the model and its bounds. It does not display complex features such as multivariate correlations or multimodal data. As such, anomalies may occasionally be reported which cannot be seen in the model plot. Model plot config can be configured when the job is created or updated later. It must be disabled if performance issues are experienced." 
+ }, + "model_snapshot_retention_days": { + "type": "number", + "description": "Advanced configuration option, which affects the automatic removal of old model snapshots for this job. It specifies the maximum period of time (in days) that snapshots are retained. This period is relative to the timestamp of the most recent snapshot for this job. By default, snapshots ten days older than the newest snapshot are deleted." + }, + "renormalization_window_days": { + "type": "number", + "description": "Advanced configuration option. The period over which adjustments to the score are applied, as new data is seen. The default value is the longer of 30 days or 100 bucket spans." + }, + "results_index_name": { + "type": "string", + "description": "A text string that affects the name of the machine learning results index. By default, the job generates an index named `.ml-anomalies-shared`." + }, + "results_retention_days": { + "type": "number", + "description": "Advanced configuration option. The period of time (in days) that results are retained. Age is calculated relative to the timestamp of the latest bucket result. If this property has a non-null value, once per day at 00:30 (server time), results that are the specified number of days older than the latest bucket result are deleted from Elasticsearch. The default value is null, which means all results are retained. Annotations generated by the system also count as results for retention purposes; they are deleted after the same number of days as results. Annotations added by users are retained forever." + } + }, + "additionalProperties": false, + "required": [ + "analysis_config", + "background_persist_interval", + "data_description" + ] + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "allow_lazy_open": { + "type": "boolean" + }, + "analysis_config": { + "$ref": "#/components/schemas/Ml_Types_AnalysisConfigRead" + }, + "analysis_limits": { + "$ref": "#/components/schemas/Ml_Types_AnalysisLimits" + }, + "background_persist_interval": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ] + }, + "create_time": { + "type": "string" + }, + "custom_settings": { + "description": "User provided value" + }, + "daily_model_snapshot_retention_after_days": { + "type": "number" + }, + "data_description": { + "$ref": "#/components/schemas/Ml_Types_DataDescription" + }, + "datafeed_config": { + "$ref": "#/components/schemas/Ml_Types_Datafeed" + }, + "description": { + "type": "string" + }, + "groups": { + "type": "array", + "items": { + "type": "string" + } + }, + "job_id": { + "type": "string" + }, + "job_type": { + "type": "string" + }, + "job_version": { + "type": "string" + }, + "model_plot_config": { + "$ref": "#/components/schemas/Ml_Types_ModelPlotConfig" + }, + "model_snapshot_id": { + "type": "string" + }, + "model_snapshot_retention_days": { + "type": "number" + }, + "renormalization_window_days": { + "type": "number" + }, + "results_index_name": { + "type": "string" + }, + "results_retention_days": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "allow_lazy_open", + "analysis_config", + "analysis_limits", + "create_time", + "daily_model_snapshot_retention_after_days", + "data_description", + "job_id", + "job_type", + "job_version", + "model_snapshot_retention_days", + "results_index_name" + ] + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Ml_Types_AnalysisConfig": { + 
"type": "object", + "properties": { + "bucket_span": { + "type": "string", + "description": "The size of the interval that the analysis is aggregated into, typically between `5m` and `1h`. This value should be either a whole number of days or equate to a\nwhole number of buckets in one day. If the anomaly detection job uses a datafeed with aggregations, this value must also be divisible by the interval of the date histogram aggregation.\n* @server_default 5m" + }, + "categorization_analyzer": { + "anyOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/Ml_Types_CategorizationAnalyzerDefinition" + } + ], + "description": "If `categorization_field_name` is specified, you can also define the analyzer that is used to interpret the categorization field. This property cannot be used at the same time as `categorization_filters`. The categorization analyzer specifies how the `categorization_field` is interpreted by the categorization process. The `categorization_analyzer` field can be specified either as a string or as an object. If it is a string, it must refer to a built-in analyzer or one added by another plugin." + }, + "categorization_field_name": { + "type": "string", + "description": "If this property is specified, the values of the specified field will be categorized. The resulting categories must be used in a detector by setting `by_field_name`, `over_field_name`, or `partition_field_name` to the keyword `mlcategory`." + }, + "categorization_filters": { + "type": "array", + "items": { + "type": "string" + }, + "description": "If `categorization_field_name` is specified, you can also define optional filters. This property expects an array of regular expressions. The expressions are used to filter out matching sequences from the categorization field values. You can use this functionality to fine tune the categorization by excluding sequences from consideration when categories are defined. For example, you can exclude SQL statements that appear in your log files. This property cannot be used at the same time as `categorization_analyzer`. If you only want to define simple regular expression filters that are applied prior to tokenization, setting this property is the easiest method. If you also want to customize the tokenizer or post-tokenization filtering, use the `categorization_analyzer` property instead and include the filters as pattern_replace character filters. The effect is exactly the same." + }, + "detectors": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Ml_Types_Detector" + }, + "description": "Detector configuration objects specify which data fields a job analyzes. They also specify which analytical functions are used. You can specify multiple detectors for a job. If the detectors array does not contain at least one detector, no analysis can occur and an error is returned." + }, + "influencers": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A comma separated list of influencer field names. Typically these can be the by, over, or partition fields that are used in the detector configuration. You might also want to use a field name that is not specifically named in a detector, but is available as part of the input data. When you use multiple detectors, the use of influencers is recommended as it aggregates results for each influencer entity." 
+ }, + "latency": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "The size of the window in which to expect data that is out of time order. If you specify a non-zero value, it must be greater than or equal to one second. NOTE: Latency is applicable only when you send data by using the post data API." + }, + "model_prune_window": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "Advanced configuration option. Affects the pruning of models that have not been updated for the given time duration. The value must be set to a multiple of the `bucket_span`. If set too low, important information may be removed from the model. For jobs created in 8.1 and later, the default value is the greater of `30d` or 20 times `bucket_span`." + }, + "multivariate_by_fields": { + "type": "boolean", + "description": "This functionality is reserved for internal use. It is not supported for use in customer environments and is not subject to the support SLA of official GA features. If set to `true`, the analysis will automatically find correlations between metrics for a given by field value and report anomalies when those correlations cease to hold. For example, suppose CPU and memory usage on host A is usually highly correlated with the same metrics on host B. Perhaps this correlation occurs because they are running a load-balanced application. If you enable this property, anomalies will be reported when, for example, CPU usage on host A is high and the value of CPU usage on host B is low. That is to say, you’ll see an anomaly when the CPU of host A is unusual given the CPU of host B. To use the `multivariate_by_fields` property, you must also specify `by_field_name` in your detector." + }, + "per_partition_categorization": { + "$ref": "#/components/schemas/Ml_Types_PerPartitionCategorization", + "description": "Settings related to how categorization interacts with partition fields." + }, + "summary_count_field_name": { + "type": "string", + "description": "If this property is specified, the data that is fed to the job is expected to be pre-summarized. This property value is the name of the field that contains the count of raw data points that have been summarized. The same `summary_count_field_name` applies to all detectors in the job. NOTE: The `summary_count_field_name` property cannot be used with the `metric` function." + } + }, + "additionalProperties": false, + "required": [ + "bucket_span", + "detectors" + ] + }, + "Ml_Types_AnalysisLimits": { + "type": "object", + "properties": { + "categorization_examples_limit": { + "type": "number", + "description": "The maximum number of examples stored per category in memory and in the results data store. If you increase this value, more examples are available, however it requires that you have more storage available. If you set this value to 0, no examples are stored. NOTE: The `categorization_examples_limit` applies only to analysis that uses categorization." + }, + "model_memory_limit": { + "type": "string", + "description": "The approximate maximum amount of memory resources that are required for analytical processing. Once this limit is approached, data pruning becomes more aggressive. Upon exceeding this limit, new entities are not modeled. If the `xpack.ml.max_model_memory_limit` setting has a value greater than 0 and less than 1024mb, that value is used instead of the default. The default value is relatively small to ensure that high resource usage is a conscious decision. 
If you have jobs that are expected to analyze high cardinality fields, you will likely need to use a higher value. If you specify a number instead of a string, the units are assumed to be MiB. Specifying a string is recommended for clarity. If you specify a byte size unit of `b` or `kb` and the number does not equate to a discrete number of megabytes, it is rounded down to the closest MiB. The minimum valid value is 1 MiB. If you specify a value less than 1 MiB, an error occurs. If you specify a value for the `xpack.ml.max_model_memory_limit` setting, an error occurs when you try to create jobs that have `model_memory_limit` values greater than that setting value." + } + }, + "additionalProperties": false + }, + "Ml_Types_CategorizationAnalyzerDefinition": { + "type": "object", + "properties": { + "char_filter": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "$ref": "#/components/schemas/Types_Analysis_HtmlStripCharFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_MappingCharFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PatternReplaceCharFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_IcuNormalizationCharFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KuromojiIterationMarkCharFilter" + } + ] + } + ] + }, + "description": "One or more character filters. In addition to the built-in character filters, other plugins can provide more character filters. If this property is not specified, no character filters are applied prior to categorization. If you are customizing some other aspect of the analyzer and you need to achieve the equivalent of `categorization_filters` (which are not permitted when some other aspect of the analyzer is customized), add them here as pattern replace character filters." 
+ }, + "filter": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "$ref": "#/components/schemas/Types_Analysis_AsciiFoldingTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_CommonGramsTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_ConditionTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_DelimitedPayloadTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_EdgeNGramTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_ElisionTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_FingerprintTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_HunspellTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_HyphenationDecompounderTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KeepTypesTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KeepWordsTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KeywordMarkerTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KStemTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_LengthTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_LimitTokenCountTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_LowercaseTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_MultiplexerTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_NGramTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_NoriPartOfSpeechTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PatternCaptureTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PatternReplaceTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PorterStemTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PredicateTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_RemoveDuplicatesTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_ReverseTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_ShingleTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_SnowballTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_StemmerOverrideTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_StemmerTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_StopTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_SynonymGraphTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_SynonymTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_TrimTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_TruncateTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_UniqueTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_UppercaseTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_WordDelimiterGraphTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_WordDelimiterTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KuromojiStemmerTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KuromojiReadingFormTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KuromojiPartOfSpeechTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_IcuTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_IcuCollationTokenFilter" + }, + { + "$ref": 
"#/components/schemas/Types_Analysis_IcuFoldingTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_IcuNormalizationTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_IcuTransformTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PhoneticTokenFilter" + }, + { + "$ref": "#/components/schemas/Types_Analysis_DictionaryDecompounderTokenFilter" + } + ] + } + ] + }, + "description": "One or more token filters. In addition to the built-in token filters, other plugins can provide more token filters. If this property is not specified, no token filters are applied prior to categorization." + }, + "tokenizer": { + "anyOf": [ + { + "type": "string" + }, + { + "anyOf": [ + { + "$ref": "#/components/schemas/Types_Analysis_CharGroupTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_EdgeNGramTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KeywordTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_LetterTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_LowercaseTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_NGramTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_NoriTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PathHierarchyTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_StandardTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_UaxEmailUrlTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_WhitespaceTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_KuromojiTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_PatternTokenizer" + }, + { + "$ref": "#/components/schemas/Types_Analysis_IcuTokenizer" + } + ] + } + ], + "description": "The name or definition of the tokenizer to use after character filters are applied. This property is compulsory if `categorization_analyzer` is specified as an object. Machine learning provides a tokenizer called `ml_standard` that tokenizes in a way that has been determined to produce good categorization results on a variety of log file formats for logs in English. If you want to use that tokenizer but change the character or token filters, specify \"tokenizer\": \"ml_standard\" in your `categorization_analyzer`. Additionally, the `ml_classic` tokenizer is available, which tokenizes in the same way as the non-customizable tokenizer in old versions of the product (before 6.2). `ml_classic` was the default categorization tokenizer in versions 6.2 to 7.13, so if you need categorization identical to the default for jobs created in these versions, specify \"tokenizer\": \"ml_classic\" in your `categorization_analyzer`." + } + }, + "additionalProperties": false + }, + "Ml_Types_ChunkingConfig": { + "type": "object", + "properties": { + "mode": { + "type": "string", + "enum": [ + "auto", + "manual", + "off" + ], + "description": "If the mode is `auto`, the chunk size is dynamically calculated;\nthis is the recommended value when the datafeed does not use aggregations.\nIf the mode is `manual`, chunking is applied according to the specified `time_span`;\nuse this mode when the datafeed uses aggregations. If the mode is `off`, no chunking is applied." + }, + "time_span": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "The time span that each search will be querying. This setting is applicable only when the `mode` is set to `manual`." 
+ } + }, + "additionalProperties": false, + "required": [ + "mode" + ] + }, + "Ml_Types_DataDescription": { + "type": "object", + "properties": { + "format": { + "type": "string", + "description": "Only JSON format is supported at this time." + }, + "time_field": { + "type": "string", + "description": "The name of the field that contains the timestamp." + }, + "time_format": { + "type": "string", + "description": "The time format, which can be `epoch`, `epoch_ms`, or a custom pattern. The value `epoch` refers to UNIX or Epoch time (the number of seconds since 1 Jan 1970). The value `epoch_ms` indicates that time is measured in milliseconds since the epoch. The `epoch` and `epoch_ms` time formats accept either integer or real values. Custom patterns must conform to the Java DateTimeFormatter class. When you use date-time formatting patterns, it is recommended that you provide the full date, time and time zone. For example: `yyyy-MM-dd'T'HH:mm:ssX`. If the pattern that you specify is not sufficient to produce a complete timestamp, job creation fails." + }, + "field_delimiter": { + "type": "string" + } + }, + "additionalProperties": false + }, + "Ml_Types_DatafeedConfig": { + "type": "object", + "properties": { + "aggregations": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_Aggregations_AggregationContainer" + }, + "description": "If set, the datafeed performs aggregation searches. Support for aggregations is limited and should be used only with low cardinality data." + }, + "aggs": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_Aggregations_AggregationContainer" + }, + "description": "If set, the datafeed performs aggregation searches. Support for aggregations is limited and should be used only with low cardinality data." + }, + "chunking_config": { + "$ref": "#/components/schemas/Ml_Types_ChunkingConfig", + "description": "Datafeeds might be required to search over long time periods, for several months or years. This search is split into time chunks in order to ensure the load on Elasticsearch is managed. Chunking configuration controls how the size of these time chunks are calculated and is an advanced configuration option." + }, + "datafeed_id": { + "type": "string", + "description": "A numerical character string that uniquely identifies the datafeed. This identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start and end with alphanumeric characters. The default value is the job identifier." + }, + "delayed_data_check_config": { + "$ref": "#/components/schemas/Ml_Types_DelayedDataCheckConfig", + "description": "Specifies whether the datafeed checks for missing data and the size of the window. The datafeed can optionally search over indices that have already been read in an effort to determine whether any data has subsequently been added to the index. If missing data is found, it is a good indication that the `query_delay` option is set too low and the data is being indexed after the datafeed has passed that moment in time. This check runs only on real-time datafeeds." + }, + "frequency": { + "type": "string", + "description": "The interval at which scheduled queries are made while the datafeed runs in real time. The default value is either the bucket span for short bucket spans, or, for longer bucket spans, a sensible fraction of the bucket span. For example: `150s`. 
When `frequency` is shorter than the bucket span, interim results for the last (partial) bucket are written, then eventually overwritten by the full bucket results. If the datafeed uses aggregations, this value must be divisible by the interval of the date histogram aggregation." + }, + "indexes": { + "type": "array", + "items": { + "type": "string" + } + }, + "indices": { + "type": "array", + "items": { + "type": "string" + }, + "description": "An array of index names. Wildcards are supported. If any indices are in remote clusters, the machine learning nodes must have the `remote_cluster_client` role." + }, + "indices_options": { + "$ref": "#/components/schemas/Types_IndicesOptions", + "description": "Specifies index expansion options that are used during search." + }, + "job_id": { + "type": "string" + }, + "max_empty_searches": { + "type": "number", + "description": "If a real-time datafeed has never seen any data (including during any initial training period), then it will automatically stop itself and close its associated job after this many real-time searches that return no documents. In other words, it will stop after `frequency` times `max_empty_searches` of real-time operation. If not set, then a datafeed with no end time that sees no data will remain started until it is explicitly stopped." + }, + "query": { + "$ref": "#/components/schemas/Types_QueryDsl_QueryContainer", + "description": "The Elasticsearch query domain-specific language (DSL). This value corresponds to the query object in an Elasticsearch search POST body. All the options that are supported by Elasticsearch can be used, as this object is passed verbatim to Elasticsearch." + }, + "query_delay": { + "type": "string", + "description": "The number of seconds behind real time that data is queried. For example, if data from 10:04 a.m. might not be searchable in Elasticsearch until 10:06 a.m., set this property to 120 seconds. The default value is randomly selected between `60s` and `120s`. This randomness improves the query performance when there are multiple jobs running on the same node." + }, + "runtime_mappings": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "$ref": "#/components/schemas/Types_Mapping_RuntimeField" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/Types_Mapping_RuntimeField" + } + } + ] + }, + "description": "Specifies runtime fields for the datafeed search." + }, + "script_fields": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Types_ScriptField" + }, + "description": "Specifies scripts that evaluate custom expressions and return script fields to the datafeed. The detector configuration objects in a job can contain functions that use these script fields." + }, + "scroll_size": { + "type": "number", + "description": "The size parameter that is used in Elasticsearch searches when the datafeed does not use aggregations. The maximum value is the value of `index.max_result_window`, which is 10,000 by default." + } + }, + "additionalProperties": false, + "required": [ + "indices", + "query" + ] + }, + "Ml_Types_DelayedDataCheckConfig": { + "type": "object", + "properties": { + "check_window": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "number" + } + ], + "description": "The window of time that is searched for late data. 
This window of time ends with the latest finalized bucket.\nIt defaults to null, which causes an appropriate `check_window` to be calculated when the real-time datafeed runs.\nIn particular, the default `check_window` span calculation is based on the maximum of `2h` or `8 * bucket_span`." + }, + "enabled": { + "type": "boolean", + "description": "Specifies whether the datafeed periodically checks for delayed data." + } + }, + "additionalProperties": false, + "required": [ + "enabled" + ] + }, + "Ml_Types_DetectionRule": { + "type": "object", + "properties": { + "actions": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "skip_result", + "skip_model_update" + ] + }, + "description": "The set of actions to be triggered when the rule applies. If more than one action is specified the effects of all actions are combined." + }, + "conditions": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Ml_Types_RuleCondition" + }, + "description": "An array of numeric conditions when the rule applies. A rule must either have a non-empty scope or at least one condition. Multiple conditions are combined together with a logical AND." + }, + "scope": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/Ml_Types_FilterRef" + }, + "description": "A scope of series where the rule applies. A rule must either have a non-empty scope or at least one condition. By default, the scope includes all series. Scoping is allowed for any of the fields that are also specified in `by_field_name`, `over_field_name`, or `partition_field_name`." + } + }, + "additionalProperties": false + }, + "Ml_Types_Detector": { + "type": "object", + "properties": { + "by_field_name": { + "type": "string", + "description": "The field used to split the data. In particular, this property is used for analyzing the splits with respect to their own history. It is used for finding unusual values in the context of the split." + }, + "custom_rules": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Ml_Types_DetectionRule" + }, + "description": "Custom rules enable you to customize the way detectors operate. For example, a rule may dictate conditions under which results should be skipped. Kibana refers to custom rules as job rules." + }, + "detector_description": { + "type": "string", + "description": "A description of the detector." + }, + "detector_index": { + "type": "number", + "description": "A unique identifier for the detector. This identifier is based on the order of the detectors in the `analysis_config`, starting at zero. If you specify a value for this property, it is ignored." + }, + "exclude_frequent": { + "type": "string", + "enum": [ + "all", + "none", + "by", + "over" + ], + "description": "If set, frequent entities are excluded from influencing the anomaly results. Entities can be considered frequent over time or frequent in a population. If you are working with both over and by fields, you can set `exclude_frequent` to `all` for both fields, or to `by` or `over` for those specific fields." + }, + "field_name": { + "type": "string", + "description": "The field that the detector uses in the function. If you use an event rate function such as count or rare, do not specify this field. The `field_name` cannot contain double quotes or backslashes." + }, + "function": { + "type": "string", + "description": "The analysis function that is used. For example, `count`, `rare`, `mean`, `min`, `max`, or `sum`." 
+ }, + "over_field_name": { + "type": "string", + "description": "The field used to split the data. In particular, this property is used for analyzing the splits with respect to the history of all splits. It is used for finding unusual values in the population of all splits." + }, + "partition_field_name": { + "type": "string", + "description": "The field used to segment the analysis. When you use this property, you have completely independent baselines for each value of this field." + }, + "use_null": { + "type": "boolean", + "description": "Defines whether a new series is used as the null series when there is no value for the by or partition fields." + } + }, + "additionalProperties": false, + "required": [ + "function" + ] + }, + "Ml_Types_FilterRef": { + "type": "object", + "properties": { + "filter_id": { + "type": "string", + "description": "The identifier for the filter." + }, + "filter_type": { + "type": "string", + "enum": [ + "include", + "exclude" + ], + "description": "If set to `include`, the rule applies for values in the filter. If set to `exclude`, the rule applies for values not in the filter." + } + }, + "additionalProperties": false, + "required": [ + "filter_id" + ] + }, + "Ml_Types_ModelPlotConfig": { + "type": "object", + "properties": { + "annotations_enabled": { + "type": "boolean", + "description": "If true, enables calculation and storage of the model change annotations for each entity that is being analyzed." + }, + "enabled": { + "type": "boolean", + "description": "If true, enables calculation and storage of the model bounds for each entity that is being analyzed." + }, + "terms": { + "type": "string", + "description": "Limits data collection to this comma separated list of partition or by field values. If terms are not specified or it is an empty string, no filtering is applied. Wildcards are not supported. Only the specified terms can be viewed when using the Single Metric Viewer." + } + }, + "additionalProperties": false + }, + "Ml_Types_PerPartitionCategorization": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "To enable this setting, you must also set the `partition_field_name` property to the same value in every detector that uses the keyword `mlcategory`. Otherwise, job creation fails." + }, + "stop_on_warn": { + "type": "boolean", + "description": "This setting can be set to true only if per-partition categorization is enabled. If true, both categorization and subsequent anomaly detection stops for partitions where the categorization status changes to warn. This setting makes it viable to have a job where it is expected that categorization works well for some partitions but not others; you do not pay the cost of bad categorization forever in the partitions where it works badly." + } + }, + "additionalProperties": false + }, + "Ml_Types_RuleCondition": { + "type": "object", + "properties": { + "applies_to": { + "type": "string", + "enum": [ + "actual", + "typical", + "diff_from_typical", + "time" + ], + "description": "Specifies the result property to which the condition applies. If your detector uses `lat_long`, `metric`, `rare`, or `freq_rare` functions, you can only specify conditions that apply to time." + }, + "operator": { + "type": "string", + "enum": [ + "gt", + "gte", + "lt", + "lte" + ], + "description": "Specifies the condition operator. The available options are greater than, greater than or equals, less than, and less than or equals." 
+ }, + "value": { + "type": "number", + "description": "The value that is compared against the `applies_to` field using the operator." + } + }, + "additionalProperties": false, + "required": [ + "applies_to", + "operator", + "value" + ] + } + } + } +} \ No newline at end of file diff --git a/x-pack/plugins/ml/server/models/json_schema_service/schema_overrides.ts b/x-pack/plugins/ml/server/models/json_schema_service/schema_overrides.ts new file mode 100644 index 00000000000000..bacc90389592a2 --- /dev/null +++ b/x-pack/plugins/ml/server/models/json_schema_service/schema_overrides.ts @@ -0,0 +1,32 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import type { SupportedPath } from '../../../common/api_schemas/json_schema_schema'; +import type { PropertyDefinition } from './types'; + +/** + * Extension of the schema definition extracted from the Elasticsearch specification. + */ +export const jsonSchemaOverrides: Record> = { + '/_ml/anomaly_detectors/{job_id}': { + // background_persist_interval is required according to the ES spec + required: ['analysis_config', 'data_description'], + properties: { + job_id: { + type: 'string', + description: 'Identifier for the anomaly detection job.', + }, + }, + }, + '/_ml/datafeeds/{datafeed_id}': { + properties: { + datafeed_id: { + type: 'string', + }, + }, + }, +}; diff --git a/x-pack/plugins/ml/server/models/json_schema_service/types.ts b/x-pack/plugins/ml/server/models/json_schema_service/types.ts new file mode 100644 index 00000000000000..ecb44e65712822 --- /dev/null +++ b/x-pack/plugins/ml/server/models/json_schema_service/types.ts @@ -0,0 +1,16 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
 */ + +export interface PropertyDefinition { + properties?: Record<string, PropertyDefinition>; + type: string; + description?: string; + $ref?: string; + items?: PropertyDefinition; + anyOf?: PropertyDefinition[]; + required?: string[]; +} diff --git a/x-pack/plugins/ml/server/plugin.ts b/x-pack/plugins/ml/server/plugin.ts index 75c3082a61f634..81e8316f1bda6d 100644 --- a/x-pack/plugins/ml/server/plugin.ts +++ b/x-pack/plugins/ml/server/plugin.ts @@ -24,6 +24,7 @@ import type { PluginStart as DataViewsPluginStart } from '@kbn/data-views-plugin import type { SpacesPluginSetup } from '@kbn/spaces-plugin/server'; import { FieldFormatsStart } from '@kbn/field-formats-plugin/server'; import type { HomeServerPluginSetup } from '@kbn/home-plugin/server'; +import { jsonSchemaRoutes } from './routes/json_schema'; import { notificationsRoutes } from './routes/notifications'; import type { PluginsSetup, PluginsStart, RouteInitialization } from './types'; import { PLUGIN_ID } from '../common/constants/app'; @@ -236,6 +237,7 @@ export class MlServerPlugin }); trainedModelsRoutes(routeInit); notificationsRoutes(routeInit); + jsonSchemaRoutes(routeInit); alertingRoutes(routeInit, sharedServicesProviders); initMlServerLog({ log: this.log }); diff --git a/x-pack/plugins/ml/server/routes/apidoc.json b/x-pack/plugins/ml/server/routes/apidoc.json index 573080a1d187ae..addb672aa3d88b 100644 --- a/x-pack/plugins/ml/server/routes/apidoc.json +++ b/x-pack/plugins/ml/server/routes/apidoc.json @@ -187,6 +187,9 @@ "ModelManagement", "GetModelManagementNodesOverview", - "GetModelManagementMemoryUsage" + "GetModelManagementMemoryUsage", + + "JsonSchema", + "GetJsonSchema" ] } diff --git a/x-pack/plugins/ml/server/routes/json_schema.ts b/x-pack/plugins/ml/server/routes/json_schema.ts new file mode 100644 index 00000000000000..c1c7c65b485190 --- /dev/null +++ b/x-pack/plugins/ml/server/routes/json_schema.ts @@ -0,0 +1,48 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
 */ + +import { getJsonSchemaQuerySchema } from '../../common/api_schemas/json_schema_schema'; +import { wrapError } from '../client/error_wrapper'; +import { RouteInitialization } from '../types'; +import { JsonSchemaService } from '../models/json_schema_service'; + +export function jsonSchemaRoutes({ router, routeGuard }: RouteInitialization) { + /** + * @apiGroup JsonSchema + * + * @api {get} /api/ml/json_schema Get requested JSON schema + * @apiName GetJsonSchema + * @apiDescription Retrieves the JSON schema + */ + router.get( + { + path: '/api/ml/json_schema', + validate: { + query: getJsonSchemaQuerySchema, + }, + options: { + tags: ['access:ml:canAccessML'], + }, + }, + routeGuard.fullLicenseAPIGuard(async ({ request, response }) => { + try { + const jsonSchemaService = new JsonSchemaService(); + + const result = await jsonSchemaService.extractSchema( + request.query.path, + request.query.method + ); + + return response.ok({ + body: result, + }); + } catch (e) { + return response.customError(wrapError(e)); + } + }) + ); +} diff --git a/x-pack/plugins/ml/tsconfig.json b/x-pack/plugins/ml/tsconfig.json index 72fbeee62048b2..dd0d5d0d23ec05 100644 --- a/x-pack/plugins/ml/tsconfig.json +++ b/x-pack/plugins/ml/tsconfig.json @@ -81,5 +81,6 @@ "@kbn/unified-field-list-plugin", "@kbn/shared-ux-router", "@kbn/saved-search-plugin", + "@kbn/monaco", ], } diff --git a/x-pack/test/functional/services/ml/job_wizard_advanced.ts b/x-pack/test/functional/services/ml/job_wizard_advanced.ts index ca36cb3687c18f..b904c7dbc1d8ed 100644 --- a/x-pack/test/functional/services/ml/job_wizard_advanced.ts +++ b/x-pack/test/functional/services/ml/job_wizard_advanced.ts @@ -17,7 +17,7 @@ export function MachineLearningJobWizardAdvancedProvider( const comboBox = getService('comboBox'); const testSubjects = getService('testSubjects'); const retry = getService('retry'); - const aceEditor = getService('aceEditor'); + const monacoEditor = getService('monacoEditor'); return { async getValueOrPlaceholder(inputLocator: string): Promise<string> { @@ -30,13 +30,11 @@ }, async assertDatafeedQueryEditorExists() { - await testSubjects.existOrFail('mlAdvancedDatafeedQueryEditor > codeEditorContainer'); + await testSubjects.existOrFail('mlAdvancedDatafeedQueryEditor > codeEditorHint'); }, async assertDatafeedQueryEditorValue(expectedValue: string) { - const actualValue = await aceEditor.getValue( - 'mlAdvancedDatafeedQueryEditor > codeEditorContainer' - ); + const actualValue = await monacoEditor.getCodeEditorValue(); expect(actualValue).to.eql( expectedValue, `Expected datafeed query editor value to be '${expectedValue}' (got '${actualValue}')`
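For reference, a minimal sketch of how a browser-side consumer might call the `/api/ml/json_schema` route added above. This is illustrative only and is not part of the change set: it assumes Kibana's `HttpStart` service from `@kbn/core-http-browser`, and the `fetchPutJobSchema` helper name is hypothetical. The `path` and `method` query parameters are the ones validated by `getJsonSchemaQuerySchema`.

```ts
import type { HttpStart } from '@kbn/core-http-browser';

// Hypothetical helper: fetch the JSON schema extracted for the
// "PUT /_ml/anomaly_detectors/{job_id}" Elasticsearch API.
export async function fetchPutJobSchema(http: HttpStart): Promise<object> {
  // `path` must be one of the two supported ES API paths declared in
  // getJsonSchemaQuerySchema; `method` is the HTTP method of that ES API.
  return await http.get<object>('/api/ml/json_schema', {
    query: {
      path: '/_ml/anomaly_detectors/{job_id}',
      method: 'put',
    },
  });
}
```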