From 0fd5d2ff07eeba27a1b5bc52be01be305a649ed2 Mon Sep 17 00:00:00 2001 From: Arya Soni Date: Thu, 30 Oct 2025 00:29:54 +0530 Subject: [PATCH 01/34] [DOC] Can't find k1 parameter using search (not indexed?) Signed-off-by: Arya Soni --- _search-plugins/keyword-search.md | 9 ++++++++- assets/js/search.js | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/_search-plugins/keyword-search.md b/_search-plugins/keyword-search.md index f23aa9c20ac..bd75abf988a 100644 --- a/_search-plugins/keyword-search.md +++ b/_search-plugins/keyword-search.md @@ -3,6 +3,8 @@ layout: default title: Keyword search has_children: false nav_order: 10 +meta_description: Learn about BM25 keyword search in OpenSearch, including how to configure BM25 parameters k1 and b for better search relevance +meta_keywords: BM25, keyword search, k1, b, term frequency, inverse document frequency, TF/IDF, search relevance, Okapi BM25 --- # Keyword search @@ -165,7 +167,12 @@ PUT /testindex ## Configuring BM25 similarity -You can configure BM25 similarity parameters at the index level as follows: +You can configure BM25 similarity parameters at the index level. The BM25 algorithm supports two key parameters: `k1` (term saturation parameter) and `b` (length normalization parameter). These parameters control how BM25 scores documents: + +- The `k1` parameter controls term frequency saturation, determining how quickly the relevance score increases as term frequency grows. +- The `b` parameter controls the impact of document length on scoring. + +You can configure these parameters at the index level as follows: ```json PUT /testindex diff --git a/assets/js/search.js b/assets/js/search.js index 427d736c881..36b6c0ad559 100644 --- a/assets/js/search.js +++ b/assets/js/search.js @@ -90,7 +90,7 @@ const doSearch = async () => { const query = elInput.value.replace(/[^a-z0-9-_. ]+/ig, ' '); - if (query.length < 3) return hideResults(true); + if (query.length < 2) return hideResults(true); if (query === lastQuery) return; recordEvent('search', { From 0458b0bf00e5b857b2d3802bc3f36deea429686f Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Wed, 29 Oct 2025 15:10:44 -0400 Subject: [PATCH 02/34] Update plugins.calcite.enabled setting default (#11435) Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _search-plugins/sql/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_search-plugins/sql/settings.md b/_search-plugins/sql/settings.md index 2bd319adaa0..19d3575407a 100644 --- a/_search-plugins/sql/settings.md +++ b/_search-plugins/sql/settings.md @@ -80,7 +80,7 @@ Setting | Default | Description `plugins.query.size_limit` | 200 | Sets the default size of index that the query engine fetches from OpenSearch. `plugins.query.datasources.enabled` | true | Change to `false` to disable support for data sources in the plugin. `plugins.query.field_type_tolerance` | true | If `false`, then an array is reduced to the first non-array value at any nesting level. For example, `[[1, 2], [3, 4]]` will be reduced to `1`. If `true`, then the array is preserved. Default is `true`. -`plugins.calcite.enabled` | false | Set to `true` to enable experimental features that use the Apache Calcite query engine, including advanced SQL and PPL capabilities such as subsearches, joins, and lookup operations. 
+`plugins.calcite.enabled` | true | Enables the Apache Calcite query engine, including advanced SQL and PPL capabilities such as subsearches, joins, and lookup operations. ## Spark connector settings From 1eeb672dc5003e274fe43eb5f69b4da0c37875ad Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 30 Oct 2025 12:10:56 +0000 Subject: [PATCH 03/34] updating the example for more_like_this (#11454) Signed-off-by: Anton Rubin Signed-off-by: Arya Soni --- _query-dsl/specialized/more-like-this.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/_query-dsl/specialized/more-like-this.md b/_query-dsl/specialized/more-like-this.md index 59890069c79..7af8ced6547 100644 --- a/_query-dsl/specialized/more-like-this.md +++ b/_query-dsl/specialized/more-like-this.md @@ -103,11 +103,15 @@ PUT /articles-optimized { "mappings": { "properties": { - "title": { + "name": { "type": "text", "term_vector": "with_positions_offsets" }, - "content": { + "alias": { + "type": "text", + "term_vector": "with_positions_offsets" + }, + "quote": { "type": "text", "term_vector": "with_positions_offsets" } From 5dfeec370a7b8d377e9788f0c943bee3a7c30147 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 30 Oct 2025 12:33:18 +0000 Subject: [PATCH 04/34] updating the debian install with apt (#11456) Signed-off-by: Anton Rubin Signed-off-by: Arya Soni --- _install-and-configure/install-dashboards/debian.md | 4 ++-- _install-and-configure/install-opensearch/debian.md | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/_install-and-configure/install-dashboards/debian.md b/_install-and-configure/install-dashboards/debian.md index b58a0a0a077..dc915fb2476 100644 --- a/_install-and-configure/install-dashboards/debian.md +++ b/_install-and-configure/install-dashboards/debian.md @@ -79,11 +79,11 @@ APT, the primary package management tool for Debian–based operating systems, a ``` 1. Import the public GPG key. This key is used to verify that the APT repository is signed. ```bash - curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | sudo gpg --dearmor --batch --yes -o /usr/share/keyrings/opensearch-release-keyring + curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/opensearch-release-keyring ``` 1. Create an APT repository for OpenSearch. ```bash - echo "deb [signed-by=/usr/share/keyrings/opensearch-release-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch-dashboards/{{major_version_mask}}/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-dashboards-{{major_version_mask}}.list + echo "deb [signed-by=/etc/apt/keyrings/opensearch-release-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch-dashboards/{{major_version_mask}}/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-dashboards-{{major_version_mask}}.list ``` 1. Verify that the repository was created successfully. ```bash diff --git a/_install-and-configure/install-opensearch/debian.md b/_install-and-configure/install-opensearch/debian.md index 04a6bfc9c5d..6e7517d5966 100644 --- a/_install-and-configure/install-opensearch/debian.md +++ b/_install-and-configure/install-opensearch/debian.md @@ -114,13 +114,15 @@ APT, the primary package management tool for Debian–based operating systems, a 1. Import the public GPG key. This key is used to verify that the APT repository is signed. 
```bash - curl -o- https://artifacts.opensearch.org/publickeys/opensearch-release.pgp | sudo gpg --dearmor --batch --yes -o /usr/share/keyrings/opensearch-release-keyring + curl -fsSL https://artifacts.opensearch.org/publickeys/opensearch-release.pgp \ + | sudo gpg --dearmor -o /etc/apt/keyrings/opensearch.gpg ``` {% include copy.html %} 1. Create an APT repository for OpenSearch: ```bash - echo "deb [signed-by=/usr/share/keyrings/opensearch-release-keyring] https://artifacts.opensearch.org/releases/bundle/opensearch/{{major_version_mask}}/apt stable main" | sudo tee /etc/apt/sources.list.d/opensearch-{{major_version_mask}}.list + echo "deb [signed-by=/etc/apt/keyrings/opensearch.gpg] https://artifacts.opensearch.org/releases/bundle/opensearch/3.x/apt stable main" \ + | sudo tee /etc/apt/sources.list.d/opensearch-3.x.list ``` {% include copy.html %} From 554396aecc14659f046f12074e35b4a5d6db6b5f Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 30 Oct 2025 14:30:54 +0000 Subject: [PATCH 05/34] updating the logstash migration example (#11201) * updating the logstash migration example Signed-off-by: Anton Rubin * removing the migration from logstash page Signed-off-by: Anton Rubin --------- Signed-off-by: Anton Rubin Signed-off-by: Arya Soni --- _data-prepper/getting-started.md | 2 - _data-prepper/index.md | 1 + .../migrating-from-logstash-data-prepper.md | 48 ------------------- 3 files changed, 1 insertion(+), 50 deletions(-) delete mode 100644 _data-prepper/migrating-from-logstash-data-prepper.md diff --git a/_data-prepper/getting-started.md b/_data-prepper/getting-started.md index 5dc90316d0f..92a38adafed 100644 --- a/_data-prepper/getting-started.md +++ b/_data-prepper/getting-started.md @@ -151,8 +151,6 @@ Trace analytics is an important Data Prepper use case. If you haven't yet config Log ingestion is also an important Data Prepper use case. To learn more, see [Log analytics]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/log-analytics/). -To learn how to run Data Prepper with a Logstash configuration, see [Migrating from Logstash]({{site.url}}{{site.baseurl}}/data-prepper/migrating-from-logstash-data-prepper/). - For information on how to monitor Data Prepper, see [Monitoring]({{site.url}}{{site.baseurl}}/data-prepper/managing-data-prepper/monitoring/). ## More examples diff --git a/_data-prepper/index.md b/_data-prepper/index.md index 63ff2fd07c1..c8f4d321820 100644 --- a/_data-prepper/index.md +++ b/_data-prepper/index.md @@ -10,6 +10,7 @@ redirect_from: - /clients/data-prepper/index/ - /monitoring-plugins/trace/data-prepper/ - /data-prepper/index/ + - /data-prepper/migrating-from-logstash-data-prepper/ --- # OpenSearch Data Prepper diff --git a/_data-prepper/migrating-from-logstash-data-prepper.md b/_data-prepper/migrating-from-logstash-data-prepper.md deleted file mode 100644 index 13548092dce..00000000000 --- a/_data-prepper/migrating-from-logstash-data-prepper.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -layout: default -title: Migrating from Logstash -nav_order: 25 -redirect_from: - - /clients/data-prepper/configure-logstash-data-prepper/ - - /data-prepper/configure-logstash-data-prepper/ ---- - -# Migrating from Logstash - -You can run OpenSearch Data Prepper with a Logstash configuration. - -As mentioned in [Getting started with OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/), you'll need to configure Data Prepper with a pipeline using a `pipelines.yaml` file. 
- -Alternatively, if you have a Logstash configuration `logstash.conf` to configure Data Prepper instead of `pipelines.yaml`. - -## Supported plugins - -As of the Data Prepper 1.2 release, the following plugins from the Logstash configuration are supported: -* HTTP Input plugin -* Grok Filter plugin -* Elasticsearch Output plugin -* Amazon Elasticsearch Output plugin - -## Limitations -* Apart from the supported plugins, all other plugins from the Logstash configuration will throw an `Exception` and fail to run. -* Conditionals in the Logstash configuration are not supported as of the Data Prepper 1.2 release. - -## Running Data Prepper with a Logstash configuration - -1. To install Data Prepper's Docker image, see Installing Data Prepper in [Getting Started with OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started#1-installing-data-prepper). - -2. Run the Docker image installed in Step 1 by supplying your `logstash.conf` configuration. - -``` -docker run --name data-prepper -p 4900:4900 -v ${PWD}/logstash.conf:/usr/share/data-prepper/pipelines.conf opensearchproject/data-prepper:latest pipelines.conf -``` - -The `logstash.conf` file is converted to `logstash.yaml` by mapping the plugins and attributes in the Logstash configuration to the corresponding plugins and attributes in Data Prepper. -You can find the converted `logstash.yaml` file in the same directory where you stored `logstash.conf`. - - -The following output in your terminal indicates that Data Prepper is running correctly: - -``` -INFO org.opensearch.dataprepper.pipeline.ProcessWorker - log-pipeline Worker: No records received from buffer -``` From 6ad513db61c0628e884b0b5b2c2b8e6baa878aa1 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 30 Oct 2025 12:52:35 -0400 Subject: [PATCH 06/34] Add tag documentation to ingest processors and pipelines topic (#11459) Signed-off-by: Fanit Kolchina Signed-off-by: Arya Soni --- _ingest-pipelines/create-ingest.md | 186 +++++++++++++++++++++++++ _ingest-pipelines/processors/ip2geo.md | 5 + 2 files changed, 191 insertions(+) diff --git a/_ingest-pipelines/create-ingest.md b/_ingest-pipelines/create-ingest.md index 3175327d873..17938e74511 100644 --- a/_ingest-pipelines/create-ingest.md +++ b/_ingest-pipelines/create-ingest.md @@ -100,3 +100,189 @@ PUT _ingest/pipeline/my-pipeline } ``` {% include copy-curl.html %} + +## Processor tags for monitoring and debugging + +When monitoring ingest pipeline performance using the `GET /_nodes/stats/ingest` API, processors without tags appear with generic names in the statistics output. This makes it difficult to identify which specific processor stage might be causing performance bottlenecks in complex pipelines. All ingest processors support an optional `tag` parameter that assigns a meaningful identifier to each processor. This parameter is useful for monitoring pipeline performance and debugging issues in production environments. + +The following examples demonstrate the difference between using processors with and without tags when monitoring pipeline performance. 
+ +Create a pipeline without processor tags: + +```json +PUT _ingest/pipeline/log-processing-without-tags +{ + "description": "Process web server logs without processor tags", + "processors": [ + { + "grok": { + "field": "message", + "patterns": ["%{COMMONAPACHELOG}"] + } + }, + { + "date": { + "field": "timestamp", + "formats": ["dd/MMM/yyyy:HH:mm:ss Z"] + } + }, + { + "convert": { + "field": "response", + "type": "integer" + } + } + ] +} +``` +{% include copy-curl.html %} + +Create a pipeline with processor tags: + +```json +PUT _ingest/pipeline/log-processing-with-tags +{ + "description": "Process web server logs with processor tags for monitoring", + "processors": [ + { + "grok": { + "field": "message", + "patterns": ["%{COMMONAPACHELOG}"], + "tag": "parse-apache-log" + } + }, + { + "date": { + "field": "timestamp", + "formats": ["dd/MMM/yyyy:HH:mm:ss Z"], + "tag": "parse-timestamp" + } + }, + { + "convert": { + "field": "response", + "type": "integer", + "tag": "convert-response-code" + } + } + ] +} +``` +{% include copy-curl.html %} + +Test both pipelines with sample log data: + +```json +POST logs-without-tags/_doc?pipeline=log-processing-without-tags +{ + "message": "192.168.1.100 - - [30/Oct/2023:14:23:45 +0000] \"POST /api/users HTTP/1.1\" 201 512" +} +``` +{% include copy-curl.html %} + +```json +POST logs-with-tags/_doc?pipeline=log-processing-with-tags +{ + "message": "192.168.1.100 - - [30/Oct/2023:14:23:45 +0000] \"POST /api/users HTTP/1.1\" 201 512" +} +``` +{% include copy-curl.html %} + +Check the ingest statistics to see the difference in processor identification: + +```json +GET _nodes/stats/ingest +``` +{% include copy-curl.html %} + +The pipeline without tags contains generic processor names: + +```json +"log-processing-without-tags": { + "count": 1, + "time_in_millis": 1, + "current": 0, + "failed": 0, + "processors": [ + { + "grok": { + "type": "grok", + "stats": { + "count": 1, + "time_in_millis": 0, + "current": 0, + "failed": 0 + } + } + }, + { + "date": { + "type": "date", + "stats": { + "count": 1, + "time_in_millis": 1, + "current": 0, + "failed": 0 + } + } + }, + { + "convert": { + "type": "convert", + "stats": { + "count": 1, + "time_in_millis": 0, + "current": 0, + "failed": 0 + } + } + } + ] +} +``` + +The pipeline with tags contains descriptive processor names: + +```json +"log-processing-with-tags": { + "count": 1, + "time_in_millis": 0, + "current": 0, + "failed": 0, + "processors": [ + { + "grok:parse-apache-log": { + "type": "grok", + "stats": { + "count": 1, + "time_in_millis": 0, + "current": 0, + "failed": 0 + } + } + }, + { + "date:parse-timestamp": { + "type": "date", + "stats": { + "count": 1, + "time_in_millis": 0, + "current": 0, + "failed": 0 + } + } + }, + { + "convert:convert-response-code": { + "type": "convert", + "stats": { + "count": 1, + "time_in_millis": 0, + "current": 0, + "failed": 0 + } + } + } + ] +} +``` diff --git a/_ingest-pipelines/processors/ip2geo.md b/_ingest-pipelines/processors/ip2geo.md index 8e53c778a1c..963cb757bb0 100644 --- a/_ingest-pipelines/processors/ip2geo.md +++ b/_ingest-pipelines/processors/ip2geo.md @@ -168,6 +168,11 @@ The following table lists the required and optional parameters for the `ip2geo` | `ignore_missing` | Optional | Specifies whether the processor should ignore documents that do not contain the specified field. If set to `true`, the processor does not modify the document if the field does not exist or is `null`. Default is `false`. 
| | `properties` | Optional | The field that controls which properties are added to `target_field` from `datasource`. Default is all the fields in `datasource`. | | `target_field` | Optional | The field containing the geographical information retrieved from the data source. Default is `ip2geo`. | +| `description` | Optional | A brief description of the processor. | +| `if` | Optional | A condition for running the processor. | +| `ignore_failure` | Optional | Specifies whether the processor continues execution even if it encounters errors. If set to `true`, failures are ignored. Default is `false`. | +| `on_failure` | Optional | A list of processors to run if the processor fails. | +| `tag` | Optional | An identifier tag for the processor. Useful for debugging in order to distinguish between processors of the same type. | ## Using the processor From 22ecfae78153eea7cc28f1527495f2f6f48573f1 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:45:48 -0400 Subject: [PATCH 07/34] Update blueprints.md (#11463) Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _ml-commons-plugin/remote-models/blueprints.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_ml-commons-plugin/remote-models/blueprints.md b/_ml-commons-plugin/remote-models/blueprints.md index ef43e708bfd..1193d7ec34c 100644 --- a/_ml-commons-plugin/remote-models/blueprints.md +++ b/_ml-commons-plugin/remote-models/blueprints.md @@ -88,7 +88,7 @@ The `client_config` parameter supports the following options. | Field | Data type | Description | |:---|:---|:---| | `max_connection` | Integer | The maximum number of concurrent connections that the client can establish to the server. Some remote services, like SageMaker, constrain the maximum number of concurrent connections and throw a throttling exception if the number of concurrent connections exceeds the threshold. The maximum number of concurrent OpenSearch connections is `max_connection`*`node_number_for_connector`. To mitigate this issue, try to decrease the value of this parameter and modify the retry settings in `client_config`. Default is `30`. | -| `connection_timeout` | Integer | The maximum amount of time (in seconds) that the client will wait while trying to establish a connection to the server. A timeout prevents the client from waiting indefinitely and allows the client to recover when it encounters unreachable network endpoints. | +| `connection_timeout` | Integer | The maximum amount of time (in milliseconds) that the client will wait while trying to establish a connection to the server. A timeout prevents the client from waiting indefinitely and allows the client to recover when it encounters unreachable network endpoints. | | `read_timeout` | Integer | The maximum amount of time (in seconds) that the client will wait for a response from the server after sending a request. This is useful when the server is slow to respond or encounters an issue while processing a request. | | `retry_backoff_policy` | String | The backoff policy for retries to the remote connector. This is useful when there is spike in traffic causing throttling exceptions. Supported policies are `constant`, `exponential_equal_jitter`, and `exponential_full_jitter`. Default is `constant`. | | `max_retry_times` | Integer | The maximum number of times that a single remote inference request will be retried. 
This is useful when there is a spike in traffic causing throttling exceptions. When set to `0`, retrying is disabled. When set to `-1`, OpenSearch does not limit the number of `retry_times`. Setting this to a positive integer specifies the maximum number of retry attempts. Default is `0`. | From 2cc4cd5b286e328a09b70a2cc9cddc8a41bdb84d Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:26:35 -0400 Subject: [PATCH 08/34] Add copy buttons and highlighting to data prepper code samples (#11465) Signed-off-by: Fanit Kolchina Signed-off-by: Arya Soni --- .../common-use-cases/log-analytics.md | 9 +++-- .../common-use-cases/trace-analytics.md | 38 ++++++++++--------- _data-prepper/getting-started.md | 21 +++++----- .../configuration/processors/aggregate.md | 10 ++--- .../processors/anomaly-detector.md | 5 ++- .../configuration/processors/aws-lambda.md | 5 +-- .../pipelines/configuration/processors/csv.md | 5 ++- .../configuration/processors/date.md | 4 +- .../configuration/processors/decompress.md | 1 + .../configuration/processors/delay.md | 1 + .../configuration/processors/dissect.md | 5 ++- .../configuration/processors/flatten.md | 3 ++ .../configuration/processors/geoip.md | 6 ++- 13 files changed, 67 insertions(+), 46 deletions(-) diff --git a/_data-prepper/common-use-cases/log-analytics.md b/_data-prepper/common-use-cases/log-analytics.md index 242e16dfe94..715200ea72a 100644 --- a/_data-prepper/common-use-cases/log-analytics.md +++ b/_data-prepper/common-use-cases/log-analytics.md @@ -67,7 +67,7 @@ log-pipeline: # Change to your credentials username: "admin" password: "admin" - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. Comment out the @@ -78,6 +78,7 @@ log-pipeline: # You should change this to correspond with how your OpenSearch indexes are set up. index: apache_logs ``` +{% include copy.html %} This pipeline configuration is an example of Apache log ingestion. Don't forget that you can easily configure the Grok Processor for your own custom logs. You will need to modify the configuration for your OpenSearch cluster. @@ -100,7 +101,7 @@ Note that you should adjust the file `path`, output `Host`, and `Port` according The following is an example `fluent-bit.conf` file without SSL and basic authentication enabled on the HTTP source: -``` +```text [INPUT] name tail refresh_interval 5 @@ -115,6 +116,7 @@ The following is an example `fluent-bit.conf` file without SSL and basic authent URI /log/ingest Format json ``` +{% include copy.html %} If your HTTP source has SSL and basic authentication enabled, you will need to add the details of `http_User`, `http_Passwd`, `tls.crt_file`, and `tls.key_file` to the `fluent-bit.conf` file, as shown in the following example. 
@@ -122,7 +124,7 @@ If your HTTP source has SSL and basic authentication enabled, you will need to a The following is an example `fluent-bit.conf` file with SSL and basic authentication enabled on the HTTP source: -``` +```text [INPUT] name tail refresh_interval 5 @@ -142,6 +144,7 @@ The following is an example `fluent-bit.conf` file with SSL and basic authentica URI /log/ingest Format json ``` +{% include copy.html %} # Next steps diff --git a/_data-prepper/common-use-cases/trace-analytics.md b/_data-prepper/common-use-cases/trace-analytics.md index 2c1351d4ee8..47c0f2fe051 100644 --- a/_data-prepper/common-use-cases/trace-analytics.md +++ b/_data-prepper/common-use-cases/trace-analytics.md @@ -116,7 +116,7 @@ The following example demonstrates how to build a pipeline that supports the [Op Starting with Data Prepper version 2.0, Data Prepper no longer supports the `otel_traces_prepper` processor. The `otel_traces` processor replaces the `otel_traces_prepper` processor and supports some of Data Prepper's recent data model changes. Instead, you should use the `otel_traces` processor. See the following YAML file example: -```yml +```yaml entry-pipeline: delay: "100" source: @@ -167,6 +167,7 @@ service-map-pipeline: password: admin index_type: trace-analytics-service-map ``` +{% include copy.html %} To maintain similar ingestion throughput and latency, scale the `buffer_size` and `batch_size` by the estimated maximum batch size in the client request payload. {: .tip} @@ -186,6 +187,7 @@ source: username: "my-user" password: "my_s3cr3t" ``` +{% include copy.html %} #### Example: pipeline.yaml @@ -193,14 +195,14 @@ The following is an example `pipeline.yaml` file without SSL and basic authentic ```yaml otel-trace-pipeline: - # workers is the number of threads processing data in each pipeline. + # workers is the number of threads processing data in each pipeline. # We recommend same value for all pipelines. # default value is 1, set a value based on the machine you are running Data Prepper - workers: 8 + workers: 8 # delay in milliseconds is how often the worker threads should process data. # Recommend not to change this config as we want the entry-pipeline to process as quick as possible # default value is 3_000 ms - delay: "100" + delay: "100" source: otel_trace_source: #record_type: event # Add this when using Data Prepper 1.x. This option is removed in 2.0 @@ -209,8 +211,8 @@ otel-trace-pipeline: unauthenticated: buffer: bounded_blocking: - # buffer_size is the number of ExportTraceRequest from otel-collector the data prepper should hold in memeory. - # We recommend to keep the same buffer_size for all pipelines. + # buffer_size is the number of ExportTraceRequest from otel-collector the data prepper should hold in memeory. + # We recommend to keep the same buffer_size for all pipelines. # Make sure you configure sufficient heap # default value is 512 buffer_size: 512 @@ -225,9 +227,9 @@ otel-trace-pipeline: name: "entry-pipeline" raw-trace-pipeline: # Configure same as the otel-trace-pipeline - workers: 8 + workers: 8 # We recommend using the default value for the raw-trace-pipeline. 
- delay: "3000" + delay: "3000" source: pipeline: name: "entry-pipeline" @@ -248,7 +250,7 @@ raw-trace-pipeline: # Change to your credentials username: "admin" password: "admin" - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. Comment out the @@ -262,7 +264,7 @@ raw-trace-pipeline: # Change to your credentials username: "admin" password: "admin" - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. Comment out the @@ -277,14 +279,14 @@ service-map-pipeline: name: "entry-pipeline" processor: - service_map: - # The window duration is the maximum length of time the data prepper stores the most recent trace data to evaluvate service-map relationships. + # The window duration is the maximum length of time the data prepper stores the most recent trace data to evaluvate service-map relationships. # The default is 3 minutes, this means we can detect relationships between services from spans reported in last 3 minutes. - # Set higher value if your applications have higher latency. - window_duration: 180 + # Set higher value if your applications have higher latency. + window_duration: 180 buffer: bounded_blocking: - # buffer_size is the number of ExportTraceRequest from otel-collector the data prepper should hold in memeory. - # We recommend to keep the same buffer_size for all pipelines. + # buffer_size is the number of ExportTraceRequest from otel-collector the data prepper should hold in memeory. + # We recommend to keep the same buffer_size for all pipelines. # Make sure you configure sufficient heap # default value is 512 buffer_size: 512 @@ -299,7 +301,7 @@ service-map-pipeline: # Change to your credentials username: "admin" password: "admin" - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. Comment out the @@ -307,6 +309,7 @@ service-map-pipeline: #aws_sigv4: true #aws_region: us-east-1 ``` +{% include copy.html %} You need to modify the preceding configuration for your OpenSearch cluster so that the configuration matches your environment. Note that it has two `opensearch` sinks that need to be modified. {: .note} @@ -328,7 +331,7 @@ You need to run OpenTelemetry Collector in your service environment. Follow [Get The following is an example `otel-collector-config.yaml` file: -``` +```yaml receivers: jaeger: protocols: @@ -356,6 +359,7 @@ service: processors: [batch/traces] exporters: [otlp/data-prepper] ``` +{% include copy.html %} After you run OpenTelemetry in your service environment, you must configure your application to use the OpenTelemetry Collector. The OpenTelemetry Collector typically runs alongside your application. 
diff --git a/_data-prepper/getting-started.md b/_data-prepper/getting-started.md index 92a38adafed..533fff8a3c4 100644 --- a/_data-prepper/getting-started.md +++ b/_data-prepper/getting-started.md @@ -19,7 +19,7 @@ There are two ways to install Data Prepper: you can run the Docker image or buil The easiest way to use Data Prepper is by running the Docker image. We suggest that you use this approach if you have [Docker](https://www.docker.com) available. Run the following command: -``` +```bash docker pull opensearchproject/data-prepper:latest ``` {% include copy.html %} @@ -36,27 +36,30 @@ Two configuration files are required to run a Data Prepper instance. Optionally, For Data Prepper versions earlier than 2.0, the `.jar` file expects the pipeline configuration file path to be followed by the server configuration file path. See the following configuration path example: -``` +```bash java -jar data-prepper-core-$VERSION.jar pipelines.yaml data-prepper-config.yaml ``` +{% include copy.html %} Optionally, you can add `"-Dlog4j.configurationFile=config/log4j2.properties"` to the command to pass a custom Log4j 2 configuration file. If you don't provide a properties file, Data Prepper defaults to the `log4j2.properties` file in the `shared-config` directory. Starting with Data Prepper 2.0, you can launch Data Prepper by using the following `data-prepper` script that does not require any additional command line arguments: -``` +```bash bin/data-prepper ``` +{% include copy.html %} Configuration files are read from specific subdirectories in the application's home directory: 1. `pipelines/`: Used for pipeline configurations. Pipeline configurations can be written in one or more YAML files. 2. `config/data-prepper-config.yaml`: Used for the Data Prepper server configuration. You can supply your own pipeline configuration file path followed by the server configuration file path. However, this method will not be supported in a future release. See the following example: -``` +```bash bin/data-prepper pipelines.yaml data-prepper-config.yaml ``` +{% include copy.html %} The Log4j 2 configuration file is read from the `config/log4j2.properties` file located in the application's home directory. @@ -69,7 +72,7 @@ To configure Data Prepper, see the following information for each use case: Create a Data Prepper pipeline file named `pipelines.yaml` using the following configuration: -```yml +```yaml simple-sample-pipeline: workers: 2 delay: "5000" @@ -96,7 +99,7 @@ The example pipeline configuration above demonstrates a simple pipeline with a s After starting Data Prepper, you should see log output and some UUIDs after a few seconds: -```yml +```text 2021-09-30T20:19:44,147 [main] INFO com.amazon.dataprepper.pipeline.server.DataPrepperServer - Data Prepper server running at :4900 2021-09-30T20:19:44,681 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer 2021-09-30T20:19:45,183 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer @@ -120,21 +123,21 @@ image and modify both the `pipelines.yaml` and `data-prepper-config.yaml` files. 
For Data Prepper 2.0 or later, use this command: -``` +```bash docker run --name data-prepper -p 4900:4900 -v ${PWD}/pipelines.yaml:/usr/share/data-prepper/pipelines/pipelines.yaml -v ${PWD}/data-prepper-config.yaml:/usr/share/data-prepper/config/data-prepper-config.yaml opensearchproject/data-prepper:latest ``` {% include copy.html %} For Data Prepper versions earlier than 2.0, use this command: -``` +```bash docker run --name data-prepper -p 4900:4900 -v ${PWD}/pipelines.yaml:/usr/share/data-prepper/pipelines.yaml -v ${PWD}/data-prepper-config.yaml:/usr/share/data-prepper/data-prepper-config.yaml opensearchproject/data-prepper:1.x ``` {% include copy.html %} Once Data Prepper is running, it processes data until it is shut down. Once you are done, shut it down with the following command: -``` +```bash POST /shutdown ``` {% include copy-curl.html %} diff --git a/_data-prepper/pipelines/configuration/processors/aggregate.md b/_data-prepper/pipelines/configuration/processors/aggregate.md index 781ce61a3fa..2296ed58c59 100644 --- a/_data-prepper/pipelines/configuration/processors/aggregate.md +++ b/_data-prepper/pipelines/configuration/processors/aggregate.md @@ -38,7 +38,7 @@ The `remove_duplicates` action processes the first event for a group immediately The `put_all` action combines events belonging to the same group by overwriting existing keys and adding new keys, similarly to the Java `Map.putAll`. The action drops all events that make up the combined event. For example, when using `identification_keys: ["sourceIp", "destination_ip"]`, the `put_all` action processes the following three events: -``` +```json { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200 } { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 } { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "http_verb": "GET" } @@ -46,7 +46,7 @@ The `put_all` action combines events belonging to the same group by overwriting Then the action combines the events into one. 
The pipeline then uses the following combined event: -``` +```json { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "status": 200, "bytes": 1000, "http_verb": "GET" } ``` @@ -93,7 +93,7 @@ You can customize the processor with the following configuration options: For example, when using `identification_keys: ["sourceIp", "destination_ip", "request"]`, `key: latency`, and `buckets: [0.0, 0.25, 0.5]`, the `histogram` action processes the following events: -``` +```json { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.2 } { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.55} { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "request" : "/index.html", "latency": 0.25 } @@ -139,7 +139,7 @@ You can set the percentage of events using the `percent` configuration, which in For example, if percent is set to `50`, the action tries to process the following events in the one-second interval: -``` +```json { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 2500 } { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 500 } { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 1000 } @@ -148,7 +148,7 @@ For example, if percent is set to `50`, the action tries to process the followin The pipeline processes 50% of the events, drops the other events, and does not generate a new event: -``` +```json { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 500 } { "sourceIp": "127.0.0.1", "destinationIp": "192.168.0.1", "bytes": 3100 } ``` diff --git a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md index 0b35e8387c6..8bbeeb3ead9 100644 --- a/_data-prepper/pipelines/configuration/processors/anomaly-detector.md +++ b/_data-prepper/pipelines/configuration/processors/anomaly-detector.md @@ -64,13 +64,14 @@ To get started, create the following `pipeline.yaml` file. You can use the follo ad-pipeline: source: ... - .... + .... processor: - anomaly_detector: keys: ["latency"] - mode: + mode: random_cut_forest: ``` +{% include copy.html %} When you run the `anomaly_detector` processor, the processor extracts the value for the `latency` key and then passes the value through the RCF ML algorithm. You can configure any key that comprises integers or real numbers as values. In the following example, you can configure `bytes` or `latency` as the key for an anomaly detector. diff --git a/_data-prepper/pipelines/configuration/processors/aws-lambda.md b/_data-prepper/pipelines/configuration/processors/aws-lambda.md index 65a2f0a1855..77cf6f05159 100644 --- a/_data-prepper/pipelines/configuration/processors/aws-lambda.md +++ b/_data-prepper/pipelines/configuration/processors/aws-lambda.md @@ -42,7 +42,7 @@ Field | Type | Required | Description #### Example configuration -``` +```yaml processors: - aws_lambda: function_name: "my-lambda-function" @@ -62,7 +62,6 @@ processors: maximum_size: "5mb" event_collect_timeout: PT10S lambda_when: "event['status'] == 'process'" - ``` {% include copy.html %} @@ -98,7 +97,7 @@ Note the following limitations: Integration tests for this plugin are executed separately from the main Data Prepper build process. 
Use the following Gradle command to run these tests: -``` +```bash ./gradlew :data-prepper-plugins:aws-lambda:integrationTest -Dtests.processor.lambda.region="us-east-1" -Dtests.processor.lambda.functionName="lambda_test_function" -Dtests.processor.lambda.sts_role_arn="arn:aws:iam::123456789012:role/dataprepper-role ``` {% include copy.html %} diff --git a/_data-prepper/pipelines/configuration/processors/csv.md b/_data-prepper/pipelines/configuration/processors/csv.md index fb9fc6f9d6e..1b6f5b08ec9 100644 --- a/_data-prepper/pipelines/configuration/processors/csv.md +++ b/_data-prepper/pipelines/configuration/processors/csv.md @@ -48,9 +48,10 @@ csv-pipeline: When run, the processor will parse the message. Although only two column names are specified in processor settings, a third column name is automatically generated because the data contained in `ingest.csv` includes three columns, `1,2,3`: -``` +```json {"message": "1,2,3", "col1": "1", "col2": "2", "column3": "3"} ``` + ### Automatically detect column names The following configuration automatically detects the header of a CSV file ingested through an [`s3 source`]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/configuration/sources/s3/): @@ -80,7 +81,7 @@ csv-s3-pipeline: For example, if the `ingest.csv` file in the Amazon Simple Storage Service (Amazon S3) bucket that the Amazon Simple Queue Service (SQS) queue is attached to contains the following data: -``` +```text Should,skip,this,line a,b,c 1,2,3 diff --git a/_data-prepper/pipelines/configuration/processors/date.md b/_data-prepper/pipelines/configuration/processors/date.md index 1b442618ea6..559604c5301 100644 --- a/_data-prepper/pipelines/configuration/processors/date.md +++ b/_data-prepper/pipelines/configuration/processors/date.md @@ -66,6 +66,7 @@ The following `date` processor configuration can be used to add a default timest from_time_received: true destination: "@timestamp" ``` +{% include copy.html %} ## Example: Parse a timestamp to convert its format and time zone The following `date` processor configuration can be used to parse the value of the timestamp applied to `dd/MMM/yyyy:HH:mm:ss` and write it in `yyyy-MM-dd'T'HH:mm:ss.SSSXXX` format: @@ -74,10 +75,11 @@ The following `date` processor configuration can be used to parse the value of t - date: match: - key: timestamp - patterns: ["dd/MMM/yyyy:HH:mm:ss"] + patterns: ["dd/MMM/yyyy:HH:mm:ss"] destination: "@timestamp" output_format: "yyyy-MM-dd'T'HH:mm:ss.SSSXXX" source_timezone: "America/Los_Angeles" destination_timezone: "America/Chicago" locale: "en_US" ``` +{% include copy.html %} diff --git a/_data-prepper/pipelines/configuration/processors/decompress.md b/_data-prepper/pipelines/configuration/processors/decompress.md index 1dc44222bf4..030e8733bb3 100644 --- a/_data-prepper/pipelines/configuration/processors/decompress.md +++ b/_data-prepper/pipelines/configuration/processors/decompress.md @@ -30,6 +30,7 @@ processor: keys: [ "base_64_gzip_key" ] type: gzip ``` +{% include copy.html %} ## Metrics diff --git a/_data-prepper/pipelines/configuration/processors/delay.md b/_data-prepper/pipelines/configuration/processors/delay.md index a2b80bbace1..c7e716d85f9 100644 --- a/_data-prepper/pipelines/configuration/processors/delay.md +++ b/_data-prepper/pipelines/configuration/processors/delay.md @@ -25,3 +25,4 @@ processor: - delay: for: 2s ``` +{% include copy.html %} diff --git a/_data-prepper/pipelines/configuration/processors/dissect.md b/_data-prepper/pipelines/configuration/processors/dissect.md index 
c0a776c6b2e..22cbb792586 100644 --- a/_data-prepper/pipelines/configuration/processors/dissect.md +++ b/_data-prepper/pipelines/configuration/processors/dissect.md @@ -28,10 +28,11 @@ dissect-pipeline: sink: - stdout: ``` +{% include copy.html %} Then create the following file named `logs_json.log` and replace the `path` in the file source of your `pipeline.yaml` file with the path of a file containing the following JSON data: -``` +```json {"log": "07-25-2023 10:00:00 ERROR: error message"} ``` @@ -39,7 +40,7 @@ The `dissect` processor will retrieve the fields (`Date`, `Time`, `Log_Type`, an After running the pipeline, you should receive the following standard output: -``` +```json { "log" : "07-25-2023 10:00:00 ERROR: Some error", "Date" : "07-25-2023" diff --git a/_data-prepper/pipelines/configuration/processors/flatten.md b/_data-prepper/pipelines/configuration/processors/flatten.md index e3c589d63a0..ddb3fe0dfc5 100644 --- a/_data-prepper/pipelines/configuration/processors/flatten.md +++ b/_data-prepper/pipelines/configuration/processors/flatten.md @@ -84,6 +84,7 @@ Use the `remove_processed_fields` option when flattening all of an event's neste remove_processed_fields: true ... ``` +{% include copy.html %} For example, when the input event contains the following nested objects: @@ -140,6 +141,7 @@ Use the `exclude_keys` option to prevent specific keys from being flattened in t exclude_keys: ["key2"] ... ``` +{% include copy.html %} For example, when the input event contains the following nested objects: @@ -199,6 +201,7 @@ Use the `remove_list_indices` option to convert the fields from the source map i remove_list_indices: true ... ``` +{% include copy.html %} For example, when the input event contains the following nested objects: diff --git a/_data-prepper/pipelines/configuration/processors/geoip.md b/_data-prepper/pipelines/configuration/processors/geoip.md index dcc4e9fa8e3..125101b5f5e 100644 --- a/_data-prepper/pipelines/configuration/processors/geoip.md +++ b/_data-prepper/pipelines/configuration/processors/geoip.md @@ -21,17 +21,18 @@ The minimal configuration requires at least one entry, and each entry at least o The following configuration extracts all available geolocation data from the IP address provided in the field named `clientip`. 
It will write the geolocation data to a new field named `geo`, the default source when none is configured: -``` +```yaml my-pipeline: processor: - geoip: entries: - source: clientip ``` +{% include copy.html %} The following example excludes Autonomous System Number (ASN) fields and puts the geolocation data into a field named `clientlocation`: -``` +```yaml my-pipeline: processor: - geoip: @@ -40,6 +41,7 @@ my-pipeline: target: clientlocation include_fields: [asn, asn_organization, network] ``` +{% include copy.html %} ## Configuration From cf148e33fb42db2eb5697f0c38b1ccfaefefb29f Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:32:55 -0400 Subject: [PATCH 09/34] Add Polish and Ukranian analyzer documentation (#11469) * Add Polish and Ukranian analyzer documentation Signed-off-by: Fanit Kolchina * Apply suggestions from code review Signed-off-by: Nathan Bower --------- Signed-off-by: Fanit Kolchina Signed-off-by: Nathan Bower Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- _analyzers/language-analyzers/index.md | 2 +- _analyzers/language-analyzers/polish.md | 126 ++++++++++++++++++ _analyzers/language-analyzers/ukrainian.md | 78 +++++++++++ .../additional-plugins/index.md | 4 +- 4 files changed, 207 insertions(+), 3 deletions(-) create mode 100644 _analyzers/language-analyzers/polish.md create mode 100644 _analyzers/language-analyzers/ukrainian.md diff --git a/_analyzers/language-analyzers/index.md b/_analyzers/language-analyzers/index.md index cc53c1cdac8..d4c7d4a204a 100644 --- a/_analyzers/language-analyzers/index.md +++ b/_analyzers/language-analyzers/index.md @@ -13,7 +13,7 @@ redirect_from: # Language analyzers OpenSearch supports the following language analyzers: -`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, and `turkish`. +`arabic`, `armenian`, `basque`, `bengali`, `brazilian`, `bulgarian`, `catalan`, `czech`, `danish`, `dutch`, `english`, `estonian`, `finnish`, `french`, `galician`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `lithuanian`, `norwegian`, `persian`, [`polish`]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/polish/) (requires plugin), `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `thai`, `turkish`, and [`ukrainian`]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/ukrainian/) (requires plugin). To use an analyzer when you map an index, specify the value in your query. For example, to map your index with the French language analyzer, specify the `french` value in the analyzer field: diff --git a/_analyzers/language-analyzers/polish.md b/_analyzers/language-analyzers/polish.md new file mode 100644 index 00000000000..cda76b95a0c --- /dev/null +++ b/_analyzers/language-analyzers/polish.md @@ -0,0 +1,126 @@ +--- +layout: default +title: Polish +nav_order: 255 +parent: Language analyzers +grand_parent: Analyzers +--- + +# Polish analyzer + +The Polish language analyzer (`polish`) provides analysis for Polish text. This analyzer is part of the `analysis-stempel` plugin, which must be installed before use. 
+ +## Installing the plugin + +Before you can use the Polish analyzer, you must install the `analysis-stempel` plugin by running the following command: + +```bash +./bin/opensearch-plugin install analysis-stempel +``` +{% include copy.html %} + +For more information, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/): Complete list of available OpenSearch plugins. + +## Using the Polish analyzer + +To use the Polish analyzer when you map an index, specify the `polish` value in the analyzer field: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "polish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Configuring a custom Polish analyzer + +You can configure a custom Polish analyzer by creating a custom analyzer that uses the Polish stemmer token filter. The default Polish analyzer applies the following analysis chain: + +1. **Tokenizer**: `standard` +2. **Token filters**: + - `lowercase` + - `polish_stop` (removes Polish stop words) + - `polish_stem` (applies Polish stemming) + +### Example: Custom Polish analyzer + +```json +PUT my-polish-index +{ + "settings": { + "analysis": { + "analyzer": { + "custom_polish": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "polish_stop", + "polish_stem" + ] + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "analyzer": "custom_polish" + }, + "content": { + "type": "text", + "analyzer": "polish" + } + } + } +} +``` +{% include copy-curl.html %} + +## Polish token filters + +The `analysis-stempel` plugin provides the following token filters for Polish language processing. + +### `polish_stop` token filter + +Removes common Polish stop words from the token stream. + +### `polish_stem` token filter + +Applies Polish-specific stemming rules to reduce words to their root forms using the Stempel stemming algorithm. + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST _analyze +{ + "analyzer": "polish", + "text": "Jestem programistą w Polsce i pracuję z OpenSearch" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "jest", "start_offset": 0, "end_offset": 6, "type": "", "position": 0}, + {"token": "prograć", "start_offset": 7, "end_offset": 18, "type": "", "position": 1}, + {"token": "polsce", "start_offset": 21, "end_offset": 27, "type": "", "position": 3}, + {"token": "pracować", "start_offset": 30, "end_offset": 37, "type": "", "position": 5}, + {"token": "opensearch", "start_offset": 40, "end_offset": 50, "type": "", "position": 7} + ] +} +``` \ No newline at end of file diff --git a/_analyzers/language-analyzers/ukrainian.md b/_analyzers/language-analyzers/ukrainian.md new file mode 100644 index 00000000000..a3261244329 --- /dev/null +++ b/_analyzers/language-analyzers/ukrainian.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Ukrainian +nav_order: 340 +parent: Language analyzers +grand_parent: Analyzers +--- + +# Ukrainian analyzer + +The Ukrainian language analyzer (`ukrainian`) provides analysis for Ukrainian text. This analyzer is part of the `analysis-ukrainian` plugin, which must be installed before use. 
+ +## Installing the plugin + +Before you can use the Ukrainian analyzer, you must install the `analysis-ukrainian` plugin by running the following command: + +```bash +./bin/opensearch-plugin install analysis-ukrainian +``` +{% include copy.html %} + +For more information, see [Additional plugins]({{site.url}}{{site.baseurl}}/install-and-configure/additional-plugins/): Complete list of available OpenSearch plugins. + +## Using the Ukrainian analyzer + +To use the Ukrainian analyzer when you map an index, specify the `ukrainian` value in the analyzer field: + +```json +PUT my-index +{ + "mappings": { + "properties": { + "content": { + "type": "text", + "analyzer": "ukrainian" + } + } + } +} +``` +{% include copy-curl.html %} + +## Ukrainian language processing + +The Ukrainian analyzer processes text using the following approach: + +1. **Tokenization**: Splits text into individual words. +2. **Stop word removal**: Removes common Ukrainian stop words like "і", "в", "з", "для", "та", and so on. +3. **Morphological analysis**: Generates various word forms and stems for Ukrainian words. +4. **Case normalization**: Handles Ukrainian text appropriately. + +The Ukrainian analyzer uses sophisticated morphological analysis that can generate multiple forms of words to improve search recall. Unlike some other language analyzers, the Ukrainian plugin does not expose individual token filters for custom configuration. + +## Generated tokens + +Use the following request to examine the tokens generated using the analyzer: + +```json +POST _analyze +{ + "analyzer": "ukrainian", + "text": "Я програміст і працюю з OpenSearch в Україні" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "програміст", "start_offset": 2, "end_offset": 12, "type": "", "position": 1}, + {"token": "працювати", "start_offset": 15, "end_offset": 21, "type": "", "position": 3}, + {"token": "opensearch", "start_offset": 24, "end_offset": 34, "type": "", "position": 5}, + {"token": "Україна", "start_offset": 37, "end_offset": 44, "type": "", "position": 7} + ] +} +``` \ No newline at end of file diff --git a/_install-and-configure/additional-plugins/index.md b/_install-and-configure/additional-plugins/index.md index a4c87ff3ff8..d6e1eee384e 100644 --- a/_install-and-configure/additional-plugins/index.md +++ b/_install-and-configure/additional-plugins/index.md @@ -19,8 +19,8 @@ There are many more plugins available in addition to those provided by the stand | [`analysis-phonenumber`]({{site.url}}{{site.baseurl}}/analyzers/supported-analyzers/phone-analyzers/) | 2.18.0 | | `analysis-phonetic` | 1.0.0 | | `analysis-smartcn` | 1.0.0 | -| `analysis-stempel` | 1.0.0 | -| `analysis-ukrainian` | 1.0.0 | +| [`analysis-stempel`]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/polish/) | 1.0.0 | +| [`analysis-ukrainian`]({{site.url}}{{site.baseurl}}/analyzers/language-analyzers/ukrainian/) | 1.0.0 | | `discovery-azure-classic` | 1.0.0 | | `discovery-ec2` | 1.0.0 | | `discovery-gce` | 1.0.0 | From 4734a57ed315260e79567304302471b474f228ee Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:38:13 -0400 Subject: [PATCH 10/34] Add missing PPL settings (#11470) * Add missing PPL settings Signed-off-by: Fanit Kolchina * Apply suggestions from code review Signed-off-by: Nathan Bower --------- Signed-off-by: Fanit Kolchina Signed-off-by: Nathan Bower Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- 
_search-plugins/sql/settings.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/_search-plugins/sql/settings.md b/_search-plugins/sql/settings.md index 19d3575407a..64785196a4c 100644 --- a/_search-plugins/sql/settings.md +++ b/_search-plugins/sql/settings.md @@ -77,10 +77,13 @@ Setting | Default | Description `plugins.sql.slowlog` | 2 seconds | Configures the time limit (in seconds) for slow queries. The plugin logs slow queries as `Slow query: elapsed=xxx (ms)` in `opensearch.log`. `plugins.sql.cursor.keep_alive` | 1 minute | Configures how long the cursor context is kept open. Cursor contexts are resource-intensive, so we recommend a low value. `plugins.query.memory_limit` | 85% | Configures the heap memory usage limit for the circuit breaker of the query engine. -`plugins.query.size_limit` | 200 | Sets the default size of index that the query engine fetches from OpenSearch. +`plugins.query.size_limit` | 10000 | Sets the default size of index that the query engine fetches from OpenSearch. `plugins.query.datasources.enabled` | true | Change to `false` to disable support for data sources in the plugin. `plugins.query.field_type_tolerance` | true | If `false`, then an array is reduced to the first non-array value at any nesting level. For example, `[[1, 2], [3, 4]]` will be reduced to `1`. If `true`, then the array is preserved. Default is `true`. `plugins.calcite.enabled` | true | Enables the Apache Calcite query engine, including advanced SQL and PPL capabilities such as subsearches, joins, and lookup operations. +`plugins.calcite.all_join_types.allowed` | false | Enables performance-sensitive join types, like `RIGHT`, `FULL`, and `CROSS` joins. Change to `true` to allow these join operations. +`plugins.ppl.syntax.legacy.preferred` | true | Controls certain PPL syntax behaviors, including default argument values. When `false`, uses newer syntax standards. +`plugins.ppl.values.max.limit` | 0 | Sets the maximum number of unique values that the `VALUES` aggregation function can return. A value of `0` indicates no limit. ## Spark connector settings From 6e4e1f5a34c7f307f02de2c5b9bdfbfa067d5883 Mon Sep 17 00:00:00 2001 From: Peter Zhu Date: Thu, 30 Oct 2025 17:16:44 -0400 Subject: [PATCH 11/34] Bump docs to 3.3.2 version with OS updates only (#11404) * Bump docs to 3.3.2 version with OS updates only Signed-off-by: Peter Zhu * Update 3.3.2 releaseinfo Signed-off-by: Peter Zhu * Update _about/version-history.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Peter Zhu * Update plugin entries Signed-off-by: Peter Zhu --------- Signed-off-by: Peter Zhu Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _about/version-history.md | 3 ++- _config.yml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/_about/version-history.md b/_about/version-history.md index 53e540bca37..18a997ac169 100644 --- a/_about/version-history.md +++ b/_about/version-history.md @@ -9,7 +9,8 @@ permalink: /version-history/ OpenSearch version | Release highlights | Release date :--- | :--- | :--- -[3.3.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.1.md) | Fixes backward compatibility handling of date fields while maintaining performance optimizations. 
The `skip_list` parameter is now automatically set to `true` for new `@timestamp` fields created since 3.3.0, while preserving `skip_list=false` for existing indexes with `@timestamp` or index sort date fields. This approach ensures date histogram aggregation performance benefits for new indexes while maintaining compatibility with existing workloads. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.1.md). | 21 October 2025 +[3.3.2](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.2.md) | Includes maintenance changes and bug fixes for the OpenSearch core engine and ML Commons, Neural Search, Skills, k-NN and Security plugins. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.2.md). | 30 October 2025 +[3.3.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.1.md) | Fixes backward compatibility handling of date fields while maintaining performance optimizations. The `skip_list` parameter is now automatically set to `true` for new `@timestamp` fields created since 3.3.0, while preserving `skip_list=false` for existing indexes with `@timestamp` or index sort date fields. This approach ensures date histogram aggregation performance benefits for new indexes while maintaining compatibility with existing workloads. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.1.md). | 22 October 2025 [3.3.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.0.md) | Introduces redesigned Discover interface with log analytics and distributed tracing capabilities, and Apache Calcite as default PPL query engine with expanded functions. Makes agentic search and agentic memory APIs generally available for AI applications. Implements Seismic algorithm for neural sparse search performance improvements and processor chains for data transformation pipelines. Expands gRPC support for additional query types and adds experimental streaming with Apache Arrow Flight. Includes workload management enhancements with rule-based auto-tagging and query monitoring capabilities. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.3.0.md). | 14 October 2025 [3.2.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.2.0.md) | Updates Search Relevance Workbench. Makes gRPC APIs generally available. Introduces derived source, updates workload management, semantic field, and star tree functionality. Adds experimental Agentic Memory APIs and Job Scheduler APIs. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.2.0.md). | 19 August 2025 [3.1.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.1.0.md) | Makes GPU acceleration for vector index builds generally available. 
Introduces memory-optimized search for Faiss indexes using Lucene HNSW, semantic field type for streamlined semantic search, and Search Relevance Workbench for search quality optimization. Makes star-tree indexes generally available with support for comprehensive query types. Enhances observability with ML Commons metrics integration, custom index support for OpenTelemetry data, and new PPL commands for JSON manipulation. Improves agent management with Update Agent API and persistent MCP tools. Includes security enhancements with immutable user objects and new resource sharing framework. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.1.0.md). | 24 June 2025 diff --git a/_config.yml b/_config.yml index 7e5e6b7155d..122b2ce6763 100644 --- a/_config.yml +++ b/_config.yml @@ -6,7 +6,7 @@ latesturl: "/latest" # The subpath of the latest version. Used for non-version-s url: "https://docs.opensearch.org" # the base hostname & protocol for your site, e.g. http://example.com permalink: /:path/ -opensearch_version: '3.3.1' +opensearch_version: '3.3.2' opensearch_dashboards_version: '3.3.0' opensearch_major_minor_version: '3.3' lucene_version: '10_3_1' From 5e249880ac9cf1c6a819c895b37cefd714d2a0d8 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 4 Nov 2025 15:27:48 +0000 Subject: [PATCH 12/34] adding examples to http source of data prepper (#11347) * adding examples to http source of data prepper Signed-off-by: Anton Rubin * Update http.md Signed-off-by: AntonEliatra * Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Apply suggestions from code review Signed-off-by: Nathan Bower --------- Signed-off-by: Anton Rubin Signed-off-by: AntonEliatra Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- .../pipelines/configuration/sources/http.md | 108 ++++++++++++++++-- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/_data-prepper/pipelines/configuration/sources/http.md b/_data-prepper/pipelines/configuration/sources/http.md index 870261f410b..5f8f1856404 100644 --- a/_data-prepper/pipelines/configuration/sources/http.md +++ b/_data-prepper/pipelines/configuration/sources/http.md @@ -24,10 +24,10 @@ max_connection_count | No | Integer | The maximum allowed number of open connect max_pending_requests | No | Integer | The maximum allowed number of tasks in the `ScheduledThreadPool` work queue. Default value is `1024`. max_request_length | No | ByteCount | The maximum number of bytes allowed in the payload of a single HTTP request. Default value is `10mb`. authentication | No | Object | An authentication configuration. By default, this creates an unauthenticated server for the pipeline. This uses pluggable authentication for HTTPS. To use basic authentication define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [ArmeriaHttpAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/1.2.0/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/ArmeriaHttpAuthenticationProvider.java). -ssl | No | Boolean | Enables TLS/SSL. Default value is false. 
-ssl_certificate_file | Conditionally | String | SSL certificate chain file path or Amazon Simple Storage Service (Amazon S3) path. Amazon S3 path example `s3:///`. Required if `ssl` is set to true and `use_acm_certificate_for_ssl` is set to false. -ssl_key_file | Conditionally | String | SSL key file path or Amazon S3 path. Amazon S3 path example `s3:///`. Required if `ssl` is set to true and `use_acm_certificate_for_ssl` is set to false. -use_acm_certificate_for_ssl | No | Boolean | Enables a TLS/SSL using certificate and private key from AWS Certificate Manager (ACM). Default value is false. +ssl | No | Boolean | Enables TLS/SSL. Default value is `false`. +ssl_certificate_file | Conditionally | String | The SSL certificate chain file path or Amazon Simple Storage Service (Amazon S3) path (for example, `s3:///`). Required if `ssl` is set to `true` and `use_acm_certificate_for_ssl` is set to `false`. +ssl_key_file | Conditionally | String | The SSL key file path or Amazon S3 path (for example, `s3:///`). Required if `ssl` is set to `true` and `use_acm_certificate_for_ssl` is set to `false`. +use_acm_certificate_for_ssl | No | Boolean | Enables TLS/SSL using the certificate and private key from AWS Certificate Manager (ACM). Default is `false`. acm_certificate_arn | Conditionally | String | The ACM certificate Amazon Resource Name (ARN). The ACM certificate takes preference over Amazon S3 or a local file system certificate. Required if `use_acm_certificate_for_ssl` is set to true. acm_private_key_password | No | String | ACM private key password that decrypts the private key. If not provided, Data Prepper generates a random password. acm_certificate_timeout_millis | No | Integer | Timeout, in milliseconds, that ACM takes to get certificates. Default value is 120000. @@ -43,12 +43,106 @@ Clients should send HTTP `POST` requests to the endpoint `/log/ingest`. The `http` protocol only supports the JSON UTF-8 codec for incoming requests, for example, `[{"key1": "value1"}, {"key2": "value2"}]`. -#### Example: Ingest data with cURL +## Example -The following cURL command can be used to ingest data: +The following examples demonstrate different configurations that can be used with the `http` source. 
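Note: the examples added below all use `ssl: false`. As a minimal sketch of a TLS-enabled variant that combines the `ssl`, `ssl_certificate_file`, and `ssl_key_file` options from the table above (the certificate and key paths are hypothetical placeholders), the source could be configured roughly as follows:

```yaml
tls-intake-pipeline:
  source:
    http:
      port: 2021
      path: /logs
      ssl: true
      # Hypothetical certificate and key locations; substitute real paths or S3 paths.
      ssl_certificate_file: /usr/share/data-prepper/certs/http-source.crt
      ssl_key_file: /usr/share/data-prepper/certs/http-source.key
  sink:
    - stdout: {}
```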
+### Minimal HTTP source + +The following is the minimal configuration using all default values: + +```yaml +minimal-http-pipeline: + source: + http: + sink: + - stdout: {} +``` +{% include copy.html %} + +You can test this pipeline using the following command: + +```bash +curl -s "http://localhost:2021/log/ingest" \ + -H "Content-Type: application/json" \ + --data '[{"msg":"one"},{"msg":"two"}]' +``` +{% include copy.html %} + +You should see the following output in the Data Prepper logs: + +``` +{"msg":"one"} +{"msg":"two"} +``` + +### Custom path using the pipeline name and health check + +The following example uses a custom path, configures a custom port, and enables health checks: + +```yaml +audit-pipeline: + source: + http: + port: 2022 + path: "/${pipelineName}/logs" # -> /audit-pipeline/logs + health_check_service: true + unauthenticated_health_check: true + sink: + - stdout: {} ``` -curl "http://localhost:2021/log/ingest" --data '[{"key1": "value1"}, {"key2": "value2"}]' +{% include copy.html %} + +You can use the following command to check the pipeline health: + +```bash +curl -s "http://localhost:2022/health" +``` +{% include copy.html %} + +You can ingest data using the following command: + +```bash +curl -s "http://localhost:2022/audit-pipeline/logs" \ + -H "Content-Type: application/json" \ + --data '[{"event":"login","user":"alice"}]' +``` +{% include copy.html %} + +### Basic authentication on the source + +The following example configures a custom port and path, enables health checks, and configures basic authentication: + +```yaml +secure-intake-pipeline: + source: + http: + port: 2023 + path: /ingest + authentication: + http_basic: + username: ingest + password: s3cr3t + health_check_service: true + unauthenticated_health_check: true + sink: + - stdout: {} + - opensearch: + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_password + index_type: custom + index: demo-%{yyyy.MM.dd} +``` +{% include copy.html %} + +You can test this pipeline using the following command: + +```bash +curl -s -u ingest:s3cr3t "http://localhost:2023/ingest" \ + -H "Content-Type: application/json" \ + --data '[{"service":"web","status":"ok"}]' ``` {% include copy.html %} From 1eb2d41e129b10610d57d705a5ac525877261612 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 4 Nov 2025 15:31:06 +0000 Subject: [PATCH 13/34] adding file source page (#11355) * adding file source page Signed-off-by: Anton Rubin * fixing valke errors Signed-off-by: Anton Rubin * Update file.md Signed-off-by: AntonEliatra * Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/sources/file.md Signed-off-by: Nathan Bower --------- Signed-off-by: Anton Rubin Signed-off-by: AntonEliatra Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- .../pipelines/configuration/sources/file.md | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 _data-prepper/pipelines/configuration/sources/file.md diff --git a/_data-prepper/pipelines/configuration/sources/file.md b/_data-prepper/pipelines/configuration/sources/file.md new file mode 100644 index 00000000000..0f9a2d5ec58 --- /dev/null +++ b/_data-prepper/pipelines/configuration/sources/file.md @@ -0,0 +1,92 @@ +--- +layout: default +title: 
File +parent: Sources +grand_parent: Pipelines +nav_order: 24 +--- + +# File source + +The `file` plugin reads events from a local file once when the pipeline starts. It's useful for loading seed data, testing processors and sinks, or replaying a fixed dataset. This source *does not monitor* the file for new lines after startup. + +Option | Required | Type | Description +:--- | :--- | :--- | :--- +`path` | Yes | String | An absolute path to the input file inside the Data Prepper container, for example, `/usr/share/data-prepper/data/input.jsonl`. +`format` | No | String | Specifies how to interpret the file content. Valid values are `json` and `plain`. Use `json` when your file has one JSON object per line or a JSON array. Use `plain` for raw text lines. Default is `plain`. +`record_type` | No | String | The type of output record produced by the source. Valid values are `event` and `string`. Use `event` to produce structured events expected by downstream processors and the OpenSearch sink. Default is `string`. + +### Example + +The following examples demonstrate how different file types can be processed. + +### JSON file + +The following example processes a JSON file: + +```yaml +file-to-opensearch: + source: + file: + path: /usr/share/data-prepper/data/input.ndjson + format: json + record_type: event + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + index: file-demo + username: admin + password: admin_pass + insecure: true +``` +{% include copy.html %} + +### Plain text file + +A raw text file can be processed using the following pipeline: + +```yaml +plain-file-to-opensearch: + source: + file: + path: /usr/share/data-prepper/data/app.log + format: plain + record_type: event + processor: + - grok: + match: + message: + - '%{TIMESTAMP_ISO8601:timestamp} \[%{LOGLEVEL:level}\] %{GREEDYDATA:msg}' + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + index: plain-file-demo + username: admin + password: admin_pass + insecure: true +``` +{% include copy.html %} + +### CSV file + +You can process a CSV file using the `csv` processor: + +```yaml +csv-file-to-opensearch: + source: + file: + path: /usr/share/data-prepper/data/ingest.csv + format: plain + record_type: event + processor: + - csv: + column_names: ["time","level","message"] + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + index: csv-demo + username: admin + password: admin_pass + insecure: true +``` +{% include copy.html %} From b8d3d5d5bf683c31d0033414a39d0955f27fca0f Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:32:52 -0500 Subject: [PATCH 14/34] Add explain filtering functionality for ISM docs (#11462) * Add explain filtering functionality for ISM docs Signed-off-by: Fanit Kolchina * Update _im-plugin/ism/api.md Co-authored-by: bowenlan-amzn Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Make response hidden Signed-off-by: Fanit Kolchina * Update _im-plugin/ism/api.md Signed-off-by: Nathan Bower --------- Signed-off-by: Fanit Kolchina Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: bowenlan-amzn Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- _im-plugin/ism/api.md | 186 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/_im-plugin/ism/api.md b/_im-plugin/ism/api.md index 8bcc4b79865..f58e8adcd6b 100644 --- a/_im-plugin/ism/api.md +++ b/_im-plugin/ism/api.md @@ -653,6 +653,192 @@ 
GET _plugins/_ism/explain/index_1?show_policy=true The `plugins.index_state_management.policy_id` setting is deprecated starting from ODFE version 1.13.0. We retain this field in the response API for consistency. +## Explain index with filtering +**Introduced 2.12** +{: .label .label-purple } + +You can use the `POST` method with the Explain API to filter the results based on specific criteria. This allows you to query indexes based on their policy ID, current state, or action type. + +#### Endpoints + +``` +POST _plugins/_ism/explain/ +``` + +#### Request body + +The request body supports the following optional filters. + +| Parameter | Type | Description | +|:----------|:-----|:------------| +| `policy_id` | String | Filter results to show only indexes managed by the specified policy ID. | +| `state` | String | Filter results to show only indexes currently in the specified state. | +| `action_type` | String | Filter results to show only indexes currently executing the specified action type. | +| `failed` | Boolean | Filter results to show only failed managed indexes. | +All filters are optional. If a filter is not specified, indexes with any value for that parameter will be included in the results. The API returns only indexes that match all specified filters. + +#### Example request: Filter by policy ID + +```json +POST _plugins/_ism/explain/log-* +{ + "filter": { + "policy_id": "hot-warm-delete-policy" + } +} +``` +{% include copy-curl.html %} + +#### Example request: Filter by state and action type + +```json +POST _plugins/_ism/explain/app-* +{ + "filter": { + "state": "warm", + "action_type": "allocation" + } +} +``` +{% include copy-curl.html %} + +#### Example request: Filter by all criteria + +```json +POST _plugins/_ism/explain/data-* +{ + "filter": { + "policy_id": "data-lifecycle-policy", + "state": "hot", + "action_type": "rollover" + } +} +``` +{% include copy-curl.html %} + +#### Example response + +
+ + Response + + {: .text-delta} + +```json +{ + "test-logs-001": { + "index.plugins.index_state_management.policy_id": "test-lifecycle-policy", + "index": "test-logs-001", + "index_uuid": "LmJgKNatQZWHQu-qIHlcJw", + "policy_id": "test-lifecycle-policy", + "enabled": true, + "policy": { + "policy_id": "test-lifecycle-policy", + "description": "Lifecycle policy for log data: hot -> warm -> cold -> delete", + "last_updated_time": 1730308440926, + "schema_version": 18, + "error_notification": null, + "default_state": "hot", + "states": [ + { + "name": "hot", + "actions": [ + { + "rollover": { + "min_doc_count": 10000, + "min_size": "1gb", + "min_index_age": "1d" + } + } + ], + "transitions": [ + { + "state_name": "warm", + "conditions": { + "min_index_age": "7d" + } + } + ] + }, + { + "name": "warm", + "actions": [ + { + "replica_count": { + "number_of_replicas": 0 + } + } + ], + "transitions": [ + { + "state_name": "cold", + "conditions": { + "min_index_age": "30d" + } + } + ] + }, + { + "name": "cold", + "actions": [], + "transitions": [ + { + "state_name": "delete", + "conditions": { + "min_index_age": "90d" + } + } + ] + }, + { + "name": "delete", + "actions": [ + { + "delete": {} + } + ], + "transitions": [] + } + ], + "ism_template": null + }, + "policy_seq_no": 0, + "policy_primary_term": 1, + "rolled_over": false, + "index_creation_date": 1730308447399, + "state": { + "name": "hot", + "start_time": 1730308447644 + }, + "action": { + "name": "rollover", + "start_time": 1730308447644, + "index": 0, + "failed": false, + "consumed_retries": 0, + "last_retry_time": 0 + }, + "step": { + "name": "attempt_rollover", + "start_time": 1730308447644, + "step_status": "starting" + }, + "retry_info": { + "failed": false, + "consumed_retries": 0 + }, + "info": { + "message": "Currently checking rollover conditions" + }, + "enabled": true, + "enabled_time": 1730308447644 + }, + "total_managed_indices": 1 +} +``` + +
+ --- ## Delete policy From aa299edab8e61570cd43f165f837e2bb5f6ab938 Mon Sep 17 00:00:00 2001 From: Bo Zhang Date: Tue, 4 Nov 2025 07:47:26 -0800 Subject: [PATCH 15/34] Update documentation for arrays that semantic field cannot support it (#11482) * Update documentation for arrays that semantic field cannot support it Signed-off-by: Bo Zhang * Update _mappings/supported-field-types/index.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: Bo Zhang Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _mappings/supported-field-types/index.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/_mappings/supported-field-types/index.md b/_mappings/supported-field-types/index.md index 9f361acf9b1..073d3377315 100644 --- a/_mappings/supported-field-types/index.md +++ b/_mappings/supported-field-types/index.md @@ -125,6 +125,9 @@ PUT testindex1/_doc/2 } ``` +The `semantic` field cannot contain an array of values because it's mapped to an embedding field (`rank_features` or `knn_vector`), which supports only a single vector. +{: .note} + ## Multifields Multifields are used to index the same field differently. Strings are often mapped as `text` for full-text queries and `keyword` for exact-value queries. From 734d8ab45c383fae912f33a3fc70826afa423c47 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 6 Nov 2025 13:58:59 +0000 Subject: [PATCH 16/34] expanding example for split string processor (#11246) * expanding example for split string processor Signed-off-by: Anton Rubin * Update split-string.md Signed-off-by: AntonEliatra --------- Signed-off-by: Anton Rubin Signed-off-by: AntonEliatra Signed-off-by: Arya Soni --- .../configuration/processors/split-string.md | 99 ++++++++++++++++--- 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/_data-prepper/pipelines/configuration/processors/split-string.md b/_data-prepper/pipelines/configuration/processors/split-string.md index a2844f41c91..a141983b427 100644 --- a/_data-prepper/pipelines/configuration/processors/split-string.md +++ b/_data-prepper/pipelines/configuration/processors/split-string.md @@ -29,36 +29,103 @@ source | N/A | N/A | The key to split. delimiter | No | N/A | The separator character responsible for the split. Cannot be defined at the same time as `delimiter_regex`. At least `delimiter` or `delimiter_regex` must be defined. delimiter_regex | No | N/A | The regex string responsible for the split. Cannot be defined at the same time as `delimiter`. At least `delimiter` or `delimiter_regex` must be defined. 
-### Usage +### Example To get started, create the following `pipeline.yaml` file: ```yaml -pipeline: +split-string-all-configs-pipeline: source: - file: - path: "/full/path/to/logs_json.log" - record_type: "event" - format: "json" + http: + path: /logs + ssl: false + processor: - split_string: + # 1) The top-level list of split "entries" entries: - - source: "message" + # 2) Use `source` + `delimiter` (comma) + - source: csv_line delimiter: "," + + # 3) Another `source` + `delimiter` (pipe) + - source: tags + delimiter: "|" + + # 4) `source` + `delimiter` (slash) to split a path + - source: path + delimiter: "/" + + # 5) `source` + `delimiter_regex` (semicolon + optional spaces) + - source: semicolons + delimiter_regex: ";\\s*" + sink: - - stdout: + - opensearch: + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_pass + index_type: custom + index: split-string-demo-%{yyyy.MM.dd} + ``` {% include copy.html %} -Next, create a log file named `logs_json.log`. After that, replace the `path` in the file source of your `pipeline.yaml` file with your file path. For more detailed information, see [Configuring OpenSearch Data Prepper]({{site.url}}{{site.baseurl}}/data-prepper/getting-started/#2-configuring-data-prepper). +You can test the pipeline using the following command: -Before you run Data Prepper, the source appears in the following format: - -```json -{"message": "hello,world"} +```bash +curl -sS -X POST "http://localhost:2021/logs" \ + -H "Content-Type: application/json" \ + -d '[ + { + "csv_line": "x,y", + "tags": "beta|test", + "path": "usr/local/bin", + "semicolons": "alpha;beta ; gamma" + } + ]' ``` -After you run Data Prepper, the source is converted to the following format: +{% include copy.html %} + +The document stored in OpenSearch contains the following information: ```json -{"message":["hello","world"]} -``` \ No newline at end of file +{ + ... 
+ "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "split-string-demo-2025.10.15", + "_id": "YSAz6JkBrcmuDURMmTeo", + "_score": 1, + "_source": { + "csv_line": [ + "x", + "y" + ], + "tags": [ + "beta", + "test" + ], + "path": [ + "usr", + "local", + "bin" + ], + "semicolons": [ + "alpha", + "beta ", + "gamma" + ] + } + } + ] + } +} +``` From f2d1880cf26433571d6a8049b8c937cf8bc2b966 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 6 Nov 2025 14:01:06 +0000 Subject: [PATCH 17/34] Expanding conditional routing example (#11237) * expanding on routes example Signed-off-by: Anton Rubin * expanding on routes example Signed-off-by: Anton Rubin * Update pipelines.md Signed-off-by: AntonEliatra * Update pipelines.md Signed-off-by: AntonEliatra * Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: Anton Rubin Signed-off-by: AntonEliatra Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _data-prepper/pipelines/pipelines.md | 78 +++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/_data-prepper/pipelines/pipelines.md b/_data-prepper/pipelines/pipelines.md index 98e344ff602..973456e6dfa 100644 --- a/_data-prepper/pipelines/pipelines.md +++ b/_data-prepper/pipelines/pipelines.md @@ -65,31 +65,83 @@ If a pipeline component fails to process and send an event, then the source rece Pipelines also support conditional routing, which enables the routing of events to different sinks based on specific conditions. To add conditional routing, specify a list of named routes using the `route` component and assign specific routes to sinks using the `routes` property. Any sink with the `routes` property only accepts events matching at least one of the routing conditions. -In the following example pipeline, `application-logs` is a named route with a condition set to `/log_type == "application"`. The route uses [Data Prepper expressions](https://github.com/opensearch-project/data-prepper/tree/main/examples) to define the condition. Data Prepper routes events satisfying this condition to the first OpenSearch sink. By default, Data Prepper routes all events to sinks without a defined route, as shown in the third OpenSearch sink of the given pipeline: +In the following pipeline, routes are defined at the pipeline level under `route`. The route uses [Data Prepper expressions](https://github.com/opensearch-project/data-prepper/tree/main/examples) to define the condition. Two named routes are declared: + +- `errors: /level == "ERROR"` + +- `slow_requests: /latency_ms != null and /latency_ms >= 1000` + +Each OpenSearch sink can opt in to one or more routes using the `routes:` setting. Events that satisfy a route's condition are delivered to the sinks that reference that route. For example, the first sink receives events matching `errors`, and the second sink receives events matching `slow_requests`. + +By default, any sink without a `routes:` list receives all events, regardless of whether they matched other routes. 
In the following example, the third sink has no `routes:` setting, so it receives all events, including those already routed to the first two sinks: ```yml -conditional-routing-sample-pipeline: +routes-demo-pipeline: source: http: - processor: + path: /logs + ssl: false + route: - - application-logs: '/log_type == "application"' - - http-logs: '/log_type == "apache"' + - errors: '/level == "ERROR"' + - slow_requests: '/latency_ms != null and /latency_ms >= 1000' + sink: + # 1) Only events matching the "errors" route - opensearch: - hosts: [ "https://opensearch:9200" ] - index: application_logs - routes: [application-logs] + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_pass + index_type: custom + index: routed-errors-%{yyyy.MM.dd} + routes: [errors] + + # 2) Only events matching the "slow_requests" route - opensearch: - hosts: [ "https://opensearch:9200" ] - index: http_logs - routes: [http-logs] + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_pass + index_type: custom + index: routed-slow-%{yyyy.MM.dd} + routes: [slow_requests] + + # 3) All events - opensearch: - hosts: [ "https://opensearch:9200" ] - index: all_logs + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_pass + index_type: custom + index: routed-other-%{yyyy.MM.dd} ``` {% include copy.html %} +You can test this pipeline using the following command: + +```bash +curl -sS -X POST "http://localhost:2021/logs" \ + -H "Content-Type: application/json" \ + -d '[ + {"level":"ERROR","message":"DB connection failed","latency_ms":120}, + {"level":"INFO","message":"GET /api/items","latency_ms":1500}, + {"level":"INFO","message":"health check ok","latency_ms":42} + ]' +``` +{% include copy.html %} + +The documents are stored in the corresponding indexes: + +``` +health status index uuid pri rep docs.count docs.deleted store.size pri.store.size +... +green open routed-other-2025.10.14 IBZTXO3ySBGky0tIHRaRmg 1 1 3 0 5.4kb 5.4kb +green open routed-slow-2025.10.14 J-hzZ9m8RkWvpMKC_oQLVQ 1 1 1 0 5kb 5kb +green open routed-errors-2025.10.14 v3r7JzPfQVOS8dWOBF1o2w 1 1 1 0 5kb 5kb +... +``` + ## Next steps - See [Common uses cases]({{site.url}}{{site.baseurl}}/data-prepper/common-use-cases/common-use-cases/) for example configurations. From 27a957ba42108d7d9e06c5e52c6a2fb3c7be2d0d Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:01:29 -0500 Subject: [PATCH 18/34] Add field masking search limitation (#11489) * Add field masking search limitation Signed-off-by: Fanit Kolchina * Remove redundancy Signed-off-by: Fanit Kolchina * Update _security/access-control/field-masking.md Signed-off-by: Nathan Bower --------- Signed-off-by: Fanit Kolchina Signed-off-by: Nathan Bower Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- _security/access-control/field-masking.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/_security/access-control/field-masking.md b/_security/access-control/field-masking.md index 0211a509933..1ac89cf9456 100644 --- a/_security/access-control/field-masking.md +++ b/_security/access-control/field-masking.md @@ -12,7 +12,24 @@ redirect_from: If you don't want to remove fields from a document using [field-level security]({{site.url}}{{site.baseurl}}/security/access-control/field-level-security/), you can mask their values. 
Currently, field masking is only available for string-based fields and replaces the field's value with a cryptographic hash. -Field masking works alongside field-level security on the same per-role, per-index basis. You can allow certain roles to see sensitive fields in plain text and mask them for others. A search result with a masked field might look like the following: +Field masking works alongside field-level security on the same per-role, per-index basis. You can allow certain roles to see sensitive fields in plain text and mask them for others. + +## Important limitation: Search functionality + +**Fields with masking applied cannot be searched.** When you apply field masking to a field, you will not be able to search for terms within that field, even if the terms are not masked by your pattern. This occurs because field masking is applied after indexing, while search operations rely on the inverted index created during the indexing process. +{: .warning} + +For example, if you have a field `message` with the value `"User john.doe@example.com accessed the system"` and apply pattern-based masking to hide email addresses, the displayed result might show `"User ***@***.*** accessed the system"`. However, you will not be able to search for `"User"`, `"accessed"`, or `"system"` in this field, even though these terms are not masked. + +### Workarounds + +If you need to maintain search functionality on partially masked fields, consider these alternatives: + +- **Use separate fields**: Split your data into separate fields—one for searchable content and another for sensitive data that needs masking. +- **Index transformation**: Create a separate index with pre-applied masking transformations rather than using dynamic field masking. +- **Field-level security**: Instead of masking, use [field-level security]({{site.url}}{{site.baseurl}}/security/access-control/field-level-security/) to completely hide sensitive fields from unauthorized users. + +A search result with a masked field might appear similar to the following: ```json { From 8ecca3811b5547e08ca729f7b9044e5268e3a04f Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 6 Nov 2025 14:03:30 +0000 Subject: [PATCH 19/34] adding example to write json processor data prepper (#11282) * adding example to write json processor data prepper Signed-off-by: Anton Rubin * Update write-json.md Signed-off-by: AntonEliatra * Update _data-prepper/pipelines/configuration/processors/write-json.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/write-json.md Signed-off-by: Nathan Bower --------- Signed-off-by: Anton Rubin Signed-off-by: AntonEliatra Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- .../configuration/processors/write-json.md | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/_data-prepper/pipelines/configuration/processors/write-json.md b/_data-prepper/pipelines/configuration/processors/write-json.md index eb838f968f1..e56a4a4683a 100644 --- a/_data-prepper/pipelines/configuration/processors/write-json.md +++ b/_data-prepper/pipelines/configuration/processors/write-json.md @@ -23,3 +23,111 @@ Option | Description | Example source | Mandatory field that specifies the name of the field in the event containing the message or object to be parsed. 
| If `source` is set to `"message"` and the input is `{"message": {"key1":"value1", "key2":{"key3":"value3"}}}`, then the `write_json` processor outputs the event as `"{\"key1\":\"value1\",\"key2\":{\"key3\":\"value3\"}}"`. target | An optional field that specifies the name of the field in which the resulting JSON string should be stored. If `target` is not specified, then the `source` field is used. | `key1` +## Example + +The following example uses `write_json` twice, first to copy the `details` object into a new JSON string field named `target` and then to overwrite the original `payload` field when target is omitted: + +```yaml +write-json-demo-pipeline: + source: + http: + path: /logs + ssl: false + + processor: + # 1) Copy the nested "details" object into a JSON string at "details_json" + - write_json: + source: details + target: details_json + + # 2) Overwrite "payload" with its JSON-string representation + - write_json: + # no target -> result stored back into "payload" + source: payload + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_pass + index_type: custom + index: write-json-demo-%{yyyy.MM.dd} +``` +{% include copy.html %} + +You can test this pipeline using the following command: + +```bash +curl -sS -X POST "http://localhost:2021/logs" \ + -H "Content-Type: application/json" \ + -d '[ + { + "message": "order created", + "details": {"order_id": 123, "items": [{"sku": "A1", "qty": 2}], "expedited": true}, + "payload": {"user": {"id": "u-42", "role": "admin"}, "ip": "10.0.0.5"} + }, + { + "message": "order updated", + "details": {"order_id": 124, "items": [{"sku": "B9", "qty": 1}], "expedited": false}, + "payload": {"user": {"id": "u-77", "role": "viewer"}, "ip": "10.0.0.9"} + } + ]' +``` +{% include copy.html %} + +The documents stored in OpenSearch contain the following information: + +```json +{ + ... 
+ "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "write-json-demo-2025.10.15", + "_id" : "YMQA6ZkB1u9wkbZgz8wu", + "_score" : 1.0, + "_source" : { + "message" : "order created", + "details" : { + "order_id" : 123, + "items" : [ + { + "sku" : "A1", + "qty" : 2 + } + ], + "expedited" : true + }, + "payload" : "{\"user\":{\"id\":\"u-42\",\"role\":\"admin\"},\"ip\":\"10.0.0.5\"}", + "details_json" : "{\"order_id\":123,\"items\":[{\"sku\":\"A1\",\"qty\":2}],\"expedited\":true}" + } + }, + { + "_index" : "write-json-demo-2025.10.15", + "_id" : "YcQA6ZkB1u9wkbZgz8wu", + "_score" : 1.0, + "_source" : { + "message" : "order updated", + "details" : { + "order_id" : 124, + "items" : [ + { + "sku" : "B9", + "qty" : 1 + } + ], + "expedited" : false + }, + "payload" : "{\"user\":{\"id\":\"u-77\",\"role\":\"viewer\"},\"ip\":\"10.0.0.9\"}", + "details_json" : "{\"order_id\":124,\"items\":[{\"sku\":\"B9\",\"qty\":1}],\"expedited\":false}" + } + } + ] + } +} +``` From 4ad656e24601760ca85cfc95aed9230186d29c93 Mon Sep 17 00:00:00 2001 From: Luke Cousins Date: Thu, 6 Nov 2025 14:50:52 +0000 Subject: [PATCH 20/34] Correct kubectl commands (#11492) * Correct kubectl commands Signed-off-by: Luke Cousins * Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _install-and-configure/install-opensearch/operator.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: Luke Cousins Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _install-and-configure/install-opensearch/operator.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_install-and-configure/install-opensearch/operator.md b/_install-and-configure/install-opensearch/operator.md index 0633989368a..b31407f6a11 100644 --- a/_install-and-configure/install-opensearch/operator.md +++ b/_install-and-configure/install-opensearch/operator.md @@ -123,9 +123,9 @@ kubectl get crds | grep opensearch ``` {% include copy.html %} -If you deployed the operator using [Helm charts](#use-a-helm-chart), to ensure that Kubernetes recognizes the OpenSearch Kubernetes Operator as a namespace, enter `k get ns | grep opensearch`. Both `opensearch` and `opensearch-operator-system` should appear as `Active`. +If you deployed the operator using [Helm charts](#use-a-helm-chart), verify that Kubernetes recognizes the OpenSearch Kubernetes Operator namespaces by running `kubectl get ns | grep opensearch`. Both `opensearch` and `opensearch-operator-system` should appear as `Active`. -With the operator active, use `k get pod -n opensearch-operator-system` to make sure that the operator's pods are running. +With the operator active, verify that the operator's pods are running by executing `kubectl get pod -n opensearch-operator-system`. All pods should have the `Running` status. 
``` NAME READY STATUS RESTARTS AGE From c2da0e92445d64eefe9337a17c347bbe1dfed9c7 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 6 Nov 2025 12:08:26 -0500 Subject: [PATCH 21/34] Update cluster-settings.md (#11503) Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- .../configuring-opensearch/cluster-settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_install-and-configure/configuring-opensearch/cluster-settings.md b/_install-and-configure/configuring-opensearch/cluster-settings.md index 3e19fd38c5b..55e89a38e4c 100644 --- a/_install-and-configure/configuring-opensearch/cluster-settings.md +++ b/_install-and-configure/configuring-opensearch/cluster-settings.md @@ -145,7 +145,7 @@ OpenSearch supports the following cluster-level shard, block, and task settings: OpenSearch supports the following cluster-level search settings: -- `cluster.search.ignore_awareness_attributes` (Boolean): Controls whether awareness attributes are considered during shard query routing. If `true` (default), the cluster ignores awareness attributes and uses Adaptive Replica Selection (ARS) to choose the optimal shard copy, reducing query response latency. Set this to `false` for routing decisions to prioritize awareness attributes instead of performance-based selection. +- `cluster.search.ignore_awareness_attributes` (Boolean): Controls whether awareness attributes are considered during shard query routing. If `true`, the cluster ignores awareness attributes and uses Adaptive Replica Selection (ARS) to choose the optimal shard copy, reducing query response latency. Set this to `false` for routing decisions to prioritize awareness attributes instead of performance-based selection. Default is `false` (`true` for Amazon OpenSearch Service). ## Cluster-level slow log settings From 4236b7bc6d66dd1d0b0fef1a532d4527e8dc6fde Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 6 Nov 2025 15:19:03 -0500 Subject: [PATCH 22/34] Add 2.19.4 to version history (#11494) * Add 2.19.4 to version history Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _about/version-history.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- _about/version-history.md | 1 + 1 file changed, 1 insertion(+) diff --git a/_about/version-history.md b/_about/version-history.md index 18a997ac169..6ad9bd6f2de 100644 --- a/_about/version-history.md +++ b/_about/version-history.md @@ -15,6 +15,7 @@ OpenSearch version | Release highlights | Release date [3.2.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.2.0.md) | Updates Search Relevance Workbench. Makes gRPC APIs generally available. Introduces derived source, updates workload management, semantic field, and star tree functionality. Adds experimental Agentic Memory APIs and Job Scheduler APIs. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.2.0.md). | 19 August 2025 [3.1.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.1.0.md) | Makes GPU acceleration for vector index builds generally available. 
Introduces memory-optimized search for Faiss indexes using Lucene HNSW, semantic field type for streamlined semantic search, and Search Relevance Workbench for search quality optimization. Makes star-tree indexes generally available with support for comprehensive query types. Enhances observability with ML Commons metrics integration, custom index support for OpenTelemetry data, and new PPL commands for JSON manipulation. Improves agent management with Update Agent API and persistent MCP tools. Includes security enhancements with immutable user objects and new resource sharing framework. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.1.0.md). | 24 June 2025 [3.0.0](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.0.0.md) | Upgrades to Lucene 10 for improved indexing and vector search. Adds experimental gRPC support and pull-based ingestion from Kafka and Kinesis. Introduces GPU acceleration for vector operations and semantic sentence highlighting. Improves range query performance and hybrid search with z-score normalization. Adds plan-execute-reflect agents and native MCP protocol support for agentic workflows. Enhances security with a new Java agent replacing the Security Manager. Includes PPL query improvements with lookup, join, and subsearch commands. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-3.0.0.md). | 06 May 2025 +[2.19.4](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.4.md) | Fixes critical security vulnerabilities across multiple components including ML Commons, Query Insights Dashboards, and SQL. Resolves bugs in Flow Framework multi-tenancy, Security wildcard matching, and Query Insights time validation. Includes extensive CVE fixes and dependency updates across dashboards plugins. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.4.md). | 06 November 2025 [2.19.3](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.3.md) | Improves Flow Framework with enhanced memory handling and workflow step processing. Fixes several Query Insights and Query Insights Dashboards issues. Implements security updates across multiple components. Updates infrastructure components and documentation across multiple plugins. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.3.md). | 22 July 2025 [2.19.2](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.2.md) | Improves query insights with better index handling, a new verbose API parameter, and a default index template. Fixes bugs across Query Insights, Observability, Flow Framework, and Dashboards. Includes multiple CVE fixes, test enhancements, and a new PGP key for artifact verification. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.2.md). 
| 29 April 2025 [2.19.1](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.1.md) | Adds execution hint for cardinality aggregator. Includes bug fixes for ML Commons, Query Insights Dashboards, and Remote Metadata SDK. Contains maintenance updates for several components. For a full list of release highlights, see the [Release Notes](https://github.com/opensearch-project/opensearch-build/blob/main/release-notes/opensearch-release-notes-2.19.1.md). | 27 February 2025 From e6920e50211946145ed16596e312c3b09a9a771e Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Mon, 10 Nov 2025 12:43:40 -0500 Subject: [PATCH 23/34] Fix broken external links (#11508) Signed-off-by: Fanit Kolchina Signed-off-by: Arya Soni --- _clients/ruby.md | 2 +- _dashboards/management/accelerate-external-data.md | 2 +- _dashboards/management/scheduled-query-acceleration.md | 2 +- _observing-your-data/ad/dashboards-anomaly-detection.md | 2 +- _observing-your-data/alerting/dashboards-alerting.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/_clients/ruby.md b/_clients/ruby.md index 7d582927c60..9148351bd23 100644 --- a/_clients/ruby.md +++ b/_clients/ruby.md @@ -7,7 +7,7 @@ has_children: false # Ruby client -The OpenSearch Ruby client allows you to interact with your OpenSearch clusters through Ruby methods rather than HTTP methods and raw JSON. For the client's complete API documentation and additional examples, see the [`opensearch-transport`](https://rubydoc.info/gems/opensearch-transport), [`opensearch-api`](https://rubydoc.info/gems/opensearch-api), [`opensearch-dsl`](https://rubydoc.info/gems/opensearch-dsl), and [`opensearch-ruby`](https://rubydoc.info/gems/opensearch-ruby/) gem documentation. +The OpenSearch Ruby client allows you to interact with your OpenSearch clusters through Ruby methods rather than HTTP methods and raw JSON. For the client's complete API documentation, see the [opensearch-ruby repository](https://github.com/opensearch-project/opensearch-ruby) documentation. For additional examples, see [`opensearch-transport`](https://rubygems.org/gems/opensearch-transport/), [`opensearch-api`](https://rubygems.org/gems/opensearch-api/), [`opensearch-dsl`](https://rubygems.org/gems/opensearch-dsl/), and [`opensearch-ruby`](https://rubygems.org/gems/opensearch-ruby/) gem documentation. This getting started guide illustrates how to connect to OpenSearch, index documents, and run queries. For the client source code, see the [opensearch-ruby repo](https://github.com/opensearch-project/opensearch-ruby). diff --git a/_dashboards/management/accelerate-external-data.md b/_dashboards/management/accelerate-external-data.md index 61c08c01f88..98bc2fad45b 100644 --- a/_dashboards/management/accelerate-external-data.md +++ b/_dashboards/management/accelerate-external-data.md @@ -77,7 +77,7 @@ OpenSearch creates a new index from the covering index data. You can use this ne 1. For **Index name**, enter a valid index name. Note that each table can have multiple covering indexes. 2. Choose a **Refresh type**. By default, OpenSearch automatically refreshes the index. Otherwise, you must manually trigger a refresh using a REFRESH statement. -3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in a file system compatible with the Hadoop Distributed File System (HDFS). 
For more information, see [Starting streaming queries](https://spark.apache.org/docs/3.5.1/structured-streaming-programming-guide.html#starting-streaming-queries). +3. Enter a **Checkpoint location**, which is a path for refresh job checkpoints. The location must be a path in a file system compatible with the Hadoop Distributed File System (HDFS). For more information, see [Starting streaming queries](https://spark.apache.org/docs/latest/streaming/apis-on-dataframes-and-datasets.html#starting-streaming-queries). 4. Define the covering index fields by selecting **(add fields here)** under **Covering index definition**. 5. Select **Create acceleration** to apply your covering index settings. 6. View the covering index query details and then click **Run**. OpenSearch adds your index to the left navigation pane. diff --git a/_dashboards/management/scheduled-query-acceleration.md b/_dashboards/management/scheduled-query-acceleration.md index 3aa83e6fc5c..c0d653e2114 100644 --- a/_dashboards/management/scheduled-query-acceleration.md +++ b/_dashboards/management/scheduled-query-acceleration.md @@ -106,7 +106,7 @@ When creating indexes using an accelerated query, you can specify the following |:--- | :--- | | `auto_refresh` | Enables automatic refresh for the index. If `true`, the index refreshes automatically at the specified interval. If `false`, the refresh operation must be triggered manually using the `REFRESH` statement. Default is `false`. | | `refresh_interval` | Defines the amount of time between index refresh operations for the index, which determines how frequently new data is ingested into the index. This is applicable only when `auto_refresh` is enabled. The interval determines how frequently new data is integrated and can be specified in formats like `1 minute` or `10 seconds`. For valid time units, see [Time units](#time-units).| -| `scheduler_mode` | Specifies the scheduling mode for auto-refresh (internal or external scheduling). The external scheduler requires a `checkpoint_location` (a path for refresh job checkpoints) for state management. For more information, see [Starting streaming queries](https://spark.apache.org/docs/3.5.1/structured-streaming-programming-guide.html#starting-streaming-queries). Valid values are `internal` and `external`.| +| `scheduler_mode` | Specifies the scheduling mode for auto-refresh (internal or external scheduling). The external scheduler requires a `checkpoint_location` (a path for refresh job checkpoints) for state management. For more information, see [Starting streaming queries](https://spark.apache.org/docs/latest/streaming/apis-on-dataframes-and-datasets.html#starting-streaming-queries). Valid values are `internal` and `external`.| For more information and additional available parameters, see [Flint index refresh](https://github.com/opensearch-project/opensearch-spark/blob/main/docs/index.md#flint-index-refresh). diff --git a/_observing-your-data/ad/dashboards-anomaly-detection.md b/_observing-your-data/ad/dashboards-anomaly-detection.md index 2379506d638..85864f2419a 100644 --- a/_observing-your-data/ad/dashboards-anomaly-detection.md +++ b/_observing-your-data/ad/dashboards-anomaly-detection.md @@ -27,7 +27,7 @@ Anomaly detection visualizations are displayed as time-series charts that give y Keep in mind the following requirements when setting up or creating anomaly detection visualizations. 
The visualization: -- Must be a [Vizlib line chart](https://community.vizlib.com/support/solutions/articles/35000107262-vizlib-line-chart-introduction) +- Must be a [Vizlib line chart](https://docs-vizlib.insightsoftware.com/hc/en-us/articles/39676102755341-Line-Chart-Overview) - Must contain at least a Y-axis metric aggregation - Must not have non-Y-axis metric aggregation types - Must use the date histogram aggregation type for the X-axis bucket diff --git a/_observing-your-data/alerting/dashboards-alerting.md b/_observing-your-data/alerting/dashboards-alerting.md index 4a5d01dde4f..073b521def5 100644 --- a/_observing-your-data/alerting/dashboards-alerting.md +++ b/_observing-your-data/alerting/dashboards-alerting.md @@ -36,7 +36,7 @@ Alerting visualizations are displayed as time-series charts that give you a snap Keep in mind the following requirements when setting up or creating alerting visualizations. The visualization: -- Must be a [Vizlib line chart](https://community.vizlib.com/support/solutions/articles/35000107262-vizlib-line-chart-introduction) +- Must be a [Vizlib line chart](https://docs-vizlib.insightsoftware.com/hc/en-us/articles/39676102755341-Line-Chart-Overview) - Must contain at least a Y-axis metric aggregation - Must not have non-Y-axis metric aggregation types - Must use the date histogram aggregation type for the X-axis bucket From 421b438f42891bfcd7e11d59baeefb2eec59fdc5 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Mon, 10 Nov 2025 20:17:10 +0000 Subject: [PATCH 24/34] adding examples to key value processor data prepper (#11232) * adding examples to key value processor data prepper Signed-off-by: Anton Rubin * Update key-value.md Signed-off-by: AntonEliatra * Update key-value.md Signed-off-by: AntonEliatra * Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: Anton Rubin Signed-off-by: AntonEliatra Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Arya Soni --- .../configuration/processors/key-value.md | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) diff --git a/_data-prepper/pipelines/configuration/processors/key-value.md b/_data-prepper/pipelines/configuration/processors/key-value.md index 7d2a553d5f2..a1ec3b8b3ac 100644 --- a/_data-prepper/pipelines/configuration/processors/key-value.md +++ b/_data-prepper/pipelines/configuration/processors/key-value.md @@ -11,6 +11,346 @@ nav_order: 170 You can use the `key_value` processor to parse the specified field into key-value pairs. You can customize the `key_value` processor to parse field information with the following options. The type for each of the following options is `string`. +## Examples + +The following examples demonstrate several configurations you can use with this processor. + +The examples don't use security and are for demonstration purposes only. We strongly recommend configuring SSL before using these examples in production. 
+{: .warning} + +### Key-value parsing, normalization, and deduplication + +The following example parses the `message` field into `key=value` pairs, normalizes and cleans the keys, prefixes them with `meta_`, deduplicates values, and drops keys without values into `parsed_kv`: + +```yaml +kv-basic-pipeline: + source: + http: + path: /logs + ssl: false + + processor: + - key_value: + # Read key=value pairs from the "message" field (default anyway) + source: message + # Write parsed pairs into a nested object "parsed_kv" + destination: parsed_kv + + # Split pairs on '&' and split key vs value on '=' + field_split_characters: "&" + value_split_characters: "=" + + # Normalize keys and trim garbage whitespace around keys/values + transform_key: lowercase + delete_key_regex: "\\s+" # remove spaces from keys + delete_value_regex: "^\\s+|\\s+$" # trim leading/trailing spaces + + # Add a prefix to every key (after normalization + delete_key_regex) + prefix: meta_ + + # Keep a single unique value for duplicate keys + skip_duplicate_values: true + + # Drop keys whose value is empty/absent (e.g., `empty=` or `novalue`) + drop_keys_with_no_value: true + + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: admin_password + index_type: custom + index: kv-basic-%{yyyy.MM.dd} +``` +{% include copy.html %} + +You can test this pipeline using the following command: + +```bash +curl -sS -X POST "http://localhost:2021/logs" \ + -H "Content-Type: application/json" \ + -d '[ + {"message":"key1=value1&key1=value1&Key Two = value two & empty=&novalue"}, + {"message":"ENV = prod & TEAM = core & owner = alice "} + ]' +``` +{% include copy.html %} + +The documents stored in OpenSearch contain the following information: + +```json +{ + ... + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "kv-basic-2025.10.14", + "_id": "M6d84pkB3P3jd6EROH_f", + "_score": 1, + "_source": { + "message": "key1=value1&key1=value1&Key Two = value two & empty=&novalue", + "parsed_kv": { + "meta_key1": "value1", + "meta_empty": "", + "meta_keytwo": "value two" + } + } + }, + { + "_index": "kv-basic-2025.10.14", + "_id": "NKd84pkB3P3jd6EROH_f", + "_score": 1, + "_source": { + "message": "ENV = prod & TEAM = core & owner = alice ", + "parsed_kv": { + "meta_owner": "alice", + "meta_team": "core", + "meta_env": "prod" + } + } + } + ] + } +} +``` + +### Grouped values to root + +The following example parses the `payload` field by using `&&` to separate pairs and `==` to separate keys and values. 
It preserves bracketed groups as single values, writes the parsed results to the event root without overwriting existing fields, and records any unmatched tokens as `null`: + +```yaml +kv-grouping-pipeline: + source: + http: + path: /logs + ssl: false + + processor: + - key_value: + source: "payload" + destination: null + + field_split_characters: "&&" # pair delimiter (OK with grouping) + value_split_characters: null # disable the default "=" + key_value_delimiter_regex: "==" # exact '==' for key/value + + value_grouping: true + remove_brackets: false + overwrite_if_destination_exists: false + non_match_value: null + + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: "admin_pass" + index_type: custom + index: "kv-regex-%{yyyy.MM.dd}" +``` +{% include copy.html %} + +You can test this pipeline using the following command: + +```bash +curl -sS -X POST "http://localhost:2021/logs" \ + -H "Content-Type: application/json" \ + -d '[ + { + "payload":"a==1&&b==[x=y,z=w]&&c==(inner=thing)&&http==http://example.com path", + "a":"keep-me" + }, + { + "payload":"good==yes&&broken-token&&url==https://opensearch.org home", + "note":"second doc" + } + ]' +``` +{% include copy.html %} + +The documents stored in OpenSearch contain the following information: + +```json +{ + ... + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "kv-regex-2025.10.14", + "_id": "FuCX4pkB344hN2Iu62oT", + "_score": 1, + "_source": { + "payload": "a==1&&b==[x=y,z=w]&&c==(inner=thing)&&http==http://example.com path", + "a": "keep-me", + "b": "[x=y,z=w]", + "c": "(inner=thing)", + "http": "http://example.com path" + } + }, + { + "_index": "kv-regex-2025.10.14", + "_id": "F-CX4pkB344hN2Iu62oT", + "_score": 1, + "_source": { + "payload": "good==yes&&broken-token&&url==https://opensearch.org home", + "note": "second doc", + "broken-token": null, + "good": "yes", + "url": "https://opensearch.org home" + } + } + ] + } +} +``` + +### Conditional recursive key-value parsing + +The following example parses bracketed nested `key=value` structures from `body` into `parsed.*` only when `/type == "nested"`. 
It preserves group hierarchy, enforces strict nesting rules, applies default fields, and leaves non-nested events unchanged: + +```yaml +kv-conditional-recursive-pipeline: + source: + http: + path: /logs + ssl: false + + processor: + - key_value: + source: "body" + destination: "parsed" + + key_value_when: '/type == "nested"' + recursive: true + + # Split rules (per docs; not regex) + field_split_characters: "&" + value_split_characters: "=" + + # Grouping & quoting (per docs) + value_grouping: true + string_literal_character: "\"" + remove_brackets: false + + # Keep only some top-level keys; then set defaults + include_keys: ["item1","item2","owner"] + default_values: + owner: "unknown" + region: "eu-west-1" + + strict_grouping: true + tags_on_failure: ["keyvalueprocessor_failure"] + + sink: + - opensearch: + hosts: ["https://opensearch:9200"] + insecure: true + username: admin + password: "admin_pass" + index_type: custom + index: "kv-recursive-%{yyyy.MM.dd}" +``` +{% include copy.html %} + +You can test this pipeline using the following command: + +```bash +curl -sS -X POST "http://localhost:2021/logs" \ + -H "Content-Type: application/json" \ + -d '[ + { + "type":"nested","body":"item1=[a=1&b=(c=3&d=)]&item2=2&owner=alice" + }, + { + "type":"flat","body":"item1=[should=not&be=parsed]&item2=42" + }, + { + "type":"nested","body":"item1=[desc=\"a=b + c=d\"&x=1]&item2=2" + } +]' +``` +{% include copy.html %} + +The documents stored in OpenSearch contain the following information: + +```json +{ + ... + "hits": { + "total": { + "value": 3, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "kv-recursive-2025.10.14", + "_id": "Q7fC4pkBc0UY8I7pZ6vZ", + "_score": 1, + "_source": { + "type": "nested", + "body": "item1=[a=1&b=(c=3&d=)]&item2=2&owner=alice", + "parsed": { + "owner": "alice", + "item2": "2", + "item1": { + "a": "1", + "b": { + "c": "3", + "d": { + "e": "5" + } + } + }, + "region": "eu-west-1" + } + } + }, + { + "_index": "kv-recursive-2025.10.14", + "_id": "RLfC4pkBc0UY8I7pZ6vZ", + "_score": 1, + "_source": { + "type": "flat", + "body": "item1=[should=not&be=parsed]&item2=42" + } + }, + { + "_index": "kv-recursive-2025.10.14", + "_id": "RbfC4pkBc0UY8I7pZ6vZ", + "_score": 1, + "_source": { + "type": "nested", + "body": """item1=[desc="a=b + c=d"&x=1]&item2=2""", + "parsed": { + "owner": "unknown", + "item2": "2", + "item1": { + "desc": "\"a=b + c=d\"", + "x": "1" + }, + "region": "eu-west-1" + } + } + } + ] + } +} +``` + +## Configuration + Option | Description | Example :--- | :--- | :--- `source` | The message field to be parsed. Optional. Default value is `message`. | If `source` is `"message1"`, `{"message1": {"key1=value1"}, "message2": {"key2=value2"}}` parses into `{"message1": {"key1=value1"}, "message2": {"key2=value2"}, "parsed_message": {"key1": "value1"}}`. 
From ca8f9ab43af51058c54d13d0cf20ef841742acb1 Mon Sep 17 00:00:00 2001 From: Sagar Darji <42430138+darjisagar7@users.noreply.github.com> Date: Tue, 11 Nov 2025 01:49:06 +0530 Subject: [PATCH 25/34] =?UTF-8?q?Updating=20the=20Cross=20Cluster=20Replic?= =?UTF-8?q?ation=20documentation=20for=20the=20index=20le=E2=80=A6=20(#114?= =?UTF-8?q?96)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Updating the Cross Cluster Replication documentation for the index level ops batch size setting Signed-off-by: Sagar Darji * Apply suggestions from code review Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _tuning-your-cluster/replication-plugin/settings.md Signed-off-by: Nathan Bower --------- Signed-off-by: Sagar Darji Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: Sagar Darji Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- .../replication-plugin/settings.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/_tuning-your-cluster/replication-plugin/settings.md b/_tuning-your-cluster/replication-plugin/settings.md index 1a9162fd2ba..080980f0d97 100644 --- a/_tuning-your-cluster/replication-plugin/settings.md +++ b/_tuning-your-cluster/replication-plugin/settings.md @@ -9,7 +9,7 @@ redirect_from: # Replication settings -The replication plugin adds several settings to the standard OpenSearch cluster settings. +The replication plugin adds several settings to the standard OpenSearch cluster and index settings. The settings are dynamic, so you can change the default behavior of the plugin without restarting your cluster. To learn more about static and dynamic settings, see [Configuring OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/index/). You can mark settings as `persistent` or `transient`. @@ -25,17 +25,26 @@ PUT _cluster/settings } ``` -These settings manage the resources consumed by remote recoveries. We don’t recommend changing these settings; the defaults should work well for most use cases. +These settings manage the resources consumed by remote recoveries. We don't recommend changing these settings; the defaults should work well for most use cases. + +## Cluster-level settings + +You can specify these settings at the cluster level to control the default behavior of replication across all indexes in the cluster. These settings apply globally unless overridden by index-level settings. Setting | Default | Description :--- | :--- | :--- -`plugins.replication.follower.index.recovery.chunk_size` | 10 MB | The chunk size requested by the follower cluster during file transfer. Specify the chunk size as a value and unit, for example, 10 MB, 5 KB. See [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -`plugins.replication.follower.index.recovery.max_concurrent_file_chunks` | 4 | The number of file chunk requests that can be sent in parallel for each recovery. -`plugins.replication.follower.index.ops_batch_size` | 50000 | The number of operations that can be fetched at a time during the syncing phase of replication. `plugins.replication.follower.concurrent_readers_per_shard` | 2 | The number of concurrent requests from the follower cluster per shard during the syncing phase of replication. 
`plugins.replication.autofollow.fetch_poll_interval` | 30s | How often auto-follow tasks poll the leader cluster for new matching indexes. `plugins.replication.follower.metadata_sync_interval` | 60s | How often the follower cluster polls the leader cluster for updated index metadata. `plugins.replication.translog.retention_lease.pruning.enabled` | true | If enabled, prunes the translog based on retention leases on the leader index. `plugins.replication.translog.retention_size` | 512 MB | Controls the size of the translog on the leader index. `plugins.replication.replicate.delete_index` | false | If enabled, the follower index is automatically deleted whenever the corresponding leader index is deleted. +`plugins.replication.follower.index.ops_batch_size` | 50000 | The number of operations that can be fetched at a time during the sync phase of replication. + +## Index-level settings +You can specify these settings when creating a follower index or update them for existing follower indexes. These settings control the behavior of individual indexes during replication. + +Setting | Default | Description +:--- |:------| :--- +`index.plugins.replication.follower.ops_batch_size` | 50000 | The number of operations that can be fetched at a time during the sync phase of replication for the specific index. This setting overrides the cluster-level setting. From 2ab87ab3642484616865ae0ea0395de739ca7019 Mon Sep 17 00:00:00 2001 From: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> Date: Mon, 10 Nov 2025 14:33:43 -0600 Subject: [PATCH 26/34] Update documentation for delete_entries and select_entries processors (#11476) * Update delete_entries processor to add new features Signed-off-by: Kennedy Onyia * update select_entries processor documentation to account for new include_keys_regex feature Signed-off-by: Kennedy Onyia * fix style check errors and include additional pipeline configurations to clarify new features. 
Signed-off-by: Kennedy Onyia * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Apply suggestions from code review Signed-off-by: Nathan Bower --------- Signed-off-by: Kennedy Onyia Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- .../processors/delete-entries.md | 90 ++++++++++++++++--- .../processors/select-entries.md | 57 ++++++++++-- 2 files changed, 131 insertions(+), 16 deletions(-) diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md index 
6e12db2103c..7bf014c885e 100644 --- a/_data-prepper/pipelines/configuration/processors/delete-entries.md +++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md @@ -5,10 +5,14 @@ parent: Processors grand_parent: Pipelines nav_order: 110 --- - + # Delete entries processor -The `delete_entries` processor deletes entries, such as key-value pairs, from an event. You can define the keys you want to delete in the `with-keys` field following `delete_entries` in the YAML configuration file. Those keys and their values are deleted. +The `delete_entries` processor removes entries, such as key-value pairs, from an event. Use the `with_keys` field to specify +the exact keys to delete. To delete keys that match a regular expression pattern, use the `with_keys_regex` field. You can +prevent deletion of specific events when using regular expressions by configuring the `exclude_from_regex` field. + +The only way to configure both `with_keys` and `with_keys_regex` in the same `delete_entries` processor is by using the `entries` field. ## Configuration @@ -21,22 +25,27 @@ This table is autogenerated. Do not edit it. - source: https://github.com/opensearch-project/data-prepper/blob/c4455a7785bc2da4358067c217be7085e0bc8d0f/data-prepper-plugins/mutate-event-processors/src/main/java/org/opensearch/dataprepper/plugins/processor/mutateevent/DeleteEntryProcessorConfig.java --> -| Option | Required | Description | -:--- | :--- | :--- -| `with_keys` | Yes | An array of keys for the entries to be deleted. | +| Option | Required | Description | +:--- |:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +| `with_keys` | No | An array that specifies the keys of the entries to delete. | +| `with_keys_regex` | No | An array of regular expression (regex) patterns used to match the keys of entries to delete. | +| `exclude_from_delete` | No | A set of entries to exclude from deletion when using the `with_keys_regex` configuration. | +| `entries` | No | A list of entries to delete from the event. | +| `delete_when` | No | Defines the condition under which the deletion is performed. For example, `value="/some_key == null"` deletes the key only if `/some_key` is null or does not exist. | +|`delete_from_element_when` | No | Defines the condition that determines whether a key–value pair should be removed from each element in the list specified by `iterate_on`. The condition is evaluated for each element, and deletion occurs only if the element's key matches one defined in `with_keys` or `with_keys_regex` and satisfies the condition. | +| `iterate_on` | No | Specifies the key of the list field that contains objects to iterate over. The processor applies any configured deletion rules, such as `with_keys`, `with_keys_regex`, or `delete_from_element_when`, to each element in the list. | ## Usage - To get started, create the following `pipeline.yaml` file: ```yaml pipeline: source: ... - .... + .... 
processor: - delete_entries: - with_keys: ["message"] + with_keys: [ "message" ] sink: ``` {% include copy.html %} @@ -52,7 +61,68 @@ For example, before you run the `delete_entries` processor, if the `logs_json.lo When you run the `delete_entries` processor, it parses the message into the following output: ```json -{"message2": "goodbye"} +{"message2": "goodbye", "message3": "test"} +``` + + +### Deleting keys that match a pattern + +First, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - delete_entries: + with_keys_regex: [ "^tes.*" ] + exclude_from_delete: [ "test" ] + sink: +``` +{% include copy.html %} + +If your `logs_json.log` file contains the following event record: + +```json +{"test": "friends", "test2": "are", "test3": "kind"} +``` + +When you run the `delete_entries` processor, it parses the message into the following output: + +```json +{"test": "friends"} +``` + +### Combining multiple deletion rules + +First, create the following `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - delete_entries: + entries: + - with_keys: [ "message" ] + - with_keys_regex: [ "^tes.*" ] + exclude_from_delete: [ "test" ] + sink: +``` +{% include copy.html %} + +If your `logs_json.log` file contains the following event record: + +```json +{"message": "hello", "message2": "goodbye", "test": "friends", "test2": "are", "test3": "kind"} +``` + +When you run the `delete_entries` processor, it parses the message into the following output: + +```json +{"message2": "goodbye","test": "friends"} ``` -> If `message` does not exist in the event, then no action occurs. +> If the `with_keys`, `with_keys_regex`, or `exclude_from_delete` values do not match any event keys, then no action occurs. diff --git a/_data-prepper/pipelines/configuration/processors/select-entries.md b/_data-prepper/pipelines/configuration/processors/select-entries.md index 78fb4cdcdbe..d9561e856b0 100644 --- a/_data-prepper/pipelines/configuration/processors/select-entries.md +++ b/_data-prepper/pipelines/configuration/processors/select-entries.md @@ -16,13 +16,14 @@ Only the selected entries remain in the processed event and while all other entr You can configure the `select_entries` processor using the following options. | Option | Required | Description | -| :--- | :--- | :--- | -| `include_keys` | Yes | A list of keys to be selected from an event. | -| `select_when` | No | A [conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"'`, that will be evaluated to determine whether the processor will be run on the event. If the condition is not met, then the event continues through the pipeline unmodified with all the original fields present. | +| :--- |:---------| :--- | +| `include_keys` | No | A list of keys to be selected from an event. | +| `include_keys_regex` | No | A regular expression (regex) pattern that matches the keys to be selected from an event. | +| `select_when` | No | A [conditional expression]({{site.url}}{{site.baseurl}}/data-prepper/pipelines/expression-syntax/), such as `/some-key == "test"`, that is evaluated to determine whether the processor will be executed on the event. If the condition is not met, then the event continues through the pipeline unmodified, with all the original fields present. 
| ## Usage -The following example shows how to configure the `select_entries` processor in the `pipeline.yaml` file: +First, create the following `pipeline.yaml` file: ```yaml pipeline: @@ -41,15 +42,59 @@ pipeline: For example, when your source contains the following event record: ```json -{"message": "hello", "key1" : "value1", "key2" : "value2", "some_key" : "test"} +{ + "message": "hello", + "key1" : "value1", + "key2" : "value2", + "some_key" : "test" +} ``` -The `select_entries` processor includes only `key1` and `key2` in the processed output: +After processing, only the keys listed in `include_keys` are retained in the event; all other keys are removed: ```json {"key1": "value1", "key2": "value2"} ``` +### Selecting keys using a regex + +The following example shows how to configure the `include_keys_regex` field in the `pipeline.yaml` file: + +```yaml +pipeline: + source: + ... + .... + processor: + - select_entries: + include_keys: [ "key1", "key2" ] + include_keys_regex: ["^ran.*"] + select_when: '/some_key == "test"' + sink: +``` +{% include copy.html %} + +For example, when your source contains the following event record: + +```json +{ + "message": "hello", + "key1" : "value1", + "key2" : "value2", + "some_key" : "test", + "random1": "another", + "random2" : "set", + "random3": "of", + "random4": "values" +} +``` + +The processor retains keys explicitly listed in `include_keys` and any keys matching the` include_keys_regex` pattern, removing all other keys from the event: + +```json +{"key1": "value1", "key2": "value2", "random1": "another", "random2" : "set", "random3": "of", "random4": "values"} +``` + ### Accessing nested fields Use `/` to access nested fields. From 1626e43ab6ce47920e301748715c65c54efc49b7 Mon Sep 17 00:00:00 2001 From: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> Date: Mon, 10 Nov 2025 16:05:23 -0600 Subject: [PATCH 27/34] delete incorrect output value in delete_entries processor example (#11516) Signed-off-by: Kennedy Onyia Signed-off-by: Arya Soni --- .../pipelines/configuration/processors/delete-entries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md index 7bf014c885e..37c69758736 100644 --- a/_data-prepper/pipelines/configuration/processors/delete-entries.md +++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md @@ -61,7 +61,7 @@ For example, before you run the `delete_entries` processor, if the `logs_json.lo When you run the `delete_entries` processor, it parses the message into the following output: ```json -{"message2": "goodbye", "message3": "test"} +{"message2": "goodbye"} ``` From 266aa77794aabe7fc53107fdb56f28e4496e6cd0 Mon Sep 17 00:00:00 2001 From: Arya Soni Date: Tue, 11 Nov 2025 16:56:46 +0530 Subject: [PATCH 28/34] [DOC] Downloadable PDF Developer Guides Signed-off-by: Arya Soni --- .github/workflows/generate-pdfs.yml | 74 ++++++++ .gitignore | 3 + DEVELOPER_GUIDE.md | 75 ++++++++ generate-pdfs.js | 276 ++++++++++++++++++++++++++++ package.json | 15 ++ pdf-config.json | 91 +++++++++ 6 files changed, 534 insertions(+) create mode 100644 .github/workflows/generate-pdfs.yml create mode 100755 generate-pdfs.js create mode 100644 package.json create mode 100644 pdf-config.json diff --git a/.github/workflows/generate-pdfs.yml b/.github/workflows/generate-pdfs.yml new file mode 100644 index 00000000000..87fd07a958e --- /dev/null +++ 
b/.github/workflows/generate-pdfs.yml @@ -0,0 +1,74 @@ +name: Generate PDFs + +on: + workflow_dispatch: + schedule: + # Run weekly on Sundays at 2 AM UTC + - cron: "0 2 * * 0" + # Optional: Run after main branch updates (uncomment if desired) + # push: + # branches: + # - main + +jobs: + generate-pdfs: + if: github.repository == 'opensearch-project/documentation-website' + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.4.5' + bundler-cache: true + + - name: Build Jekyll site + env: + JEKYLL_ENV: production + run: | + bundle exec jekyll build --future + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Generate PDFs + run: | + npm run generate-pdfs + + - name: List generated PDFs + run: | + echo "Generated PDFs:" + ls -lh pdfs/ || echo "No PDFs generated" + + - name: Upload PDFs as artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: opensearch-documentation-pdfs + path: pdfs/*.pdf + retention-days: 30 + + # Optional: Create a GitHub release with PDFs + # Uncomment and configure if you want to automatically create releases + # - name: Create GitHub Release + # if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + # uses: softprops/action-gh-release@v1 + # with: + # files: pdfs/*.pdf + # tag_name: pdfs-${{ github.run_number }} + # name: Documentation PDFs + # body: | + # Automatically generated PDF documentation for OpenSearch. + # Generated on ${{ github.run_id }} + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + diff --git a/.gitignore b/.gitignore index 09b607173de..d9accf2eebc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ Gemfile.lock .jekyll-cache .project vendor/bundle +node_modules +pdfs +package-lock.json diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index aa63ce980c5..702bdeef283 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -300,3 +300,78 @@ cd spec-insert bundle exec rake generate_dry_run_report ``` This will also generate a markdown (.md) file for each API with their rendered components in the `spec-insert/dry_run` folder. This allows you to preview the rendered components for all APIs without modifying the original documentation files. A report summarizing the errors found during the dry-run will be generated in the `spec-insert/dry_run_report.md` file. + +## PDF Generation + +The documentation website supports generating PDF versions of the developer guides and other documentation sections. This feature allows users to download complete documentation sets for offline use, easier searching, and integration with AI tools. + +### Generating PDFs Locally + +To generate PDFs locally: + +1. **Install Node.js dependencies:** + ```shell + npm install + ``` + +2. **Build the Jekyll site:** + ```shell + bundle exec jekyll build + ``` + +3. **Generate PDFs:** + ```shell + npm run generate-pdfs + ``` + + Or generate a PDF for a specific collection: + ```shell + npm run generate-pdfs -- --collection developer-documentation + ``` + +The generated PDFs will be saved in the `pdfs/` directory. + +### PDF Generation Configuration + +PDF generation is configured in `pdf-config.json`. 
This file defines: +- Which collections to convert to PDFs +- PDF output settings (format, margins, headers, footers) +- Base URL and output directory + +You can customize the configuration by editing `pdf-config.json`. + +### CI/CD Integration + +PDF generation runs automatically in CI/CD through the [generate-pdfs.yml](.github/workflows/generate-pdfs.yml) GitHub Actions workflow. This workflow: + +- Runs weekly on Sundays at 2 AM UTC +- Can be triggered manually via `workflow_dispatch` +- Builds the Jekyll site +- Generates PDFs for all configured collections +- Uploads PDFs as GitHub Actions artifacts + +The workflow runs separately from the main Jekyll build to avoid adding to build time. + +### Available PDFs + +The following documentation sections are available as PDFs (as configured in `pdf-config.json`): + +- OpenSearch Developer Guide +- Getting Started Guide +- API Reference +- Install and Configure Guide +- Cluster Tuning Guide +- Security Guide +- Query DSL Guide +- Search Features Guide +- Vector Search Guide +- Machine Learning Guide + +### Copyright and Usage + +OpenSearch documentation is licensed under the Apache License 2.0, which allows you to: +- Use the PDFs for personal or commercial purposes +- Upload PDFs to AI tools (ChatGPT, NotebookLLM, etc.) for knowledge summarization +- Share and distribute the PDFs + +Proper attribution should be maintained when using the documentation. diff --git a/generate-pdfs.js b/generate-pdfs.js new file mode 100755 index 00000000000..ee6f9165113 --- /dev/null +++ b/generate-pdfs.js @@ -0,0 +1,276 @@ +#!/usr/bin/env node + +/** + * PDF Generation Script for OpenSearch Documentation + * + * This script generates PDF files from the built Jekyll documentation site. + * It uses Puppeteer to render HTML pages from the _site/ directory. + * + * Usage: + * node generate-pdfs.js [--site-dir ] [--output-dir ] [--collection ] + * + * Options: + * --site-dir: Path to _site directory (default: _site) + * --output-dir: Directory to output PDFs (default: pdfs) + * --collection: Generate PDF for specific collection only (optional) + */ + +const fs = require('fs'); +const path = require('path'); +const puppeteer = require('puppeteer'); + +// Load configuration +const configPath = path.join(__dirname, 'pdf-config.json'); +const config = JSON.parse(fs.readFileSync(configPath, 'utf8')); + +// Parse command line arguments +const args = process.argv.slice(2); +const getArg = (flag) => { + const index = args.indexOf(flag); + return index !== -1 && args[index + 1] ? 
args[index + 1] : null; +}; + +const siteDir = path.resolve(getArg('--site-dir') || '_site'); +const outputDir = getArg('--output-dir') || config.outputDir || 'pdfs'; +const specificCollection = getArg('--collection'); + +// Check if _site directory exists +if (!fs.existsSync(siteDir)) { + console.error(`Error: Site directory not found: ${siteDir}`); + console.error('Please build the Jekyll site first: bundle exec jekyll build'); + process.exit(1); +} + +// Ensure output directory exists +if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); +} + +/** + * Convert a URL path to a file path in _site directory + */ +function pathToFile(siteDir, urlPath) { + // Remove leading slash and ensure it ends with index.html or .html + let filePath = urlPath.replace(/^\//, ''); + if (filePath.endsWith('/')) { + filePath = filePath + 'index.html'; + } else if (!filePath.endsWith('.html')) { + filePath = filePath + '/index.html'; + } + return path.join(siteDir, filePath); +} + +/** + * Convert a file path to a file:// URL + */ +function fileToUrl(filePath) { + // Convert to absolute path and use file:// protocol + const absolutePath = path.resolve(filePath); + // On Windows, we need to handle drive letters differently + if (process.platform === 'win32') { + return `file:///${absolutePath.replace(/\\/g, '/')}`; + } + return `file://${absolutePath}`; +} + +/** + * Discover all pages in a collection by scanning _site directory + */ +function discoverPages(siteDir, startPath) { + const pages = new Set(); + const startFile = pathToFile(siteDir, startPath); + + if (!fs.existsSync(startFile)) { + console.warn(` Warning: Start file not found: ${startFile}`); + return Array.from(pages); + } + + // Read the HTML file to find links + const visited = new Set(); + const queue = [startPath]; + + while (queue.length > 0) { + const currentPath = queue.shift(); + if (visited.has(currentPath)) continue; + visited.add(currentPath); + + const filePath = pathToFile(siteDir, currentPath); + if (!fs.existsSync(filePath)) continue; + + pages.add(currentPath); + + try { + const content = fs.readFileSync(filePath, 'utf8'); + // Extract href attributes from anchor tags + const hrefRegex = /]+href=["']([^"']+)["']/gi; + const links = []; + let match; + + while ((match = hrefRegex.exec(content)) !== null) { + const href = match[1]; + // Skip external links, anchors, and non-HTML links + if (href.startsWith('http://') || + href.startsWith('https://') || + href.startsWith('#') || + href.startsWith('mailto:') || + href.match(/\.(css|js|json|png|jpg|gif|svg|woff|woff2)$/i)) { + continue; + } + + // Resolve relative paths + let resolvedPath; + if (href.startsWith('/')) { + resolvedPath = href.endsWith('/') ? href : `${href}/`; + } else { + // Relative path - resolve from current path + const currentDir = currentPath.substring(0, currentPath.lastIndexOf('/') + 1); + const resolved = path.posix.resolve(currentDir, href); + resolvedPath = resolved.endsWith('/') ? 
resolved : `${resolved}/`; + } + + // Only include paths under the same collection + if (resolvedPath.startsWith(startPath) && !visited.has(resolvedPath)) { + links.push(resolvedPath); + } + } + + queue.push(...links); + } catch (error) { + console.warn(` Warning: Could not read ${filePath}: ${error.message}`); + } + } + + return Array.from(pages).sort(); +} + +/** + * Generate PDF for a single collection + */ +async function generateCollectionPDF(browser, collection, siteDir) { + console.log(`\nGenerating PDF for: ${collection.title}`); + console.log(` Collection: ${collection.name}`); + console.log(` Start path: ${collection.startPath}`); + + // Discover all pages in the collection + const pages = discoverPages(siteDir, collection.startPath); + + if (pages.length === 0) { + console.warn(` Warning: No pages found for collection ${collection.name}`); + return null; + } + + console.log(` Found ${pages.length} pages`); + + const pdfPage = await browser.newPage(); + + try { + const pdfPath = path.join(outputDir, collection.filename); + const indexFile = pathToFile(siteDir, collection.startPath); + const indexUrl = fileToUrl(indexFile); + + console.log(` Generating PDF from: ${indexUrl}`); + + await pdfPage.goto(indexUrl, { + waitUntil: 'networkidle0', + timeout: config.waitTimeout || 30000 + }); + + // Wait for content to load + if (config.waitForSelector) { + try { + await pdfPage.waitForSelector(config.waitForSelector, { timeout: 10000 }); + } catch (e) { + // Continue if selector not found + } + } + + // Generate PDF + await pdfPage.pdf({ + path: pdfPath, + format: config.pdfOptions.format || 'A4', + printBackground: config.pdfOptions.printBackground !== false, + margin: config.pdfOptions.margin || {}, + displayHeaderFooter: config.pdfOptions.displayHeaderFooter !== false, + headerTemplate: config.pdfOptions.headerTemplate || '', + footerTemplate: config.pdfOptions.footerTemplate || '', + }); + + console.log(` ✓ Generated: ${pdfPath}`); + return pdfPath; + } catch (error) { + console.error(` ✗ Error generating PDF for ${collection.name}: ${error.message}`); + return null; + } finally { + await pdfPage.close(); + } +} + +/** + * Main function + */ +async function main() { + console.log('OpenSearch Documentation PDF Generator'); + console.log('=====================================\n'); + console.log(`Site Directory: ${siteDir}`); + console.log(`Output Directory: ${outputDir}\n`); + + // Filter collections if specific one is requested + let collectionsToProcess = config.collections; + if (specificCollection) { + collectionsToProcess = config.collections.filter(c => c.name === specificCollection); + if (collectionsToProcess.length === 0) { + console.error(`Error: Collection "${specificCollection}" not found in configuration`); + process.exit(1); + } + } + + console.log(`Processing ${collectionsToProcess.length} collection(s)...\n`); + + // Launch browser + const browser = await puppeteer.launch({ + headless: true, + args: ['--no-sandbox', '--disable-setuid-sandbox'] + }); + + try { + const results = []; + + for (const collection of collectionsToProcess) { + const pdfPath = await generateCollectionPDF(browser, collection, siteDir); + if (pdfPath) { + results.push({ + collection: collection.name, + title: collection.title, + filename: collection.filename, + path: pdfPath + }); + } + } + + console.log('\n====================================='); + console.log('PDF Generation Complete'); + console.log('=====================================\n'); + console.log(`Generated ${results.length} 
PDF(s):\n`); + results.forEach(r => { + console.log(` ✓ ${r.title}`); + console.log(` File: ${r.path}\n`); + }); + + } catch (error) { + console.error('Fatal error:', error); + process.exit(1); + } finally { + await browser.close(); + } +} + +// Run if called directly +if (require.main === module) { + main().catch(error => { + console.error('Unhandled error:', error); + process.exit(1); + }); +} + +module.exports = { main }; + diff --git a/package.json b/package.json new file mode 100644 index 00000000000..3c4be00e825 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "opensearch-docs-pdf-generator", + "version": "1.0.0", + "description": "PDF generation tool for OpenSearch documentation", + "scripts": { + "generate-pdfs": "node generate-pdfs.js" + }, + "dependencies": { + "puppeteer": "^21.5.0" + }, + "engines": { + "node": ">=18.0.0" + } +} + diff --git a/pdf-config.json b/pdf-config.json new file mode 100644 index 00000000000..52ad00f68cb --- /dev/null +++ b/pdf-config.json @@ -0,0 +1,91 @@ +{ + "outputDir": "pdfs", + "collections": [ + { + "name": "developer-documentation", + "title": "OpenSearch Developer Guide", + "filename": "opensearch-developer-guide.pdf", + "description": "Complete developer documentation for OpenSearch", + "startPath": "/developer-documentation/" + }, + { + "name": "getting-started", + "title": "OpenSearch Getting Started Guide", + "filename": "opensearch-getting-started.pdf", + "description": "Getting started guide for OpenSearch", + "startPath": "/getting-started/" + }, + { + "name": "api-reference", + "title": "OpenSearch API Reference", + "filename": "opensearch-api-reference.pdf", + "description": "Complete API reference documentation", + "startPath": "/api-reference/" + }, + { + "name": "install-and-configure", + "title": "OpenSearch Install and Configure Guide", + "filename": "opensearch-install-configure.pdf", + "description": "Installation and configuration guide", + "startPath": "/install-and-configure/" + }, + { + "name": "tuning-your-cluster", + "title": "OpenSearch Cluster Tuning Guide", + "filename": "opensearch-cluster-tuning.pdf", + "description": "Guide for creating and tuning your cluster", + "startPath": "/tuning-your-cluster/" + }, + { + "name": "security", + "title": "OpenSearch Security Guide", + "filename": "opensearch-security.pdf", + "description": "Security configuration and management guide", + "startPath": "/security/" + }, + { + "name": "query-dsl", + "title": "OpenSearch Query DSL Guide", + "filename": "opensearch-query-dsl.pdf", + "description": "Query DSL documentation", + "startPath": "/query-dsl/" + }, + { + "name": "search-plugins", + "title": "OpenSearch Search Features Guide", + "filename": "opensearch-search-features.pdf", + "description": "Search features and plugins documentation", + "startPath": "/search-plugins/" + }, + { + "name": "vector-search", + "title": "OpenSearch Vector Search Guide", + "filename": "opensearch-vector-search.pdf", + "description": "Vector search documentation", + "startPath": "/vector-search/" + }, + { + "name": "ml-commons-plugin", + "title": "OpenSearch Machine Learning Guide", + "filename": "opensearch-machine-learning.pdf", + "description": "Machine learning plugin documentation", + "startPath": "/ml-commons-plugin/" + } + ], + "pdfOptions": { + "format": "A4", + "printBackground": true, + "margin": { + "top": "20mm", + "right": "15mm", + "bottom": "20mm", + "left": "15mm" + }, + "displayHeaderFooter": true, + "headerTemplate": "
", + "footerTemplate": "
<span class='pageNumber'></span> / <span class='totalPages'></span>
" + }, + "waitForSelector": ".main-content", + "waitTimeout": 30000 +} + From 4228b1efb9356a1ff779acc7db31a65e80b56c47 Mon Sep 17 00:00:00 2001 From: Arya Soni Date: Thu, 30 Oct 2025 00:29:54 +0530 Subject: [PATCH 29/34] [DOC] Can't find k1 parameter using search (not indexed?) Signed-off-by: Arya Soni --- _search-plugins/keyword-search.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/_search-plugins/keyword-search.md b/_search-plugins/keyword-search.md index f23aa9c20ac..bd75abf988a 100644 --- a/_search-plugins/keyword-search.md +++ b/_search-plugins/keyword-search.md @@ -3,6 +3,8 @@ layout: default title: Keyword search has_children: false nav_order: 10 +meta_description: Learn about BM25 keyword search in OpenSearch, including how to configure BM25 parameters k1 and b for better search relevance +meta_keywords: BM25, keyword search, k1, b, term frequency, inverse document frequency, TF/IDF, search relevance, Okapi BM25 --- # Keyword search @@ -165,7 +167,12 @@ PUT /testindex ## Configuring BM25 similarity -You can configure BM25 similarity parameters at the index level as follows: +You can configure BM25 similarity parameters at the index level. The BM25 algorithm supports two key parameters: `k1` (term saturation parameter) and `b` (length normalization parameter). These parameters control how BM25 scores documents: + +- The `k1` parameter controls term frequency saturation, determining how quickly the relevance score increases as term frequency grows. +- The `b` parameter controls the impact of document length on scoring. + +You can configure these parameters at the index level as follows: ```json PUT /testindex From d7c41d2b9ca71996041781bc4497eccd823648ab Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:26:35 -0400 Subject: [PATCH 30/34] Add copy buttons and highlighting to data prepper code samples (#11465) Signed-off-by: Fanit Kolchina Signed-off-by: Arya Soni --- .../common-use-cases/trace-analytics.md | 21 ++++++++++--------- .../configuration/processors/date.md | 3 +++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/_data-prepper/common-use-cases/trace-analytics.md b/_data-prepper/common-use-cases/trace-analytics.md index 59d12003d18..c2b30b15847 100644 --- a/_data-prepper/common-use-cases/trace-analytics.md +++ b/_data-prepper/common-use-cases/trace-analytics.md @@ -230,7 +230,7 @@ raw-trace-pipeline: # Configure same as the otel-trace-pipeline workers: 8 # We recommend using the default value for the raw-trace-pipeline. - delay: 3000 + delay: "3000" source: pipeline: name: otel-trace-pipeline @@ -250,9 +250,9 @@ raw-trace-pipeline: - otel_traces_group: hosts: [ "https://opensearch:9200" ] # Change to your credentials - username: admin - password: admin_password - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + username: "admin" + password: "admin" + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. 
Comment out the @@ -264,9 +264,9 @@ raw-trace-pipeline: hosts: [ "https://opensearch:9200" ] index_type: trace-analytics-raw # Change to your credentials - username: admin - password: admin_password - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + username: "admin" + password: "admin" + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. Comment out the @@ -302,9 +302,9 @@ service-map-pipeline: hosts: [ "https://opensearch:9200" ] index_type: trace-analytics-service-map # Change to your credentials - username: admin - password: admin_password - # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate + username: "admin" + password: "admin" + # Add a certificate file if you are accessing an OpenSearch cluster with a self-signed certificate #cert: /path/to/cert # If you are connecting to an Amazon OpenSearch Service domain without # Fine-Grained Access Control, enable these settings. Comment out the @@ -582,6 +582,7 @@ This will send sample telemetry to the alias `otel-v1-apm-span` and store the do }, ... ``` +{% include copy.html %} After you run OpenTelemetry in your service environment, you must configure your application to use the OpenTelemetry Collector. The OpenTelemetry Collector typically runs alongside your application. diff --git a/_data-prepper/pipelines/configuration/processors/date.md b/_data-prepper/pipelines/configuration/processors/date.md index 6cabc303535..1719d78da0d 100644 --- a/_data-prepper/pipelines/configuration/processors/date.md +++ b/_data-prepper/pipelines/configuration/processors/date.md @@ -186,6 +186,7 @@ The documents stored in OpenSearch contain the following information: } } ``` +{% include copy.html %} ### Parse a timestamp to convert its format and time zone @@ -340,6 +341,7 @@ The documents stored in OpenSearch contain the following information: } } ``` +{% include copy.html %} ### Timestamp formats with different day spacing @@ -454,3 +456,4 @@ The documents stored in OpenSearch contain the following information: } } ``` +{% include copy.html %} From af48c5ebc2d91615bad531c5261f0cd865cdfea2 Mon Sep 17 00:00:00 2001 From: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> Date: Mon, 10 Nov 2025 14:33:43 -0600 Subject: [PATCH 31/34] Update documentation for delete_entries and select_entries processors (#11476) * Update delete_entries processor to add new features Signed-off-by: Kennedy Onyia * update select_entries processor documentation to account for new include_keys_regex feature Signed-off-by: Kennedy Onyia * fix style check errors and include additional pipeline configurations to clarify new features. 
Signed-off-by: Kennedy Onyia * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/select-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Update _data-prepper/pipelines/configuration/processors/delete-entries.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Apply suggestions from code review Signed-off-by: Nathan Bower --------- Signed-off-by: Kennedy Onyia Signed-off-by: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: Arya Soni --- .../pipelines/configuration/processors/delete-entries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md index 37c69758736..7bf014c885e 100644 --- 
a/_data-prepper/pipelines/configuration/processors/delete-entries.md +++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md @@ -61,7 +61,7 @@ For example, before you run the `delete_entries` processor, if the `logs_json.lo When you run the `delete_entries` processor, it parses the message into the following output: ```json -{"message2": "goodbye"} +{"message2": "goodbye", "message3": "test"} ``` From 5aa9b18e110bcbc565256e98091e5beb4572e624 Mon Sep 17 00:00:00 2001 From: Kennedy Onyia <145404406+kennedy-onyia@users.noreply.github.com> Date: Mon, 10 Nov 2025 16:05:23 -0600 Subject: [PATCH 32/34] delete incorrect output value in delete_entries processor example (#11516) Signed-off-by: Kennedy Onyia Signed-off-by: Arya Soni --- .../pipelines/configuration/processors/delete-entries.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data-prepper/pipelines/configuration/processors/delete-entries.md b/_data-prepper/pipelines/configuration/processors/delete-entries.md index 7bf014c885e..37c69758736 100644 --- a/_data-prepper/pipelines/configuration/processors/delete-entries.md +++ b/_data-prepper/pipelines/configuration/processors/delete-entries.md @@ -61,7 +61,7 @@ For example, before you run the `delete_entries` processor, if the `logs_json.lo When you run the `delete_entries` processor, it parses the message into the following output: ```json -{"message2": "goodbye", "message3": "test"} +{"message2": "goodbye"} ``` From 9d3c57b7ce9fc8c3558ed70888b145aaf0235472 Mon Sep 17 00:00:00 2001 From: Arya Soni Date: Tue, 11 Nov 2025 16:56:46 +0530 Subject: [PATCH 33/34] [DOC] Downloadable PDF Developer Guides Signed-off-by: Arya Soni --- .github/workflows/generate-pdfs.yml | 74 ++++++++ .gitignore | 3 + DEVELOPER_GUIDE.md | 75 ++++++++ generate-pdfs.js | 276 ++++++++++++++++++++++++++++ package.json | 15 ++ pdf-config.json | 91 +++++++++ 6 files changed, 534 insertions(+) create mode 100644 .github/workflows/generate-pdfs.yml create mode 100755 generate-pdfs.js create mode 100644 package.json create mode 100644 pdf-config.json diff --git a/.github/workflows/generate-pdfs.yml b/.github/workflows/generate-pdfs.yml new file mode 100644 index 00000000000..87fd07a958e --- /dev/null +++ b/.github/workflows/generate-pdfs.yml @@ -0,0 +1,74 @@ +name: Generate PDFs + +on: + workflow_dispatch: + schedule: + # Run weekly on Sundays at 2 AM UTC + - cron: "0 2 * * 0" + # Optional: Run after main branch updates (uncomment if desired) + # push: + # branches: + # - main + +jobs: + generate-pdfs: + if: github.repository == 'opensearch-project/documentation-website' + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.4.5' + bundler-cache: true + + - name: Build Jekyll site + env: + JEKYLL_ENV: production + run: | + bundle exec jekyll build --future + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'npm' + + - name: Install dependencies + run: npm ci + + - name: Generate PDFs + run: | + npm run generate-pdfs + + - name: List generated PDFs + run: | + echo "Generated PDFs:" + ls -lh pdfs/ || echo "No PDFs generated" + + - name: Upload PDFs as artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: opensearch-documentation-pdfs + path: pdfs/*.pdf + retention-days: 30 + + # Optional: Create a GitHub release with PDFs + # Uncomment and configure if you want to automatically create releases + # - name: Create 
GitHub Release + # if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + # uses: softprops/action-gh-release@v1 + # with: + # files: pdfs/*.pdf + # tag_name: pdfs-${{ github.run_number }} + # name: Documentation PDFs + # body: | + # Automatically generated PDF documentation for OpenSearch. + # Generated on ${{ github.run_id }} + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + diff --git a/.gitignore b/.gitignore index 09b607173de..d9accf2eebc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ Gemfile.lock .jekyll-cache .project vendor/bundle +node_modules +pdfs +package-lock.json diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index aa63ce980c5..702bdeef283 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -300,3 +300,78 @@ cd spec-insert bundle exec rake generate_dry_run_report ``` This will also generate a markdown (.md) file for each API with their rendered components in the `spec-insert/dry_run` folder. This allows you to preview the rendered components for all APIs without modifying the original documentation files. A report summarizing the errors found during the dry-run will be generated in the `spec-insert/dry_run_report.md` file. + +## PDF Generation + +The documentation website supports generating PDF versions of the developer guides and other documentation sections. This feature allows users to download complete documentation sets for offline use, easier searching, and integration with AI tools. + +### Generating PDFs Locally + +To generate PDFs locally: + +1. **Install Node.js dependencies:** + ```shell + npm install + ``` + +2. **Build the Jekyll site:** + ```shell + bundle exec jekyll build + ``` + +3. **Generate PDFs:** + ```shell + npm run generate-pdfs + ``` + + Or generate a PDF for a specific collection: + ```shell + npm run generate-pdfs -- --collection developer-documentation + ``` + +The generated PDFs will be saved in the `pdfs/` directory. + +### PDF Generation Configuration + +PDF generation is configured in `pdf-config.json`. This file defines: +- Which collections to convert to PDFs +- PDF output settings (format, margins, headers, footers) +- Base URL and output directory + +You can customize the configuration by editing `pdf-config.json`. + +### CI/CD Integration + +PDF generation runs automatically in CI/CD through the [generate-pdfs.yml](.github/workflows/generate-pdfs.yml) GitHub Actions workflow. This workflow: + +- Runs weekly on Sundays at 2 AM UTC +- Can be triggered manually via `workflow_dispatch` +- Builds the Jekyll site +- Generates PDFs for all configured collections +- Uploads PDFs as GitHub Actions artifacts + +The workflow runs separately from the main Jekyll build to avoid adding to build time. + +### Available PDFs + +The following documentation sections are available as PDFs (as configured in `pdf-config.json`): + +- OpenSearch Developer Guide +- Getting Started Guide +- API Reference +- Install and Configure Guide +- Cluster Tuning Guide +- Security Guide +- Query DSL Guide +- Search Features Guide +- Vector Search Guide +- Machine Learning Guide + +### Copyright and Usage + +OpenSearch documentation is licensed under the Apache License 2.0, which allows you to: +- Use the PDFs for personal or commercial purposes +- Upload PDFs to AI tools (ChatGPT, NotebookLLM, etc.) for knowledge summarization +- Share and distribute the PDFs + +Proper attribution should be maintained when using the documentation. 
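+
+### Generating a Single Guide Locally
+
+As a quick reference, the following is a minimal sketch of invoking the generator script directly instead of through `npm run generate-pdfs`. It uses the `--site-dir`, `--output-dir`, and `--collection` flags documented in `generate-pdfs.js`; the collection name must match an entry in `pdf-config.json` (for example, `api-reference`).
+
+```shell
+# Build the site first, then generate one PDF from the built output
+bundle exec jekyll build
+node generate-pdfs.js --site-dir _site --output-dir pdfs --collection api-reference
+```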
diff --git a/generate-pdfs.js b/generate-pdfs.js new file mode 100755 index 00000000000..ee6f9165113 --- /dev/null +++ b/generate-pdfs.js @@ -0,0 +1,276 @@ +#!/usr/bin/env node + +/** + * PDF Generation Script for OpenSearch Documentation + * + * This script generates PDF files from the built Jekyll documentation site. + * It uses Puppeteer to render HTML pages from the _site/ directory. + * + * Usage: + * node generate-pdfs.js [--site-dir ] [--output-dir ] [--collection ] + * + * Options: + * --site-dir: Path to _site directory (default: _site) + * --output-dir: Directory to output PDFs (default: pdfs) + * --collection: Generate PDF for specific collection only (optional) + */ + +const fs = require('fs'); +const path = require('path'); +const puppeteer = require('puppeteer'); + +// Load configuration +const configPath = path.join(__dirname, 'pdf-config.json'); +const config = JSON.parse(fs.readFileSync(configPath, 'utf8')); + +// Parse command line arguments +const args = process.argv.slice(2); +const getArg = (flag) => { + const index = args.indexOf(flag); + return index !== -1 && args[index + 1] ? args[index + 1] : null; +}; + +const siteDir = path.resolve(getArg('--site-dir') || '_site'); +const outputDir = getArg('--output-dir') || config.outputDir || 'pdfs'; +const specificCollection = getArg('--collection'); + +// Check if _site directory exists +if (!fs.existsSync(siteDir)) { + console.error(`Error: Site directory not found: ${siteDir}`); + console.error('Please build the Jekyll site first: bundle exec jekyll build'); + process.exit(1); +} + +// Ensure output directory exists +if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); +} + +/** + * Convert a URL path to a file path in _site directory + */ +function pathToFile(siteDir, urlPath) { + // Remove leading slash and ensure it ends with index.html or .html + let filePath = urlPath.replace(/^\//, ''); + if (filePath.endsWith('/')) { + filePath = filePath + 'index.html'; + } else if (!filePath.endsWith('.html')) { + filePath = filePath + '/index.html'; + } + return path.join(siteDir, filePath); +} + +/** + * Convert a file path to a file:// URL + */ +function fileToUrl(filePath) { + // Convert to absolute path and use file:// protocol + const absolutePath = path.resolve(filePath); + // On Windows, we need to handle drive letters differently + if (process.platform === 'win32') { + return `file:///${absolutePath.replace(/\\/g, '/')}`; + } + return `file://${absolutePath}`; +} + +/** + * Discover all pages in a collection by scanning _site directory + */ +function discoverPages(siteDir, startPath) { + const pages = new Set(); + const startFile = pathToFile(siteDir, startPath); + + if (!fs.existsSync(startFile)) { + console.warn(` Warning: Start file not found: ${startFile}`); + return Array.from(pages); + } + + // Read the HTML file to find links + const visited = new Set(); + const queue = [startPath]; + + while (queue.length > 0) { + const currentPath = queue.shift(); + if (visited.has(currentPath)) continue; + visited.add(currentPath); + + const filePath = pathToFile(siteDir, currentPath); + if (!fs.existsSync(filePath)) continue; + + pages.add(currentPath); + + try { + const content = fs.readFileSync(filePath, 'utf8'); + // Extract href attributes from anchor tags + const hrefRegex = /]+href=["']([^"']+)["']/gi; + const links = []; + let match; + + while ((match = hrefRegex.exec(content)) !== null) { + const href = match[1]; + // Skip external links, anchors, and non-HTML links + if 
(href.startsWith('http://') || + href.startsWith('https://') || + href.startsWith('#') || + href.startsWith('mailto:') || + href.match(/\.(css|js|json|png|jpg|gif|svg|woff|woff2)$/i)) { + continue; + } + + // Resolve relative paths + let resolvedPath; + if (href.startsWith('/')) { + resolvedPath = href.endsWith('/') ? href : `${href}/`; + } else { + // Relative path - resolve from current path + const currentDir = currentPath.substring(0, currentPath.lastIndexOf('/') + 1); + const resolved = path.posix.resolve(currentDir, href); + resolvedPath = resolved.endsWith('/') ? resolved : `${resolved}/`; + } + + // Only include paths under the same collection + if (resolvedPath.startsWith(startPath) && !visited.has(resolvedPath)) { + links.push(resolvedPath); + } + } + + queue.push(...links); + } catch (error) { + console.warn(` Warning: Could not read ${filePath}: ${error.message}`); + } + } + + return Array.from(pages).sort(); +} + +/** + * Generate PDF for a single collection + */ +async function generateCollectionPDF(browser, collection, siteDir) { + console.log(`\nGenerating PDF for: ${collection.title}`); + console.log(` Collection: ${collection.name}`); + console.log(` Start path: ${collection.startPath}`); + + // Discover all pages in the collection + const pages = discoverPages(siteDir, collection.startPath); + + if (pages.length === 0) { + console.warn(` Warning: No pages found for collection ${collection.name}`); + return null; + } + + console.log(` Found ${pages.length} pages`); + + const pdfPage = await browser.newPage(); + + try { + const pdfPath = path.join(outputDir, collection.filename); + const indexFile = pathToFile(siteDir, collection.startPath); + const indexUrl = fileToUrl(indexFile); + + console.log(` Generating PDF from: ${indexUrl}`); + + await pdfPage.goto(indexUrl, { + waitUntil: 'networkidle0', + timeout: config.waitTimeout || 30000 + }); + + // Wait for content to load + if (config.waitForSelector) { + try { + await pdfPage.waitForSelector(config.waitForSelector, { timeout: 10000 }); + } catch (e) { + // Continue if selector not found + } + } + + // Generate PDF + await pdfPage.pdf({ + path: pdfPath, + format: config.pdfOptions.format || 'A4', + printBackground: config.pdfOptions.printBackground !== false, + margin: config.pdfOptions.margin || {}, + displayHeaderFooter: config.pdfOptions.displayHeaderFooter !== false, + headerTemplate: config.pdfOptions.headerTemplate || '', + footerTemplate: config.pdfOptions.footerTemplate || '', + }); + + console.log(` ✓ Generated: ${pdfPath}`); + return pdfPath; + } catch (error) { + console.error(` ✗ Error generating PDF for ${collection.name}: ${error.message}`); + return null; + } finally { + await pdfPage.close(); + } +} + +/** + * Main function + */ +async function main() { + console.log('OpenSearch Documentation PDF Generator'); + console.log('=====================================\n'); + console.log(`Site Directory: ${siteDir}`); + console.log(`Output Directory: ${outputDir}\n`); + + // Filter collections if specific one is requested + let collectionsToProcess = config.collections; + if (specificCollection) { + collectionsToProcess = config.collections.filter(c => c.name === specificCollection); + if (collectionsToProcess.length === 0) { + console.error(`Error: Collection "${specificCollection}" not found in configuration`); + process.exit(1); + } + } + + console.log(`Processing ${collectionsToProcess.length} collection(s)...\n`); + + // Launch browser + const browser = await puppeteer.launch({ + headless: true, + args: 
['--no-sandbox', '--disable-setuid-sandbox'] + }); + + try { + const results = []; + + for (const collection of collectionsToProcess) { + const pdfPath = await generateCollectionPDF(browser, collection, siteDir); + if (pdfPath) { + results.push({ + collection: collection.name, + title: collection.title, + filename: collection.filename, + path: pdfPath + }); + } + } + + console.log('\n====================================='); + console.log('PDF Generation Complete'); + console.log('=====================================\n'); + console.log(`Generated ${results.length} PDF(s):\n`); + results.forEach(r => { + console.log(` ✓ ${r.title}`); + console.log(` File: ${r.path}\n`); + }); + + } catch (error) { + console.error('Fatal error:', error); + process.exit(1); + } finally { + await browser.close(); + } +} + +// Run if called directly +if (require.main === module) { + main().catch(error => { + console.error('Unhandled error:', error); + process.exit(1); + }); +} + +module.exports = { main }; + diff --git a/package.json b/package.json new file mode 100644 index 00000000000..3c4be00e825 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "opensearch-docs-pdf-generator", + "version": "1.0.0", + "description": "PDF generation tool for OpenSearch documentation", + "scripts": { + "generate-pdfs": "node generate-pdfs.js" + }, + "dependencies": { + "puppeteer": "^21.5.0" + }, + "engines": { + "node": ">=18.0.0" + } +} + diff --git a/pdf-config.json b/pdf-config.json new file mode 100644 index 00000000000..52ad00f68cb --- /dev/null +++ b/pdf-config.json @@ -0,0 +1,91 @@ +{ + "outputDir": "pdfs", + "collections": [ + { + "name": "developer-documentation", + "title": "OpenSearch Developer Guide", + "filename": "opensearch-developer-guide.pdf", + "description": "Complete developer documentation for OpenSearch", + "startPath": "/developer-documentation/" + }, + { + "name": "getting-started", + "title": "OpenSearch Getting Started Guide", + "filename": "opensearch-getting-started.pdf", + "description": "Getting started guide for OpenSearch", + "startPath": "/getting-started/" + }, + { + "name": "api-reference", + "title": "OpenSearch API Reference", + "filename": "opensearch-api-reference.pdf", + "description": "Complete API reference documentation", + "startPath": "/api-reference/" + }, + { + "name": "install-and-configure", + "title": "OpenSearch Install and Configure Guide", + "filename": "opensearch-install-configure.pdf", + "description": "Installation and configuration guide", + "startPath": "/install-and-configure/" + }, + { + "name": "tuning-your-cluster", + "title": "OpenSearch Cluster Tuning Guide", + "filename": "opensearch-cluster-tuning.pdf", + "description": "Guide for creating and tuning your cluster", + "startPath": "/tuning-your-cluster/" + }, + { + "name": "security", + "title": "OpenSearch Security Guide", + "filename": "opensearch-security.pdf", + "description": "Security configuration and management guide", + "startPath": "/security/" + }, + { + "name": "query-dsl", + "title": "OpenSearch Query DSL Guide", + "filename": "opensearch-query-dsl.pdf", + "description": "Query DSL documentation", + "startPath": "/query-dsl/" + }, + { + "name": "search-plugins", + "title": "OpenSearch Search Features Guide", + "filename": "opensearch-search-features.pdf", + "description": "Search features and plugins documentation", + "startPath": "/search-plugins/" + }, + { + "name": "vector-search", + "title": "OpenSearch Vector Search Guide", + "filename": "opensearch-vector-search.pdf", + 
"description": "Vector search documentation", + "startPath": "/vector-search/" + }, + { + "name": "ml-commons-plugin", + "title": "OpenSearch Machine Learning Guide", + "filename": "opensearch-machine-learning.pdf", + "description": "Machine learning plugin documentation", + "startPath": "/ml-commons-plugin/" + } + ], + "pdfOptions": { + "format": "A4", + "printBackground": true, + "margin": { + "top": "20mm", + "right": "15mm", + "bottom": "20mm", + "left": "15mm" + }, + "displayHeaderFooter": true, + "headerTemplate": "
", + "footerTemplate": "
/
" + }, + "waitForSelector": ".main-content", + "waitTimeout": 30000 +} + From cb2d51bebb7800a7717f7bbcf3c7e6b8d87c5d60 Mon Sep 17 00:00:00 2001 From: Arya Soni Date: Tue, 25 Nov 2025 13:06:27 +0530 Subject: [PATCH 34/34] [DOC] Downloadable PDF Developer Guides --- .github/workflows/generate-pdfs.yml | 81 +----- .gitignore | 2 - DEVELOPER_GUIDE.md | 75 +++--- Gemfile | 4 + _config.yml | 5 + _pdf-generator/README.md | 74 ++++++ _pdf-generator/config.yml | 75 ++++++ _pdf-generator/jekyll-pdf-generator.gemspec | 19 ++ _pdf-generator/lib/jekyll-pdf-generator.rb | 160 ++++++++++++ generate-pdfs.js | 276 -------------------- package.json | 15 -- pdf-config.json | 91 ------- 12 files changed, 388 insertions(+), 489 deletions(-) create mode 100644 _pdf-generator/README.md create mode 100644 _pdf-generator/config.yml create mode 100644 _pdf-generator/jekyll-pdf-generator.gemspec create mode 100644 _pdf-generator/lib/jekyll-pdf-generator.rb delete mode 100755 generate-pdfs.js delete mode 100644 package.json delete mode 100644 pdf-config.json diff --git a/.github/workflows/generate-pdfs.yml b/.github/workflows/generate-pdfs.yml index 87fd07a958e..4c6644f62b0 100644 --- a/.github/workflows/generate-pdfs.yml +++ b/.github/workflows/generate-pdfs.yml @@ -1,74 +1,9 @@ -name: Generate PDFs - -on: - workflow_dispatch: - schedule: - # Run weekly on Sundays at 2 AM UTC - - cron: "0 2 * * 0" - # Optional: Run after main branch updates (uncomment if desired) - # push: - # branches: - # - main - -jobs: - generate-pdfs: - if: github.repository == 'opensearch-project/documentation-website' - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Ruby - uses: ruby/setup-ruby@v1 - with: - ruby-version: '3.4.5' - bundler-cache: true - - - name: Build Jekyll site - env: - JEKYLL_ENV: production - run: | - bundle exec jekyll build --future - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '18' - cache: 'npm' - - - name: Install dependencies - run: npm ci - - - name: Generate PDFs - run: | - npm run generate-pdfs - - - name: List generated PDFs - run: | - echo "Generated PDFs:" - ls -lh pdfs/ || echo "No PDFs generated" - - - name: Upload PDFs as artifacts - uses: actions/upload-artifact@v4 - if: always() - with: - name: opensearch-documentation-pdfs - path: pdfs/*.pdf - retention-days: 30 - - # Optional: Create a GitHub release with PDFs - # Uncomment and configure if you want to automatically create releases - # - name: Create GitHub Release - # if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' - # uses: softprops/action-gh-release@v1 - # with: - # files: pdfs/*.pdf - # tag_name: pdfs-${{ github.run_number }} - # name: Documentation PDFs - # body: | - # Automatically generated PDF documentation for OpenSearch. 
- # Generated on ${{ github.run_id }} - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +# PDF generation is now integrated into the Jekyll build process +# PDFs are automatically generated during `jekyll build` and included in _site/pdfs/ +# PDFs will be deployed with the site to S3, accessible at: +# https://docs.opensearch.org/pdfs/.pdf +# +# Note: The main Jekyll build workflow should install wkhtmltopdf: +# - name: Install wkhtmltopdf +# run: sudo apt-get update && sudo apt-get install -y wkhtmltopdf diff --git a/.gitignore b/.gitignore index d9accf2eebc..1ad38e26ed5 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,4 @@ Gemfile.lock .jekyll-cache .project vendor/bundle -node_modules pdfs -package-lock.json diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 702bdeef283..a9e2b92c8c9 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -305,56 +305,67 @@ This will also generate a markdown (.md) file for each API with their rendered c The documentation website supports generating PDF versions of the developer guides and other documentation sections. This feature allows users to download complete documentation sets for offline use, easier searching, and integration with AI tools. -### Generating PDFs Locally +PDF generation is integrated directly into the Jekyll build process using the `jekyll-pdf-generator` plugin. PDFs are automatically generated during `jekyll build` and placed in `_site/pdfs/`, making them available for deployment alongside the HTML site. -To generate PDFs locally: +### Requirements -1. **Install Node.js dependencies:** - ```shell - npm install - ``` +The PDF generator uses the `pdfkit` gem, which requires `wkhtmltopdf` to be installed on your system: -2. **Build the Jekyll site:** - ```shell - bundle exec jekyll build - ``` +**macOS:** +```shell +brew install wkhtmltopdf +``` + +**Ubuntu/Debian:** +```shell +sudo apt-get install wkhtmltopdf +``` -3. **Generate PDFs:** - ```shell - npm run generate-pdfs - ``` +**Windows:** +Download from [wkhtmltopdf downloads](https://wkhtmltopdf.org/downloads.html) - Or generate a PDF for a specific collection: - ```shell - npm run generate-pdfs -- --collection developer-documentation - ``` +### Generating PDFs -The generated PDFs will be saved in the `pdfs/` directory. +PDFs are automatically generated when you build the Jekyll site: + +```shell +bundle exec jekyll build +``` + +Generated PDFs will be in `_site/pdfs/` and will be deployed with the site to S3, making them accessible at: +- `https://docs.opensearch.org/pdfs/opensearch-developer-guide.pdf` +- `https://docs.opensearch.org/pdfs/opensearch-getting-started.pdf` +- etc. ### PDF Generation Configuration -PDF generation is configured in `pdf-config.json`. This file defines: +PDF generation is configured in `_pdf-generator/config.yml`. This file defines: - Which collections to convert to PDFs -- PDF output settings (format, margins, headers, footers) -- Base URL and output directory +- PDF output settings (page size, margins, etc.) -You can customize the configuration by editing `pdf-config.json`. +To enable/disable PDF generation, set the `enabled` flag in `_config.yml`: -### CI/CD Integration +```yaml +pdf_generator: + enabled: true # Set to false to disable PDF generation +``` -PDF generation runs automatically in CI/CD through the [generate-pdfs.yml](.github/workflows/generate-pdfs.yml) GitHub Actions workflow. 
This workflow: +### Adding New Collections -- Runs weekly on Sundays at 2 AM UTC -- Can be triggered manually via `workflow_dispatch` -- Builds the Jekyll site -- Generates PDFs for all configured collections -- Uploads PDFs as GitHub Actions artifacts +To add a new collection for PDF generation, edit `_pdf-generator/config.yml`: -The workflow runs separately from the main Jekyll build to avoid adding to build time. +```yaml +collections: + - name: my-collection + title: My Collection Title + filename: my-collection.pdf + description: Description of the collection + start_path: /my-collection/ +``` ### Available PDFs -The following documentation sections are available as PDFs (as configured in `pdf-config.json`): +The following documentation sections are available as PDFs (as configured in `_pdf-generator/config.yml`): - OpenSearch Developer Guide - Getting Started Guide diff --git a/Gemfile b/Gemfile index b1d64c15d84..c476fb65086 100644 --- a/Gemfile +++ b/Gemfile @@ -27,6 +27,7 @@ group :jekyll_plugins do gem 'jekyll-last-modified-at' gem 'jekyll-sitemap' gem 'jekyll-spec-insert', :path => './spec-insert' + gem 'jekyll-pdf-generator', :path => './_pdf-generator' end # Windows does not include zoneinfo files, so bundle the tzinfo-data gem @@ -47,6 +48,9 @@ gem 'typhoeus' gem 'activesupport', '~> 7' gem 'mustache', '~> 1' +# PDF Generator +gem 'pdfkit', '~> 0.8' + group :development, :test do gem 'rspec' gem 'rubocop', '~> 1.44', require: false diff --git a/_config.yml b/_config.yml index 122b2ce6763..da09b528e6f 100644 --- a/_config.yml +++ b/_config.yml @@ -329,6 +329,11 @@ plugins: - jekyll-redirect-from - jekyll-sitemap - jekyll-spec-insert + - jekyll-pdf-generator + +# PDF Generator configuration +pdf_generator: + enabled: true # This format has to conform to RFC822 last-modified-at: diff --git a/_pdf-generator/README.md b/_pdf-generator/README.md new file mode 100644 index 00000000000..7ff0b5128fa --- /dev/null +++ b/_pdf-generator/README.md @@ -0,0 +1,74 @@ +# Jekyll PDF Generator + +A Jekyll plugin that automatically generates PDF versions of documentation collections during the site build process. + +## Overview + +This plugin integrates PDF generation directly into the Jekyll build process. When you run `jekyll build`, PDFs are automatically generated for configured documentation collections and placed in the `_site/pdfs/` directory, making them available for deployment alongside the HTML site. + +## Requirements + +- **wkhtmltopdf**: The plugin uses the `pdfkit` gem, which requires `wkhtmltopdf` to be installed on your system. + +### Installing wkhtmltopdf + +**macOS:** +```bash +brew install wkhtmltopdf +``` + +**Ubuntu/Debian:** +```bash +sudo apt-get install wkhtmltopdf +``` + +**Windows:** +Download from [wkhtmltopdf downloads](https://wkhtmltopdf.org/downloads.html) + +## Configuration + +PDF generation is configured in `_pdf-generator/config.yml`. This file defines: +- Which collections to convert to PDFs +- PDF output settings (page size, margins, etc.) 
+ +To enable/disable PDF generation, set the `enabled` flag in `_config.yml`: + +```yaml +pdf_generator: + enabled: true # Set to false to disable PDF generation +``` + +## Usage + +PDFs are automatically generated during the Jekyll build process: + +```bash +bundle exec jekyll build +``` + +Generated PDFs will be in `_site/pdfs/` and will be deployed with the site to S3, making them accessible at: +- `https://docs.opensearch.org/pdfs/opensearch-developer-guide.pdf` +- `https://docs.opensearch.org/pdfs/opensearch-getting-started.pdf` +- etc. + +## Adding New Collections + +To add a new collection for PDF generation, edit `_pdf-generator/config.yml`: + +```yaml +collections: + - name: my-collection + title: My Collection Title + filename: my-collection.pdf + description: Description of the collection + start_path: /my-collection/ +``` + +## Troubleshooting + +If PDF generation fails: +1. Ensure `wkhtmltopdf` is installed and in your PATH +2. Check that the collection's `start_path` points to a valid page +3. Verify the plugin is enabled in `_config.yml` +4. Check Jekyll build logs for error messages + diff --git a/_pdf-generator/config.yml b/_pdf-generator/config.yml new file mode 100644 index 00000000000..906fe1b9a05 --- /dev/null +++ b/_pdf-generator/config.yml @@ -0,0 +1,75 @@ +# PDF Generator Configuration +# This file defines which documentation collections should be converted to PDFs + +collections: + - name: developer-documentation + title: OpenSearch Developer Guide + filename: opensearch-developer-guide.pdf + description: Complete developer documentation for OpenSearch + start_path: /developer-documentation/ + + - name: getting-started + title: OpenSearch Getting Started Guide + filename: opensearch-getting-started.pdf + description: Getting started guide for OpenSearch + start_path: /getting-started/ + + - name: api-reference + title: OpenSearch API Reference + filename: opensearch-api-reference.pdf + description: Complete API reference documentation + start_path: /api-reference/ + + - name: install-and-configure + title: OpenSearch Install and Configure Guide + filename: opensearch-install-configure.pdf + description: Installation and configuration guide + start_path: /install-and-configure/ + + - name: tuning-your-cluster + title: OpenSearch Cluster Tuning Guide + filename: opensearch-cluster-tuning.pdf + description: Guide for creating and tuning your cluster + start_path: /tuning-your-cluster/ + + - name: security + title: OpenSearch Security Guide + filename: opensearch-security.pdf + description: Security configuration and management guide + start_path: /security/ + + - name: query-dsl + title: OpenSearch Query DSL Guide + filename: opensearch-query-dsl.pdf + description: Query DSL documentation + start_path: /query-dsl/ + + - name: search-plugins + title: OpenSearch Search Features Guide + filename: opensearch-search-features.pdf + description: Search features and plugins documentation + start_path: /search-plugins/ + + - name: vector-search + title: OpenSearch Vector Search Guide + filename: opensearch-vector-search.pdf + description: Vector search documentation + start_path: /vector-search/ + + - name: ml-commons-plugin + title: OpenSearch Machine Learning Guide + filename: opensearch-machine-learning.pdf + description: Machine learning plugin documentation + start_path: /ml-commons-plugin/ + +# PDF generation options +pdf_options: + page_size: A4 + margin_top: 20mm + margin_right: 15mm + margin_bottom: 20mm + margin_left: 15mm + print_media_type: true + 
disable_smart_shrinking: false + encoding: UTF-8 + diff --git a/_pdf-generator/jekyll-pdf-generator.gemspec b/_pdf-generator/jekyll-pdf-generator.gemspec new file mode 100644 index 00000000000..7e47f9e8aad --- /dev/null +++ b/_pdf-generator/jekyll-pdf-generator.gemspec @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +Gem::Specification.new do |spec| + spec.name = 'jekyll-pdf-generator' + spec.version = '0.1.0' + spec.authors = ['OpenSearch Contributors'] + spec.email = [] + + spec.summary = 'A Jekyll plugin for generating PDF versions of documentation collections.' + + spec.files = Dir['lib/**/*.rb'] + spec.require_paths = ['lib'] + + spec.metadata['rubygems_mfa_required'] = 'true' + spec.required_ruby_version = '>= 3.1.0' + + spec.add_dependency 'pdfkit', '~> 0.8' +end + diff --git a/_pdf-generator/lib/jekyll-pdf-generator.rb b/_pdf-generator/lib/jekyll-pdf-generator.rb new file mode 100644 index 00000000000..01fbc2e9ba2 --- /dev/null +++ b/_pdf-generator/lib/jekyll-pdf-generator.rb @@ -0,0 +1,160 @@ +# frozen_string_literal: true + +require 'pdfkit' +require 'yaml' +require 'fileutils' +require 'pathname' + +module Jekyll + module PdfGenerator + # PDF Generator plugin for Jekyll + class Generator + def initialize(site) + @site = site + @config = load_config + @pdf_options = build_pdf_options + configure_pdfkit + end + + def generate + return unless enabled? + + Jekyll.logger.info 'PDF Generator:', 'Starting PDF generation...' + + collections = @config['collections'] || [] + return if collections.empty? + + site_dir = @site.dest + output_dir = File.join(site_dir, 'pdfs') + FileUtils.mkdir_p(output_dir) + + collections.each do |collection| + generate_collection_pdf(collection, site_dir, output_dir) + end + + Jekyll.logger.info 'PDF Generator:', 'PDF generation complete.' + end + + private + + def enabled? + @site.config['pdf_generator'] && @site.config['pdf_generator']['enabled'] != false + end + + def load_config + config_path = File.join(@site.source, '_pdf-generator', 'config.yml') + return {} unless File.exist?(config_path) + + YAML.safe_load(File.read(config_path)) || {} + rescue StandardError => e + Jekyll.logger.warn 'PDF Generator:', "Error loading config: #{e.message}" + {} + end + + def build_pdf_options + opts = @config['pdf_options'] || {} + options = {} + + options['page-size'] = opts['page_size'] || 'A4' + options['margin-top'] = opts['margin_top'] || '20mm' + options['margin-right'] = opts['margin_right'] || '15mm' + options['margin-bottom'] = opts['margin_bottom'] || '20mm' + options['margin-left'] = opts['margin_left'] || '15mm' + options['print-media-type'] = opts['print_media_type'] != false + options['disable-smart-shrinking'] = opts['disable_smart_shrinking'] == true + options['encoding'] = opts['encoding'] || 'UTF-8' + options['quiet'] = true + + options + end + + def generate_collection_pdf(collection, site_dir, output_dir) + collection_name = collection['name'] + filename = collection['filename'] + start_path = collection['start_path'] + title = collection['title'] || collection_name + + Jekyll.logger.info 'PDF Generator:', "Generating PDF for #{title}..." 
+ + # Find the index page for this collection + index_path = find_index_page(site_dir, start_path) + unless index_path + Jekyll.logger.warn 'PDF Generator:', "Index page not found for #{start_path}" + return + end + + # Convert to file:// URL + file_url = "file://#{File.expand_path(index_path)}" + + begin + # Create PDFKit instance + kit = PDFKit.new(file_url, @pdf_options) + + # Generate PDF + pdf_path = File.join(output_dir, filename) + pdf_data = kit.to_pdf + + # Write PDF file + File.binwrite(pdf_path, pdf_data) + + Jekyll.logger.info 'PDF Generator:', "✓ Generated: #{filename}" + rescue StandardError => e + Jekyll.logger.error 'PDF Generator:', "Error generating PDF for #{title}: #{e.message}" + Jekyll.logger.error 'PDF Generator:', e.backtrace.join("\n") if @site.config['verbose'] + end + end + + def find_index_page(site_dir, start_path) + # Remove leading slash and normalize path + path = start_path.sub(%r{^/}, '').sub(%r{/$}, '') + + # Try index.html first + index_path = File.join(site_dir, path, 'index.html') + return index_path if File.exist?(index_path) + + # Try without index.html (if path is already a file) + file_path = File.join(site_dir, "#{path}.html") + return file_path if File.exist?(file_path) + + # Try with .html extension + html_path = File.join(site_dir, path, '.html') + return html_path if File.exist?(html_path) + + nil + end + + def configure_pdfkit + # Try to find wkhtmltopdf in common locations + wkhtmltopdf_path = find_wkhtmltopdf + if wkhtmltopdf_path + PDFKit.configure do |config| + config.wkhtmltopdf = wkhtmltopdf_path + end + end + end + + def find_wkhtmltopdf + # Try common locations for wkhtmltopdf + common_paths = [ + '/usr/local/bin/wkhtmltopdf', + '/usr/bin/wkhtmltopdf', + `which wkhtmltopdf 2>/dev/null`.strip + ].reject(&:empty?) + + common_paths.each do |path| + return path if File.exist?(path) && File.executable?(path) + end + + # If not found, let PDFKit use its default detection + nil + end + end + end +end + +# Register Jekyll hook to generate PDFs after site is written +Jekyll::Hooks.register :site, :post_write do |site| + generator = Jekyll::PdfGenerator::Generator.new(site) + generator.generate +end + diff --git a/generate-pdfs.js b/generate-pdfs.js deleted file mode 100755 index ee6f9165113..00000000000 --- a/generate-pdfs.js +++ /dev/null @@ -1,276 +0,0 @@ -#!/usr/bin/env node - -/** - * PDF Generation Script for OpenSearch Documentation - * - * This script generates PDF files from the built Jekyll documentation site. - * It uses Puppeteer to render HTML pages from the _site/ directory. - * - * Usage: - * node generate-pdfs.js [--site-dir ] [--output-dir ] [--collection ] - * - * Options: - * --site-dir: Path to _site directory (default: _site) - * --output-dir: Directory to output PDFs (default: pdfs) - * --collection: Generate PDF for specific collection only (optional) - */ - -const fs = require('fs'); -const path = require('path'); -const puppeteer = require('puppeteer'); - -// Load configuration -const configPath = path.join(__dirname, 'pdf-config.json'); -const config = JSON.parse(fs.readFileSync(configPath, 'utf8')); - -// Parse command line arguments -const args = process.argv.slice(2); -const getArg = (flag) => { - const index = args.indexOf(flag); - return index !== -1 && args[index + 1] ? 
args[index + 1] : null; -}; - -const siteDir = path.resolve(getArg('--site-dir') || '_site'); -const outputDir = getArg('--output-dir') || config.outputDir || 'pdfs'; -const specificCollection = getArg('--collection'); - -// Check if _site directory exists -if (!fs.existsSync(siteDir)) { - console.error(`Error: Site directory not found: ${siteDir}`); - console.error('Please build the Jekyll site first: bundle exec jekyll build'); - process.exit(1); -} - -// Ensure output directory exists -if (!fs.existsSync(outputDir)) { - fs.mkdirSync(outputDir, { recursive: true }); -} - -/** - * Convert a URL path to a file path in _site directory - */ -function pathToFile(siteDir, urlPath) { - // Remove leading slash and ensure it ends with index.html or .html - let filePath = urlPath.replace(/^\//, ''); - if (filePath.endsWith('/')) { - filePath = filePath + 'index.html'; - } else if (!filePath.endsWith('.html')) { - filePath = filePath + '/index.html'; - } - return path.join(siteDir, filePath); -} - -/** - * Convert a file path to a file:// URL - */ -function fileToUrl(filePath) { - // Convert to absolute path and use file:// protocol - const absolutePath = path.resolve(filePath); - // On Windows, we need to handle drive letters differently - if (process.platform === 'win32') { - return `file:///${absolutePath.replace(/\\/g, '/')}`; - } - return `file://${absolutePath}`; -} - -/** - * Discover all pages in a collection by scanning _site directory - */ -function discoverPages(siteDir, startPath) { - const pages = new Set(); - const startFile = pathToFile(siteDir, startPath); - - if (!fs.existsSync(startFile)) { - console.warn(` Warning: Start file not found: ${startFile}`); - return Array.from(pages); - } - - // Read the HTML file to find links - const visited = new Set(); - const queue = [startPath]; - - while (queue.length > 0) { - const currentPath = queue.shift(); - if (visited.has(currentPath)) continue; - visited.add(currentPath); - - const filePath = pathToFile(siteDir, currentPath); - if (!fs.existsSync(filePath)) continue; - - pages.add(currentPath); - - try { - const content = fs.readFileSync(filePath, 'utf8'); - // Extract href attributes from anchor tags - const hrefRegex = /]+href=["']([^"']+)["']/gi; - const links = []; - let match; - - while ((match = hrefRegex.exec(content)) !== null) { - const href = match[1]; - // Skip external links, anchors, and non-HTML links - if (href.startsWith('http://') || - href.startsWith('https://') || - href.startsWith('#') || - href.startsWith('mailto:') || - href.match(/\.(css|js|json|png|jpg|gif|svg|woff|woff2)$/i)) { - continue; - } - - // Resolve relative paths - let resolvedPath; - if (href.startsWith('/')) { - resolvedPath = href.endsWith('/') ? href : `${href}/`; - } else { - // Relative path - resolve from current path - const currentDir = currentPath.substring(0, currentPath.lastIndexOf('/') + 1); - const resolved = path.posix.resolve(currentDir, href); - resolvedPath = resolved.endsWith('/') ? 
resolved : `${resolved}/`; - } - - // Only include paths under the same collection - if (resolvedPath.startsWith(startPath) && !visited.has(resolvedPath)) { - links.push(resolvedPath); - } - } - - queue.push(...links); - } catch (error) { - console.warn(` Warning: Could not read ${filePath}: ${error.message}`); - } - } - - return Array.from(pages).sort(); -} - -/** - * Generate PDF for a single collection - */ -async function generateCollectionPDF(browser, collection, siteDir) { - console.log(`\nGenerating PDF for: ${collection.title}`); - console.log(` Collection: ${collection.name}`); - console.log(` Start path: ${collection.startPath}`); - - // Discover all pages in the collection - const pages = discoverPages(siteDir, collection.startPath); - - if (pages.length === 0) { - console.warn(` Warning: No pages found for collection ${collection.name}`); - return null; - } - - console.log(` Found ${pages.length} pages`); - - const pdfPage = await browser.newPage(); - - try { - const pdfPath = path.join(outputDir, collection.filename); - const indexFile = pathToFile(siteDir, collection.startPath); - const indexUrl = fileToUrl(indexFile); - - console.log(` Generating PDF from: ${indexUrl}`); - - await pdfPage.goto(indexUrl, { - waitUntil: 'networkidle0', - timeout: config.waitTimeout || 30000 - }); - - // Wait for content to load - if (config.waitForSelector) { - try { - await pdfPage.waitForSelector(config.waitForSelector, { timeout: 10000 }); - } catch (e) { - // Continue if selector not found - } - } - - // Generate PDF - await pdfPage.pdf({ - path: pdfPath, - format: config.pdfOptions.format || 'A4', - printBackground: config.pdfOptions.printBackground !== false, - margin: config.pdfOptions.margin || {}, - displayHeaderFooter: config.pdfOptions.displayHeaderFooter !== false, - headerTemplate: config.pdfOptions.headerTemplate || '', - footerTemplate: config.pdfOptions.footerTemplate || '', - }); - - console.log(` ✓ Generated: ${pdfPath}`); - return pdfPath; - } catch (error) { - console.error(` ✗ Error generating PDF for ${collection.name}: ${error.message}`); - return null; - } finally { - await pdfPage.close(); - } -} - -/** - * Main function - */ -async function main() { - console.log('OpenSearch Documentation PDF Generator'); - console.log('=====================================\n'); - console.log(`Site Directory: ${siteDir}`); - console.log(`Output Directory: ${outputDir}\n`); - - // Filter collections if specific one is requested - let collectionsToProcess = config.collections; - if (specificCollection) { - collectionsToProcess = config.collections.filter(c => c.name === specificCollection); - if (collectionsToProcess.length === 0) { - console.error(`Error: Collection "${specificCollection}" not found in configuration`); - process.exit(1); - } - } - - console.log(`Processing ${collectionsToProcess.length} collection(s)...\n`); - - // Launch browser - const browser = await puppeteer.launch({ - headless: true, - args: ['--no-sandbox', '--disable-setuid-sandbox'] - }); - - try { - const results = []; - - for (const collection of collectionsToProcess) { - const pdfPath = await generateCollectionPDF(browser, collection, siteDir); - if (pdfPath) { - results.push({ - collection: collection.name, - title: collection.title, - filename: collection.filename, - path: pdfPath - }); - } - } - - console.log('\n====================================='); - console.log('PDF Generation Complete'); - console.log('=====================================\n'); - console.log(`Generated ${results.length} 
PDF(s):\n`); - results.forEach(r => { - console.log(` ✓ ${r.title}`); - console.log(` File: ${r.path}\n`); - }); - - } catch (error) { - console.error('Fatal error:', error); - process.exit(1); - } finally { - await browser.close(); - } -} - -// Run if called directly -if (require.main === module) { - main().catch(error => { - console.error('Unhandled error:', error); - process.exit(1); - }); -} - -module.exports = { main }; - diff --git a/package.json b/package.json deleted file mode 100644 index 3c4be00e825..00000000000 --- a/package.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "opensearch-docs-pdf-generator", - "version": "1.0.0", - "description": "PDF generation tool for OpenSearch documentation", - "scripts": { - "generate-pdfs": "node generate-pdfs.js" - }, - "dependencies": { - "puppeteer": "^21.5.0" - }, - "engines": { - "node": ">=18.0.0" - } -} - diff --git a/pdf-config.json b/pdf-config.json deleted file mode 100644 index 52ad00f68cb..00000000000 --- a/pdf-config.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "outputDir": "pdfs", - "collections": [ - { - "name": "developer-documentation", - "title": "OpenSearch Developer Guide", - "filename": "opensearch-developer-guide.pdf", - "description": "Complete developer documentation for OpenSearch", - "startPath": "/developer-documentation/" - }, - { - "name": "getting-started", - "title": "OpenSearch Getting Started Guide", - "filename": "opensearch-getting-started.pdf", - "description": "Getting started guide for OpenSearch", - "startPath": "/getting-started/" - }, - { - "name": "api-reference", - "title": "OpenSearch API Reference", - "filename": "opensearch-api-reference.pdf", - "description": "Complete API reference documentation", - "startPath": "/api-reference/" - }, - { - "name": "install-and-configure", - "title": "OpenSearch Install and Configure Guide", - "filename": "opensearch-install-configure.pdf", - "description": "Installation and configuration guide", - "startPath": "/install-and-configure/" - }, - { - "name": "tuning-your-cluster", - "title": "OpenSearch Cluster Tuning Guide", - "filename": "opensearch-cluster-tuning.pdf", - "description": "Guide for creating and tuning your cluster", - "startPath": "/tuning-your-cluster/" - }, - { - "name": "security", - "title": "OpenSearch Security Guide", - "filename": "opensearch-security.pdf", - "description": "Security configuration and management guide", - "startPath": "/security/" - }, - { - "name": "query-dsl", - "title": "OpenSearch Query DSL Guide", - "filename": "opensearch-query-dsl.pdf", - "description": "Query DSL documentation", - "startPath": "/query-dsl/" - }, - { - "name": "search-plugins", - "title": "OpenSearch Search Features Guide", - "filename": "opensearch-search-features.pdf", - "description": "Search features and plugins documentation", - "startPath": "/search-plugins/" - }, - { - "name": "vector-search", - "title": "OpenSearch Vector Search Guide", - "filename": "opensearch-vector-search.pdf", - "description": "Vector search documentation", - "startPath": "/vector-search/" - }, - { - "name": "ml-commons-plugin", - "title": "OpenSearch Machine Learning Guide", - "filename": "opensearch-machine-learning.pdf", - "description": "Machine learning plugin documentation", - "startPath": "/ml-commons-plugin/" - } - ], - "pdfOptions": { - "format": "A4", - "printBackground": true, - "margin": { - "top": "20mm", - "right": "15mm", - "bottom": "20mm", - "left": "15mm" - }, - "displayHeaderFooter": true, - "headerTemplate": "
", - "footerTemplate": "
/
" - }, - "waitForSelector": ".main-content", - "waitTimeout": 30000 -} -