From fbb34cf5a3554ac371efbd4e8abbbf8d7fdb0bb6 Mon Sep 17 00:00:00 2001 From: Ian Hoang Date: Mon, 3 Nov 2025 15:20:21 -0600 Subject: [PATCH 1/9] Add Synthetic Data Generation & Features to OSB Documentation Signed-off-by: Ian Hoang --- _benchmark/features/index.md | 20 + .../custom-logic-sdg.md | 227 ++++++++++ .../synthetic-data-generation/index.md | 41 ++ .../synthetic-data-generation/mapping-sdg.md | 396 ++++++++++++++++++ .../synthetic-data-generation/tips.md | 15 + .../reference/commands/generate-data.md | 66 +++ 6 files changed, 765 insertions(+) create mode 100644 _benchmark/features/index.md create mode 100644 _benchmark/features/synthetic-data-generation/custom-logic-sdg.md create mode 100644 _benchmark/features/synthetic-data-generation/index.md create mode 100644 _benchmark/features/synthetic-data-generation/mapping-sdg.md create mode 100644 _benchmark/features/synthetic-data-generation/tips.md create mode 100644 _benchmark/reference/commands/generate-data.md diff --git a/_benchmark/features/index.md b/_benchmark/features/index.md new file mode 100644 index 00000000000..be224bebfdb --- /dev/null +++ b/_benchmark/features/index.md @@ -0,0 +1,20 @@ +--- +layout: default +title: Features +nav_order: 30 +has_children: true +has_toc: false +redirect_from: + - /benchmark/features/ + - /benchmark/features/index/ +more_cards: + - heading: "Synthetic Data Generation" + description: "Generate synthetic data for benchmarking" + link: "/benchmark/features/synthetic-data-generation/" +--- + +# Features + +On top of general benchmarking, OpenSearch Benchmark comes with other features. See the following sections for more information on each. + +{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md new file mode 100644 index 00000000000..2a53dd48319 --- /dev/null +++ b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md @@ -0,0 +1,227 @@ +--- +layout: default +title: Custom Logic Approach +nav_order: 35 +parent: Synthetic Data Generation +grand_parent: Features +--- + +# Generating Data with Custom Logic + +To invoke synthetic data generation, you'll need to provide either one of the two required input files: +* OpenSearch index mappings +* Custom logic (via Python module) + +This document explores using custom logic to generate synthetic data. + +### Prerequisites + +* **Required**: Custom logic defined in Python module +* **Optional**: Synthetic Data Generation Config + +Python module with custom logic **must include** `generate_synthetic_data(providers, **custom_lists)` within. +{: .important} + +### Overview + +This approach offers the most granular control over how synthetic data is produced in OpenSearch Benchmark. This is especially useful for users who understand the distribution of their data and the relationship between different fields. 
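At its core, a custom module only needs to define the entry point that OpenSearch Benchmark calls to produce each document. The following minimal sketch shows that contract; the `providers` calls mirror the ones used in the fuller example that follows, and the field names are placeholders rather than part of any real workload.

```python
import random

def generate_synthetic_data(providers, **custom_lists):
    # Mimesis-backed providers supplied by OpenSearch Benchmark
    generic = providers['generic']

    # Return a single document as a dictionary; OpenSearch Benchmark calls this
    # function for every synthetic document it produces.
    return {
        "user_id": f"U{random.randint(0, 9999):04d}",       # placeholder ID
        "first_name": generic.person.first_name(),          # generated name
        "signup_date": generic.datetime.formatted_date(),   # generated date
        "status": random.choice(["active", "inactive"]),    # random choice
    }
```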
+ +An example of what a valid Python module with custom logic that can be provided is shown below: + +```shell +from mimesis.providers.base import BaseProvider +from mimesis.enums import TimestampFormat + +import random + +GEOGRAPHIC_CLUSTERS = { + 'Manhattan': { + 'center': {'lat': 40.7831, 'lon': -73.9712}, + 'radius': 0.05 # degrees + }, + 'Brooklyn': { + 'center': {'lat': 40.6782, 'lon': -73.9442}, + 'radius': 0.05 + }, + 'Austin': { + 'center': {'lat': 30.2672, 'lon': -97.7431}, + 'radius': 0.1 # Increased radius to cover more of Austin + } +} + +def generate_location(cluster): + """Generate a random location within a cluster""" + center = GEOGRAPHIC_CLUSTERS[cluster]['center'] + radius = GEOGRAPHIC_CLUSTERS[cluster]['radius'] + lat = center['lat'] + random.uniform(-radius, radius) + lon = center['lon'] + random.uniform(-radius, radius) + return {'lat': lat, 'lon': lon} + +class NumericString(BaseProvider): + class Meta: + name = "numeric_string" + + @staticmethod + def generate(length=5) -> str: + return ''.join([str(random.randint(0, 9)) for _ in range(length)]) + +class MultipleChoices(BaseProvider): + class Meta: + name = "multiple_choices" + + @staticmethod + def generate(choices, num_of_choices=5) -> str: + import logging + logger = logging.getLogger(__name__) + logger.info("Choices: %s", choices) + logger.info("Length: %s", num_of_choices) + total_choices_available = len(choices) - 1 + + return [choices[random.randint(0, total_choices_available)] for _ in range(num_of_choices)] + +def generate_synthetic_document(providers, **custom_lists): + generic = providers['generic'] + random_mimesis = providers['random'] + + first_name = generic.person.first_name() + last_name = generic.person.last_name() + city = random.choice(list(GEOGRAPHIC_CLUSTERS.keys())) + + # Driver Document + document = { + "dog_driver_id": f"DD{generic.numeric_string.generate(length=4)}", + "dog_name": random_mimesis.choice(custom_lists['dog_names']), + "dog_breed": random_mimesis.choice(custom_lists['dog_breeds']), + "license_number": f"{random_mimesis.choice(custom_lists['license_plates'])}{generic.numeric_string.generate(length=4)}", + "favorite_treats": random_mimesis.choice(custom_lists['treats']), + "preferred_tip": random_mimesis.choice(custom_lists['tips']), + "vehicle_type": random_mimesis.choice(custom_lists['vehicle_types']), + "vehicle_make": random_mimesis.choice(custom_lists['vehicle_makes']), + "vehicle_model": random_mimesis.choice(custom_lists['vehicle_models']), + "vehicle_year": random_mimesis.choice(custom_lists['vehicle_years']), + "vehicle_color": random_mimesis.choice(custom_lists['vehicle_colors']), + "license_plate": random_mimesis.choice(custom_lists['license_plates']), + "current_location": generate_location(city), + "status": random.choice(['available', 'busy', 'offline']), + "current_ride": f"R{generic.numeric_string.generate(length=6)}", + "account_status": random_mimesis.choice(custom_lists['account_status']), + "join_date": generic.datetime.formatted_date(), + "total_rides": generic.numeric.integer_number(start=1, end=200), + "rating": generic.numeric.float_number(start=1.0, end=5.0, precision=2), + "earnings": { + "today": { + "amount": generic.numeric.float_number(start=1.0, end=5.0, precision=2), + "currency": "USD" + }, + "this_week": { + "amount": generic.numeric.float_number(start=1.0, end=5.0, precision=2), + "currency": "USD" + }, + "this_month": { + "amount": generic.numeric.float_number(start=1.0, end=5.0, precision=2), + "currency": "USD" + } + }, + 
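        # Static values can also be hard-coded alongside generated fields, as in the next line: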
"last_grooming_check": "2023-12-01", + "owner": { + "first_name": first_name, + "last_name": last_name, + "email": f"{first_name}{last_name}@gmail.com" + }, + "special_skills": generic.multiple_choices.generate(custom_lists['skills'], num_of_choices=3), + "bark_volume": generic.numeric.float_number(start=1.0, end=10.0, precision=2), + "tail_wag_speed": generic.numeric.float_number(start=1.0, end=10.0, precision=1) + } + + return document +``` +This example Python module has custom logic to generate documents related to dog drivers for a fictional ride-sharing company called *Pawber*, who uses OpenSearch to store and search across large volumes of ride-sharing data. + +In the module above, notice that there's function called `generate_synthetic_data(providers, **custom_lists)`. OpenSearch Benchmark expects that all custom modules provided must have this function defined along with its parameters. This function informs OpenSearch Benchmark on how to generate a synthetic document. +{: .important} + +Next, we'll see how we can use this to generate documents. + +### Command Parameters + +```shell +osb generate-data --custom-module ~/Desktop/http-logs.py --index-name http-logs-regenerated --output-path ~/Desktop/sdg_outputs/ --total-size 2 +``` + +* `generate-data` (required): sub-command that activates synthetic data generation in OpenSearch Benchmark +* `--custom-module` or `-m` (required): Path to Python logic that includes custom logic + +For `--custom-module` parameter, the custom Python module provided must include `generate_synthetic_data(providers, **custom_lists)`. +{: .important} + +* `--index-name` or `-n` (required): Name of data corpora generated +* `--output-path` or `-p` (required): Path where data should be generated in +* `--total-size` or `-s` (required): Total amount of data that should be generated in GB +* `--custom-config` or `-c` (optional): Path to YAML config defining rules for how data should be generated. This is further explored in the subsequent section +* `--test-document` or `-t` (optional): When flag is present, OSB generates a single synthetic document and outputs to the console. Provides users a way to verify that the example document generated is aligned with expectations. When the flag is not present, the entire data corpora will be generated + +### Example Output + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + + +[NOTE] ✨ Dashboard link to monitor processes and task streams: [http://127.0.0.1:8787/status] +[NOTE] ✨ For users who are running generation on a virtual machine, consider SSH port forwarding (tunneling) to localhost to view dashboard. +[NOTE] Example of localhost command for SSH port forwarding (tunneling) from an AWS EC2 instance: +ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ + +Total GB to generate: [1] +Average document size in bytes: [412] +Max file size in GB: [40] + +100%|███████████████████████████████████████████████████████████████████| 100.07G/100.07G [3:35:29<00:00, 3.98MB/s] + +Generated 24271844660 docs in 12000 seconds. Total dataset size is 100.21GB. 
+✅ Visit the following path to view synthetically generated data: /home/ec2-user/ + +----------------------------------- +[INFO] ✅ SUCCESS (took 272 seconds) +----------------------------------- +``` +This is an example output of what it might look like if you generated 100GB. + + +### Using synthetic data generation config + +Using a synthetic data generation config is not necessary for this approach unless users prefer to store custom logic in the config file for organizational purposes. + +To store custom logic in the config file, the synthetic data generation config must have *CustomGenerationValues* defined and can have *custom_lists* and *custom_providers* defined. + +* **custom_lists** → Key, value pair mapping. Keys are names of lists and values are list of values. +* **custom_providers** → “Custom Providers” from Mimesis. Synthetic data generation in OpenSearch Benchmark uses Mimesis under the hood. These should be defined in the same file as the custom Python module supplied. + +Example of synthetic data generation config with *CustomGenerationValues* defined: + +```yml +CustomGenerationValues: + # For users who want to generate data via a custom Python module + custom_lists: + # Custom lists for users who are using a custom Python module and want to consolidate all values in this YAML file + dog_names: [Hana, Youpie, Charlie, Lucy, Cooper, Luna, Rocky, Daisy, Buddy, Molly] + dog_breeds: [Jindo, Labrador, German Shepherd, Golden Retriever, Bulldog, Poodle, Beagle, Rottweiler, Boxer, Dachshund, Chihuahua] + treats: [cookies, pup_cup, jerky] + custom_providers: + # OSB's synthetic data generator uses mimesis and custom providers are essentially custom Python classes that adds more functionality to Mimesis + - NumericString + - MultipleChoices +``` + +To use the synthetic data generation config with CustomGenerationValues defined, supply the following parameter to the generate-data command: + +```shell +--custom-config ~/Desktop/sdg-config.yml +``` + +OpenSearch Benchmark will now be using those custom_lists and custom_providers defined when generating synthetic data. + diff --git a/_benchmark/features/synthetic-data-generation/index.md b/_benchmark/features/synthetic-data-generation/index.md new file mode 100644 index 00000000000..08f8f9c8e54 --- /dev/null +++ b/_benchmark/features/synthetic-data-generation/index.md @@ -0,0 +1,41 @@ +--- +layout: default +title: Synthetic Data Generation +nav_order: 5 +has_children: true +parent: Features +has_toc: false +redirect_from: + - /benchmark/features/synthetic-data-generation/ + - /benchmark/features/synthetic-data-generation/index/ +more_cards: + - heading: "OpenSearch Index Mappings Approach" + description: "Generating synthetic data with OpenSearch Index Mappings" + link: "/benchmark/features/synthetic-data-generation/mapping-sdg/" + - heading: "Custom Logic Approach" + description: "Generate synthetic data with custom logic" + link: "/benchmark/features/synthetic-data-generation/custom-logic-sdg/" + - heading: "Tips and Tricks" + description: "Advice to help you generate data efficiently" + link: "/benchmark/features/synthetic-data-generation/tips/" +--- + +# Synthetic Data Generation + +## Overview + +Starting in 2.0, OpenSearch Benchmark comes with its own synthetic data generator that can generate any dataset, for any use case, at any scale. OpenSearch Benchmark’s synthetic data generator currently uses Random data generation and Rule-based generation to generate synthetic data. 
* Random data generation produces data with random values. This is useful for stress testing systems.
+* Rule-based generation produces data according to specific rules defined by users. This is useful for stress testing systems and benchmarking use cases.
+
+However, there are plans to support other techniques such as:
+
+* **Data masking and anonymization**: altering existing data’s specific fields to ensure privacy
+* **Data transformation**: taking existing data and using statistical methods to generate new data
+
+## Usage
+
+Users can generate synthetic data with the `generate-data` subcommand. There are currently two ways to generate synthetic data in OpenSearch Benchmark — with an OpenSearch index mapping or a custom Python module. The following pages explore generating data with OpenSearch index mappings, generating data with a custom Python module, and general tips and tricks.
+
+{% include cards.html cards=page.more_cards %}
\ No newline at end of file
diff --git a/_benchmark/features/synthetic-data-generation/mapping-sdg.md b/_benchmark/features/synthetic-data-generation/mapping-sdg.md
new file mode 100644
index 00000000000..c160b937209
--- /dev/null
+++ b/_benchmark/features/synthetic-data-generation/mapping-sdg.md
@@ -0,0 +1,396 @@
+---
+layout: default
+title: Mapping Approach
+nav_order: 15
+parent: Synthetic Data Generation
+grand_parent: Features
+---
+
+# Generating Data with OpenSearch Index Mappings
+
+To invoke synthetic data generation, you'll need to provide one of the following two input files:
+* OpenSearch index mappings
+* Custom logic (via Python module)
+
+This document explores using OpenSearch index mappings to generate synthetic data.
+
+### Prerequisites
+
+* **Required**: OpenSearch Index Mapping
+* **Optional**: Synthetic Data Generation Config
+
+### Overview
+This approach offers a balance between automation and customization.
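If you don't already have a mapping file on hand, one option is to export the mapping of an existing index and save it locally. The command below is only a sketch: the host, index name, and `jq` filter are illustrative, and you may need to adjust the resulting file so that it matches the shape of the examples that follow.

```shell
curl -s "http://localhost:9200/my-index/_mapping" | jq '.["my-index"]' > my-index-mapping.json
```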
Synthetic data generation in OpenSearch Benchmark can use basic OpenSearch index mappings like this: + +```json +{ + "mappings": { + "properties": { + "title": { + "type": "text", + "analyzer": "standard", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "description": { + "type": "text" + }, + "price": { + "type": "float" + }, + "created_at": { + "type": "date", + "format": "strict_date_optional_time||epoch_millis" + }, + "is_available": { + "type": "boolean" + }, + "category_id": { + "type": "integer" + }, + "tags": { + "type": "keyword" + } + } + }, + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1 + } +} +``` + +or complex OpenSearch index mappings like this: + +```json +{ + "mappings": { + "dynamic": "strict", + "properties": { + "user": { + "type": "object", + "properties": { + "id": { + "type": "keyword" + }, + "email": { + "type": "keyword" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + }, + "completion": { + "type": "completion" + } + }, + "analyzer": "standard" + }, + "address": { + "type": "object", + "properties": { + "street": { + "type": "text" + }, + "city": { + "type": "keyword" + }, + "state": { + "type": "keyword" + }, + "zip": { + "type": "keyword" + }, + "location": { + "type": "geo_point" + } + } + }, + "preferences": { + "type": "object", + "dynamic": true + } + } + }, + "orders": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "date": { + "type": "date", + "format": "strict_date_optional_time||epoch_millis" + }, + "amount": { + "type": "scaled_float", + "scaling_factor": 100 + }, + "status": { + "type": "keyword" + }, + "items": { + "type": "nested", + "properties": { + "product_id": { + "type": "keyword" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, + "quantity": { + "type": "short" + }, + "price": { + "type": "float" + }, + "categories": { + "type": "keyword" + } + } + }, + "shipping_address": { + "type": "object", + "properties": { + "street": { + "type": "text" + }, + "city": { + "type": "keyword" + }, + "state": { + "type": "keyword" + }, + "zip": { + "type": "keyword" + }, + "location": { + "type": "geo_point" + } + } + } + } + }, + "activity_log": { + "type": "nested", + "properties": { + "timestamp": { + "type": "date" + }, + "action": { + "type": "keyword" + }, + "ip_address": { + "type": "ip" + }, + "details": { + "type": "object", + "enabled": false + } + } + }, + "metadata": { + "type": "object", + "properties": { + "created_at": { + "type": "date" + }, + "updated_at": { + "type": "date" + }, + "tags": { + "type": "keyword" + }, + "source": { + "type": "keyword" + }, + "version": { + "type": "integer" + } + } + }, + "description": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + }, + "standard": { + "type": "text", + "analyzer": "standard" + } + } + }, + "ranking_scores": { + "type": "object", + "properties": { + "popularity": { + "type": "float" + }, + "relevance": { + "type": "float" + }, + "quality": { + "type": "float" + } + } + }, + "permissions": { + "type": "nested", + "properties": { + "user_id": { + "type": "keyword" + }, + "role": { + "type": "keyword" + }, + "granted_at": { + "type": "date" + } + } + } + } + }, + "settings": { + "number_of_shards": 3, + "number_of_replicas": 2, + "analysis": { + "analyzer": { + "email_analyzer": { + "type": "custom", + "tokenizer": "uax_url_email", 
+ "filter": ["lowercase", "stop"] + } + } + } + } + } +``` + +In the next section, we'll use the example index mappings (or your own) to generate synthetic documents. + +## Command Parameters +A basic command that activates synthetic data generation with an OpenSearch index mapping: +```shell +osb generate-data --index-name --index-mappings --output-path --total-size +``` +* `generate-data` (required): sub-command that activates synthetic data generation in OpenSearch Benchmark +* `--index-mappings` or `-i` (required): Path to OpenSearch index mappings +* `--index-name` or `-n` (required): Name of data corpora generated +* `--output-path` or `-p` (required): Path where data should be generated in +* `--total-size` or `-s` (required): Total amount of data that should be generated in GB +* `--custom-config` or `-c` (optional): Path to YAML config defining rules for how data should be generated. This is further explored in the subsequent section +* `--test-document` or `-t` (optional): When flag is present, OSB generates a single synthetic document and outputs to the console. Provides users a way to verify that the example document generated is aligned with expectations. When the flag is not present, the entire data corpora will be generated + +### Example Output + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + + +[NOTE] ✨ Dashboard link to monitor processes and task streams: [http://127.0.0.1:8787/status] +[NOTE] ✨ For users who are running generation on a virtual machine, consider SSH port forwarding (tunneling) to localhost to view dashboard. +[NOTE] Example of localhost command for SSH port forwarding (tunneling) from an AWS EC2 instance: +ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ + +Total GB to generate: [1] +Average document size in bytes: [412] +Max file size in GB: [40] + +100%|███████████████████████████████████████████████████████████████████| 100.07G/100.07G [3:35:29<00:00, 3.98MB/s] + +Generated 24271844660 docs in 12000 seconds. Total dataset size is 100.21GB. +✅ Visit the following path to view synthetically generated data: /home/ec2-user/ + +----------------------------------- +[INFO] ✅ SUCCESS (took 272 seconds) +----------------------------------- +``` +This is an example output of what it might look like if you generated 100GB. + +## Using synthetic data generation config + +Users can have more control over how data is generated with the help of the synthetic data generation config. When generating synthetic data with an OpenSearch index mappings, the synthetic data generation config should have *MappingGenerationValues* defined and either *generator_overrides*, *field_overrides*, or both defined. + +* **MappingGenerationValues** → When synthetic data generator is using OpenSearch index mappings to generate synthetic data, it looks for this section for additional instructions +* **generator_overrides** → For each type of generator defined in this section, the synthetic data generator uses these rules to generate synthetic data for that OpenSearch mapping field type. 
+* **field_overrides** → For each field defined, the synthetic data generator uses these rules to generate synthetic data for that specific field + +If both generator_overrides and field_overrides are defined, field_overrides have a higher precedence than generator_overrides. +{: .important} + +### Example sdg-config.yml + +Example of MappingGenerationValues defined: +```yml +MappingGenerationValues: + # For users who want more granular control over how data is generated when providing an OpenSearch mapping + generator_overrides: + # Overrides all instances of generators with these settings. Specify type and params + integer: + min: 0 + max: 20 + long: + min: 0 + max: 1000 + float: + min: 0.0 + max: 1.0 + double: + min: 0.0 + max: 2000.0 + date: + start_date: "2020-01-01" + end_date: "2023-01-01" + format: "yyyy-mm-dd" + text: + must_include: ["lorem", "ipsum"] + keyword: + choices: ["alpha", "beta", "gamma"] + + field_overrides: + # Specify field name as key of dict. For its values, specify generator and its params. Params must adhere to existing params for each generator + # For nested fields, use dot notation: Example preferences.allergies if allergies is a subfield of preferences object + title: + generator: generate_keyword + params: + choices: ["Helly R", "Mark S", "Irving B"] + + promo_codes: + generator: generate_keyword + params: + choices: ["HOT_SUMMER", "TREATSYUM!"] + + # Nested fields, use dot notation + orders.items.product_id: + generator: generate_keyword + params: + choices: ["Python", "English"] +``` + +To use this synthetic data generation config, append the following parameter and path to the YAML config to the `generate-data` command: + +```shell +--custom-config ~/Desktop/sdg-config.yml +``` + +OpenSearch Benchmark should now be generating synthetic data with these rules in mind. diff --git a/_benchmark/features/synthetic-data-generation/tips.md b/_benchmark/features/synthetic-data-generation/tips.md new file mode 100644 index 00000000000..82b3b4eff30 --- /dev/null +++ b/_benchmark/features/synthetic-data-generation/tips.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Tips & Tricks +nav_order: 45 +parent: Synthetic Data Generation +grand_parent: Features +--- + +# Tips & tricks + +### Visualizing Generation +The URL outputted takes users to a Dask Dashboard that visualizes the generation process. Users can keep track of CPU and memory of each worker as well as obtain a CPU flamegraph of the generation process. This is helpful for optimizing generation when using a custom python module. + +### Use Default Settings +We recommend using the default settings that come with Synthetic Data Generation. Workers should be no more than the CPU count on the load generation host and chunk sizes should be 10,000 docs per chunk. However, users are encouraged to change the max_file_size_gb field as needed. This just changes how much data should be stored in each file generated. diff --git a/_benchmark/reference/commands/generate-data.md b/_benchmark/reference/commands/generate-data.md new file mode 100644 index 00000000000..0f12b69c4e4 --- /dev/null +++ b/_benchmark/reference/commands/generate-data.md @@ -0,0 +1,66 @@ +--- +layout: default +title: generate-data +nav_order: 75 +parent: Command reference +grand_parent: OpenSearch Benchmark Reference +redirect_from: + - /benchmark/commands/generate-data/ +--- + +# generate-data + +### Usage + +Users can generate synthetic data with the subcommand generate-data. 
There are currently two ways to generate synthetic data in OpenSearch — with an OpenSearch index mapping or a custom Python module. The following pages explore generating data with OpenSearch index mappings, generating data with a custom Python module, and general tips and tricks users can use. + +For more information and examples, see [Synthetic Data Generation Guide]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/) + +### Options + +```shell +osb generate-data --custom-module ~/Desktop/http-logs.py --index-name http-logs-regenerated --output-path ~/Desktop/sdg_outputs/ --total-size 2 +``` + +* `generate-data` (required): sub-command that activates synthetic data generation in OpenSearch Benchmark +* `--index-mappings` or `-i` (required): Path to OpenSearch index mappings. If present, `--custom-module` cannot be used. +* `--custom-module` or `-m` (required): Path to Python logic that includes custom logic. If present, `--index-mappings` cannot be used. + +Custom Python module must include generate_synthetic_data(providers, **custom_lists) +{: .important} + +* `--index-name` or `-n` (required): Name of data corpora generated +* `--output-path` or `-p` (required): Path where data should be generated in +* `--total-size` or `-s` (required): Total amount of data that should be generated in GB +* `--custom-config` or `-c` (optional): Path to YAML config defining rules for how data should be generated. This is further explored in the subsequent section +* `--test-document` or `-t` (optional): When flag is present, OSB generates a single synthetic document and outputs to the console. Provides users a way to verify that the example document generated is aligned with expectations. When the flag is not present, the entire data corpora will be generated + +### Example Output + +``` + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + + +[NOTE] ✨ Dashboard link to monitor processes and task streams: [http://127.0.0.1:8787/status] +[NOTE] ✨ For users who are running generation on a virtual machine, consider SSH port forwarding (tunneling) to localhost to view dashboard. +[NOTE] Example of localhost command for SSH port forwarding (tunneling) from an AWS EC2 instance: +ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ + +Total GB to generate: [1] +Average document size in bytes: [412] +Max file size in GB: [40] + +100%|███████████████████████████████████████████████████████████████████| 100.07G/100.07G [3:35:29<00:00, 3.98MB/s] + +Generated 24271844660 docs in 12000 seconds. Total dataset size is 100.21GB. 
+✅ Visit the following path to view synthetically generated data: /home/ec2-user/ + +----------------------------------- +[INFO] ✅ SUCCESS (took 272 seconds) +----------------------------------- +``` \ No newline at end of file From 9a7617df1aec5070af0add920f994bc4cec7082e Mon Sep 17 00:00:00 2001 From: Ian Hoang Date: Mon, 3 Nov 2025 15:25:08 -0600 Subject: [PATCH 2/9] Address format Signed-off-by: Ian Hoang --- _benchmark/features/synthetic-data-generation/tips.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_benchmark/features/synthetic-data-generation/tips.md b/_benchmark/features/synthetic-data-generation/tips.md index 82b3b4eff30..e59b2d148c0 100644 --- a/_benchmark/features/synthetic-data-generation/tips.md +++ b/_benchmark/features/synthetic-data-generation/tips.md @@ -9,7 +9,7 @@ grand_parent: Features # Tips & tricks ### Visualizing Generation -The URL outputted takes users to a Dask Dashboard that visualizes the generation process. Users can keep track of CPU and memory of each worker as well as obtain a CPU flamegraph of the generation process. This is helpful for optimizing generation when using a custom python module. +The URL outputted takes users to a Dask Dashboard that visualizes the generation process. Users can keep track of CPU and memory of each worker as well as obtain a CPU flamegraph of the generation process. This is helpful for monitoring the load generation's resources when generating data or optimizing generation when using a custom python module. ### Use Default Settings -We recommend using the default settings that come with Synthetic Data Generation. Workers should be no more than the CPU count on the load generation host and chunk sizes should be 10,000 docs per chunk. However, users are encouraged to change the max_file_size_gb field as needed. This just changes how much data should be stored in each file generated. +We recommend using the default settings that come with Synthetic Data Generation. Workers should be no more than the CPU count on the load generation host and chunk sizes should be 10,000 docs per chunk. However, users are encouraged to change the `max_file_size_gb` field as needed. This just changes how much data should be stored in each file generated. From d689b3a017f4ca394175b528fb419f1c338b5904 Mon Sep 17 00:00:00 2001 From: Ian Hoang Date: Wed, 19 Nov 2025 13:44:46 -0600 Subject: [PATCH 3/9] Add generating vectors advanced section Signed-off-by: Ian Hoang --- .../advanced/generating-vectors.md | 504 ++++++++++++++++++ .../advanced/index.md | 21 + .../synthetic-data-generation/index.md | 3 + 3 files changed, 528 insertions(+) create mode 100644 _benchmark/features/synthetic-data-generation/advanced/generating-vectors.md create mode 100644 _benchmark/features/synthetic-data-generation/advanced/index.md diff --git a/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md b/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md new file mode 100644 index 00000000000..f2f1b547083 --- /dev/null +++ b/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md @@ -0,0 +1,504 @@ +--- +layout: default +title: Generating vectors +nav_order: 15 +parent: Advanced +grand_parent: Synthetic Data Generation +--- + +# Generating vectors + +This document covers how to generate synthetic dense and sparse vectors with OpenSearch Benchmark's synthetic data generator using mappings. 
+ +## Concepts + +### KNN Vectors (Dense Vectors) + +Dense vectors (known as knn_vector mapping field type in OpenSearch) are numerical representations of data like text or images where most or all dimensions have non-zero values. + +**Example**: Embedding for the word "dog" +```json +{ + "embedding": [0.234, -0.567, 0.123, 0.891, -0.234, 0.456, ..., 0.789] +} +``` + +### Sparse Vectors + +Think of sparse vectors as a dictionary of important words with their importance scores. + +**Example text**: "Korean jindos are hunting dogs that have a reputation for being loyal, independent, and confident." + +**Sparse vector representation of example text**: +```json +{ + "5432": 0.85, // "korean" - very important (specific descriptor) + "7821": 0.78, // "jindos" - very important (breed name) + "2": 0.45, // "dog" - moderately important (general category) + "9999": 0.32, // "loyal" - somewhat important (characteristic) + "1111": 0.12 // "things" - less important (common word) +} +``` +--- + +## Basic Usage + +### Generating dense vectors + +Generate random 128-dimensional vectors with minimal configuration. + +**1. Create mapping file** (`simple-knn-mapping.json`): +```json +{ + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "title": {"type": "text"}, + "my_embedding": { + "type": "knn_vector", + "dimension": 128 + } + } + } +} +``` + +**2. Generate data**: +```bash +opensearch-benchmark generate-data \ + --index-name my-vectors \ + --index-mappings simple-knn-mapping.json \ + --output-path ./output \ + --total-size 1 +``` + +**Generated document**: +Of the documents generated, the `my_embedding` field for one document might look like this: +```json +{ + "title": "Sample text 42", + "my_embedding": [0.234, -0.567, 0.123, ..., 0.891] // 128 random floats [-1.0, 1.0] +} +``` + +### Generating sparse vectors + +Generate sparse vectors with default configuration (10 tokens). + +**1. Create mapping file** (`simple-sparse-mapping.json`): +```json +{ + "mappings": { + "properties": { + "content": {"type": "text"}, + "sparse_embedding": { + "type": "sparse_vector" + } + } + } +} +``` + +**2. Generate data** (same command pattern): +```bash +opensearch-benchmark generate-data \ + --index-name my-sparse \ + --index-mappings simple-sparse-mapping.json \ + --output-path ./output \ + --total-size 1 +``` + +**Generated output**: + +Of the documents generated, a document with the `sparse_embedding` field might look like this: +```json +{ + "content": "Sample text content", + "sparse_embedding": { + "1000": 0.3421, + "1100": 0.5234, + "1200": 0.7821, + "1300": 0.1523, + "1400": 0.9102, + "1500": 0.4567, + "1600": 0.2341, + "1700": 0.6789, + "1800": 0.8123, + "1900": 0.3456 + } +} +``` + +With just an OpenSearch index mapping, OSB can generate synthetic dense and sparse vectors. However, the output is the most basic form of synthetic vectors. To achieve more realistic distributions and clusterings, we recommend using parameters outlined in the following section. + +--- + +## Dense Vectors (KNN Vector) Parameters + +The following are parameters that users can add to their SDG Config (YAML Config) to finetune generation of dense vectors. + +#### `dimension` (required) + +**What it does**: Specifies the number of dimensions in the vector. + +**Where to specify**: In the mapping (required) or in config params (optional override). 
+ +**Impact**: +- **Memory**: Higher dimensions = more storage + - 128D ≈ 0.5 KB per vector + - 768D ≈ 3 KB per vector + - 1536D ≈ 6 KB per vector +- **Performance**: More dimensions = slower indexing and search +- **Quality**: Must match your actual embedding model's output + +**Common values**: + +| Dimension | Use Case | Example Models | +|-----------|----------|----------------| +| 128 | Lightweight, custom models | Custom embeddings, fast search | +| 384 | General purpose | sentence-transformers/all-MiniLM-L6-v2 | +| 768 | Standard NLP | BERT-Base, DistilBERT, MPNet | +| 1024 | High quality NLP | BERT-Large | +| 1536 | OpenAI standard | text-embedding-ada-002, text-embedding-3-small | +| 3072 | OpenAI premium | text-embedding-3-large | + +**Example**: +```yaml +field_overrides: + my_embedding: + generator: generate_knn_vector + params: + dimension: 768 # Override mapping dimension if needed +``` + +**Best practice**: Always match your production embedding model's dimension. + +--- + +#### `sample_vectors` (optional, highly recommended) + +**What it does**: Provides base vectors that the generator will add noise to, creating realistic variations and clusters. + +**Why it matters**: +- **Without**: Generates random uniform vectors across entire space (unrealistic, poor search quality) +- **With**: Creates natural clusters around sample vectors (realistic, good search quality) + +**Format**: List of lists, where each inner list is a complete vector. + +```yaml +field_overrides: + product_embedding: + generator: generate_knn_vector + params: + dimension: 768 + sample_vectors: + - [0.12, -0.34, 0.56, ..., 0.23] # Vector 1 (768 values) + - [-0.23, 0.45, -0.12, ..., -0.15] # Vector 2 (768 values) + - [0.34, 0.21, -0.45, ..., 0.42] # Vector 3 (768 values) +``` + +**How many sample vectors?** +- **Minimum**: 3-5 for basic clustering +- **Recommended**: 5-10 for realistic distribution +- **Maximum**: 20+ for complex multi-cluster scenarios + +**How to obtain sample vectors**: + +**Option 1: Using actual embeddings from your domain (Recommended)**: Use actual embeddings from your domain, representing different semantic clusters. Random generation without sample vectors produces unrealistic data unsuitable for search quality testing. + +**Option 2: Using sentence-transformers** in Python: +```python +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer('all-MiniLM-L6-v2') + +# Create representative texts from different categories +texts = [ + "Electronics and gadgets", + "Clothing and fashion", + "Home and kitchen appliances", + "Books and literature", + "Sports and outdoor equipment" +] + +embeddings = model.encode(texts) +print(embeddings.tolist()) # Copy to your SDG config (YAML config) +``` + +--- + +#### `noise_factor` (default: 0.1) + +**What it does**: Controls the amount of noise added to base vectors. 
+- For **gaussian**: Standard deviation of normal distribution +- For **uniform**: Range of uniform distribution (±noise_factor) + +**Impact on data**: + +| noise_factor | Effect | Use Case | +|--------------|--------|----------| +| 0.01 - 0.05 | Tight clustering, minimal variation | Duplicate detection, near-exact matches | +| 0.1 - 0.2 | Natural variation within topic | General semantic search, recommendations | +| 0.3 - 0.5 | Wide dispersion, diverse concepts | Broad topic matching, discovery | +| > 0.5 | Very scattered, overlapping clusters | Testing edge cases, stress testing | + +**Configuration**: +```yaml +field_overrides: + tight_clustering: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.05 # Tight clusters + + diverse_results: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.2 # More variation +``` + +**Best practice**: Start with 0.1, then adjust based on search recall/precision requirements. + +--- + +#### `distribution_type` (default: "gaussian") + +**What it does**: Specifies the type of noise distribution. + +**Options**: +- **`gaussian`**: Normal distribution N(0, noise_factor) + - Most realistic (natural variation with occasional outliers) + - Produces smooth clusters + - Some values can extend beyond expected range + +- **`uniform`**: Uniform distribution [-noise_factor, +noise_factor] + - Bounded variation (no extreme outliers) + - More predictable results + - Flat probability across range + +**Configuration**: +```yaml +field_overrides: + realistic_embedding: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.1 + distribution_type: gaussian # More realistic + + controlled_embedding: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.1 + distribution_type: uniform # More predictable +``` + +**Best practice**: Use `gaussian` for production-like benchmarks. + +--- + +#### `normalize` (default: false) + +**What it does**: L2 normalizes vectors after noise addition, making their magnitude (length) exactly 1.0. + +**When to use normalize=true**: + +| Index Configuration | normalize Setting | Why | +|---------------------|-------------------|-----| +| `space_type: cosinesimil` | **true** | Cosine similarity only cares about direction; pre-normalizing makes search faster (dot product = cosine sim) | +| `space_type: l2` | **false** | L2 distance uses magnitude; normalizing loses information | +| `space_type: innerproduct` | **false** | Inner product uses magnitude as part of similarity | + +**Real-world models**: +- **OpenAI embeddings**: Already normalized (set true) +- **sentence-transformers**: Often normalized (check model docs, usually true) +- **BERT raw output**: Not normalized (set false, then normalize in index config) + +**Configuration**: +```yaml +field_overrides: + # For cosine similarity search + cosine_embedding: + generator: generate_knn_vector + params: + dimension: 384 + sample_vectors: [...] + normalize: true # Required for accurate cosine similarity + + # For L2 distance search + l2_embedding: + generator: generate_knn_vector + params: + dimension: 768 + sample_vectors: [...] + normalize: false # Keep original magnitudes +``` + +**Best practice**: Match your OpenSearch index's `space_type` setting. + +--- + +## Sparse Vectors Parameters + +The following are parameters that users can add to their SDG config to finetune how sparse vectors are generated. 
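All of these parameters are supplied as `params` under a `field_overrides` entry that uses the `generate_sparse_vector` generator. The combined sketch below shows where each one fits (the `sparse_embedding` field name comes from the earlier mapping example); each parameter is described in detail in the sections that follow.

```yaml
field_overrides:
  sparse_embedding:
    generator: generate_sparse_vector
    params:
      num_tokens: 15
      min_weight: 0.01
      max_weight: 1.0
      token_id_start: 1000
      token_id_step: 100
```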
+ +#### `num_tokens` (default: 10) + +**What it does**: Number of token-weight pairs to generate per vector. + +**Impact**: +- **Low (5-10)**: Very sparse, fast search, may miss some relevant docs +- **Medium (10-25)**: Balanced performance and recall +- **High (50-100)**: Dense sparse representation, comprehensive but slower + +**Typical values by model**: + +| Model/Approach | Typical num_tokens | Use Case | +|----------------|-------------------|----------| +| SPLADE v1 | 10-15 | Standard sparse neural search | +| SPLADE v2 | 15-25 | Improved recall | +| DeepImpact | 8-12 | Efficient sparse search | +| Custom/Hybrid | 20-50 | Rich representations | + +**Configuration**: +```yaml +field_overrides: + sparse_standard: + generator: generate_sparse_vector + params: + num_tokens: 15 # Standard SPLADE-like + + sparse_rich: + generator: generate_sparse_vector + params: + num_tokens: 30 # Richer representation +``` + +**Best practice**: Start with 10-15; increase if recall is insufficient. + +--- + +#### `min_weight` and `max_weight` (defaults: 0.01, 1.0) + +**What they do**: Define the range of token importance weights. + +**Impact**: +- **min_weight**: Filters weak signals (tokens below this are never generated) +- **max_weight**: Caps maximum importance (prevents single token dominance) + +**Common configurations**: + +| Configuration | min | max | Use Case | +|---------------|-----|-----|----------| +| Standard SPLADE | 0.01 | 1.0 | Default, balanced importance | +| Narrow range | 0.1 | 0.9 | More uniform importance | +| Wide range | 0.01 | 2.0 | Strong importance signals | +| High threshold | 0.05 | 1.0 | Filter low-confidence tokens | + +**Configuration**: +```yaml +field_overrides: + sparse_balanced: + generator: generate_sparse_vector + params: + num_tokens: 15 + min_weight: 0.01 + max_weight: 1.0 + + sparse_uniform: + generator: generate_sparse_vector + params: + num_tokens: 20 + min_weight: 0.2 # Higher minimum + max_weight: 0.8 # Lower maximum +``` + +**Constraints**: +- min_weight must be > 0.0 (OpenSearch requires positive weights) +- max_weight must be > min_weight +- Weights are rounded to 4 decimal places + +**Best practice**: Keep min_weight small (0.01-0.05) to allow nuanced weighting. 
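To make the effect of these bounds concrete, a single vector generated with `num_tokens: 5`, `min_weight: 0.2`, and `max_weight: 0.8` (and the default token ID settings covered next) might look like the following; the exact weights vary from run to run.

```json
{
  "sparse_embedding": {
    "1000": 0.7432,
    "1100": 0.2150,
    "1200": 0.5891,
    "1300": 0.3327,
    "1400": 0.6604
  }
}
```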
+ +--- + +#### `token_id_start` and `token_id_step` (defaults: 1000, 100) + +**What they do**: +- **token_id_start**: First token ID in the sequence +- **token_id_step**: Increment between consecutive token IDs + +**Generated sequence**: `start, start+step, start+2*step, ...` + +**Example** with start=1000, step=100, num_tokens=5: +```json +{ + "1000": 0.3421, // token_id_start + "1100": 0.5234, // start + 1*step + "1200": 0.7821, // start + 2*step + "1300": 0.1523, // start + 3*step + "1400": 0.9102 // start + 4*step +} +``` + +**Use cases**: + +| Configuration | start | step | Use Case | +|---------------|-------|------|----------| +| Default testing | 1000 | 100 | Easy visual separation | +| Realistic vocab | 0 | 1 | Match actual vocab indices | +| Multi-field | 1000, 5000, 10000 | 1 | Separate vocabularies per field | +| Large vocab | 0 | 1 | Simulate 50K+ vocabulary | + +**Configuration**: +```yaml +field_overrides: + # Default: easy debugging + sparse_debug: + generator: generate_sparse_vector + params: + num_tokens: 10 + token_id_start: 1000 + token_id_step: 100 + + # Realistic: actual vocab indices + sparse_realistic: + generator: generate_sparse_vector + params: + num_tokens: 15 + token_id_start: 0 + token_id_step: 1 + + # Multiple fields: separate ranges + sparse_field1: + generator: generate_sparse_vector + params: + token_id_start: 1000 + + sparse_field2: + generator: generate_sparse_vector + params: + token_id_start: 5000 +``` + +**Note**: Token IDs are currently sequential in the generated data. Real sparse vectors have non-sequential IDs based on actual vocabulary, but this doesn't affect OpenSearch indexing or search functionality. + +**Best practice**: Use large step (100) for debugging; use step=1 for production-like data. + +--- + +### When to Use Simple vs Complex Approaches + +| Scenario | Approach | Why | +|----------|----------|-----| +| Learning / Quick Test | Simple (no config) | Fastest setup, good enough for basic testing | +| Load Testing | Simple | Volume matters more than realism | +| Realistic Benchmarks | Complex (with config) | Need realistic clustering and distributions | +| Production Simulation | Complex | Must match actual embedding model behavior | +| Search Quality Testing | Complex | Need proper vector clusters for recall/precision testing | + +**Rule of thumb**: If you're testing search quality or comparing algorithms, use complex configuration with sample vectors. diff --git a/_benchmark/features/synthetic-data-generation/advanced/index.md b/_benchmark/features/synthetic-data-generation/advanced/index.md new file mode 100644 index 00000000000..b01f10a6cc1 --- /dev/null +++ b/_benchmark/features/synthetic-data-generation/advanced/index.md @@ -0,0 +1,21 @@ +--- +layout: default +title: Advanced +nav_order: 30 +parent: Synthetic Data Generation +has_children: true +has_toc: false +redirect_from: + - /benchmark/features/synthetic-data-generation/advanced/ + - /benchmark/features/synthetic-data-generation/advanced/index +more_cards: + - heading: "Generating vectors" + description: "Generating synthetic dense and sparse vectors" + link: "/benchmark/features/synthetic-data-generation/advanced/generating-vectors" +--- + +# Advanced + +This documentation is for advanced options related to synthetic data generation. Read more in the following sections. 
+ +{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_benchmark/features/synthetic-data-generation/index.md b/_benchmark/features/synthetic-data-generation/index.md index 08f8f9c8e54..6b6135fea7a 100644 --- a/_benchmark/features/synthetic-data-generation/index.md +++ b/_benchmark/features/synthetic-data-generation/index.md @@ -15,6 +15,9 @@ more_cards: - heading: "Custom Logic Approach" description: "Generate synthetic data with custom logic" link: "/benchmark/features/synthetic-data-generation/custom-logic-sdg/" + - heading: "Advanced" + description: "Advanced guide on generating data" + link: "/benchmark/features/synthetic-data-generation/advanced/" - heading: "Tips and Tricks" description: "Advice to help you generate data efficiently" link: "/benchmark/features/synthetic-data-generation/tips/" From 606db9d21abf2265ab5f236113a3a52700b7b21c Mon Sep 17 00:00:00 2001 From: Ian Hoang Date: Thu, 20 Nov 2025 16:13:56 -0600 Subject: [PATCH 4/9] Add feedback from Michael Signed-off-by: Ian Hoang --- .../advanced/generating-vectors.md | 23 +++++++++---------- .../custom-logic-sdg.md | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md b/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md index f2f1b547083..42dd8df8502 100644 --- a/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md +++ b/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md @@ -137,11 +137,11 @@ With just an OpenSearch index mapping, OSB can generate synthetic dense and spar ## Dense Vectors (KNN Vector) Parameters -The following are parameters that users can add to their SDG Config (YAML Config) to finetune generation of dense vectors. +The following are parameters that users can add to their SDG Config (YAML Config) to fine-tune generation of dense vectors. #### `dimension` (required) -**What it does**: Specifies the number of dimensions in the vector. +Specifies the number of dimensions in the vector. **Where to specify**: In the mapping (required) or in config params (optional override). @@ -179,13 +179,11 @@ field_overrides: #### `sample_vectors` (optional, highly recommended) -**What it does**: Provides base vectors that the generator will add noise to, creating realistic variations and clusters. +Provides base vectors that the generator will add noise to, creating realistic variations and clusters. Without sample vectors, OSB's synthetic data generator will generate random uniform vectors across entire space, which is unrealistic and offers poor search quality. -**Why it matters**: -- **Without**: Generates random uniform vectors across entire space (unrealistic, poor search quality) -- **With**: Creates natural clusters around sample vectors (realistic, good search quality) +Providing sample vectors allows OSB's synthetic data generator to create more realistic and natural clusters. -**Format**: List of lists, where each inner list is a complete vector. +After you have a list of sample vectors, insert them as a **list of lists**, where each inner list is a complete vector. See the following example of a how sample vectors are provided to the SDG config. 
```yaml field_overrides: @@ -199,7 +197,8 @@ field_overrides: - [0.34, 0.21, -0.45, ..., 0.42] # Vector 3 (768 values) ``` -**How many sample vectors?** +**How many sample vectors should be provided?** + - **Minimum**: 3-5 for basic clustering - **Recommended**: 5-10 for realistic distribution - **Maximum**: 20+ for complex multi-cluster scenarios @@ -231,7 +230,7 @@ print(embeddings.tolist()) # Copy to your SDG config (YAML config) #### `noise_factor` (default: 0.1) -**What it does**: Controls the amount of noise added to base vectors. +Controls the amount of noise added to base vectors. - For **gaussian**: Standard deviation of normal distribution - For **uniform**: Range of uniform distribution (±noise_factor) @@ -266,7 +265,7 @@ field_overrides: #### `distribution_type` (default: "gaussian") -**What it does**: Specifies the type of noise distribution. +Specifies the type of noise distribution. **Options**: - **`gaussian`**: Normal distribution N(0, noise_factor) @@ -303,7 +302,7 @@ field_overrides: #### `normalize` (default: false) -**What it does**: L2 normalizes vectors after noise addition, making their magnitude (length) exactly 1.0. +L2 normalizes vectors after noise addition, making their magnitude (length) exactly 1.0. **When to use normalize=true**: @@ -348,7 +347,7 @@ The following are parameters that users can add to their SDG config to finetune #### `num_tokens` (default: 10) -**What it does**: Number of token-weight pairs to generate per vector. +Number of token-weight pairs to generate per vector. **Impact**: - **Low (5-10)**: Very sparse, fast search, may miss some relevant docs diff --git a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md index 2a53dd48319..aac360ec87d 100644 --- a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md +++ b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md @@ -17,7 +17,7 @@ This document explores using custom logic to generate synthetic data. ### Prerequisites * **Required**: Custom logic defined in Python module -* **Optional**: Synthetic Data Generation Config +* **Optional**: [Synthetic Data Generation Config](https://github.com/opensearch-project/opensearch-benchmark/blob/main/osbenchmark/resources/sdg-config.yml) Python module with custom logic **must include** `generate_synthetic_data(providers, **custom_lists)` within. 
{: .important} From 8609c285fd849578048c9160d886ed691d60b717 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Fri, 21 Nov 2025 16:20:44 -0500 Subject: [PATCH 5/9] Doc review Signed-off-by: Fanit Kolchina --- _benchmark/features/index.md | 11 +- .../advanced/generating-vectors.md | 503 ---------------- .../advanced/index.md | 21 - .../custom-logic-sdg.md | 128 +++-- .../generating-vectors.md | 543 ++++++++++++++++++ .../synthetic-data-generation/index.md | 52 +- .../synthetic-data-generation/mapping-sdg.md | 505 ++++++++-------- .../synthetic-data-generation/tips.md | 24 +- _benchmark/reference/commands/aggregate.md | 2 +- .../reference/commands/command-flags.md | 2 +- _benchmark/reference/commands/compare.md | 2 +- _benchmark/reference/commands/download.md | 2 +- _benchmark/reference/commands/execute-test.md | 2 +- .../reference/commands/generate-data.md | 63 +- _benchmark/reference/commands/index.md | 17 +- _benchmark/reference/commands/info.md | 2 +- 16 files changed, 1002 insertions(+), 877 deletions(-) delete mode 100644 _benchmark/features/synthetic-data-generation/advanced/generating-vectors.md delete mode 100644 _benchmark/features/synthetic-data-generation/advanced/index.md create mode 100644 _benchmark/features/synthetic-data-generation/generating-vectors.md diff --git a/_benchmark/features/index.md b/_benchmark/features/index.md index be224bebfdb..35a5f65d987 100644 --- a/_benchmark/features/index.md +++ b/_benchmark/features/index.md @@ -1,20 +1,19 @@ --- layout: default -title: Features +title: Additional features nav_order: 30 has_children: true has_toc: false redirect_from: - /benchmark/features/ - - /benchmark/features/index/ more_cards: - - heading: "Synthetic Data Generation" - description: "Generate synthetic data for benchmarking" + - heading: "Synthetic data generation" + description: "Create synthetic datasets using index mappings or custom Python logic for comprehensive benchmarking and testing." link: "/benchmark/features/synthetic-data-generation/" --- -# Features +# Additional features -On top of general benchmarking, OpenSearch Benchmark comes with other features. See the following sections for more information on each. +In addition to general benchmarking, OpenSearch Benchmark provides several specialized features. {% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md b/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md deleted file mode 100644 index 42dd8df8502..00000000000 --- a/_benchmark/features/synthetic-data-generation/advanced/generating-vectors.md +++ /dev/null @@ -1,503 +0,0 @@ ---- -layout: default -title: Generating vectors -nav_order: 15 -parent: Advanced -grand_parent: Synthetic Data Generation ---- - -# Generating vectors - -This document covers how to generate synthetic dense and sparse vectors with OpenSearch Benchmark's synthetic data generator using mappings. - -## Concepts - -### KNN Vectors (Dense Vectors) - -Dense vectors (known as knn_vector mapping field type in OpenSearch) are numerical representations of data like text or images where most or all dimensions have non-zero values. - -**Example**: Embedding for the word "dog" -```json -{ - "embedding": [0.234, -0.567, 0.123, 0.891, -0.234, 0.456, ..., 0.789] -} -``` - -### Sparse Vectors - -Think of sparse vectors as a dictionary of important words with their importance scores. 
- -**Example text**: "Korean jindos are hunting dogs that have a reputation for being loyal, independent, and confident." - -**Sparse vector representation of example text**: -```json -{ - "5432": 0.85, // "korean" - very important (specific descriptor) - "7821": 0.78, // "jindos" - very important (breed name) - "2": 0.45, // "dog" - moderately important (general category) - "9999": 0.32, // "loyal" - somewhat important (characteristic) - "1111": 0.12 // "things" - less important (common word) -} -``` ---- - -## Basic Usage - -### Generating dense vectors - -Generate random 128-dimensional vectors with minimal configuration. - -**1. Create mapping file** (`simple-knn-mapping.json`): -```json -{ - "settings": { - "index.knn": true - }, - "mappings": { - "properties": { - "title": {"type": "text"}, - "my_embedding": { - "type": "knn_vector", - "dimension": 128 - } - } - } -} -``` - -**2. Generate data**: -```bash -opensearch-benchmark generate-data \ - --index-name my-vectors \ - --index-mappings simple-knn-mapping.json \ - --output-path ./output \ - --total-size 1 -``` - -**Generated document**: -Of the documents generated, the `my_embedding` field for one document might look like this: -```json -{ - "title": "Sample text 42", - "my_embedding": [0.234, -0.567, 0.123, ..., 0.891] // 128 random floats [-1.0, 1.0] -} -``` - -### Generating sparse vectors - -Generate sparse vectors with default configuration (10 tokens). - -**1. Create mapping file** (`simple-sparse-mapping.json`): -```json -{ - "mappings": { - "properties": { - "content": {"type": "text"}, - "sparse_embedding": { - "type": "sparse_vector" - } - } - } -} -``` - -**2. Generate data** (same command pattern): -```bash -opensearch-benchmark generate-data \ - --index-name my-sparse \ - --index-mappings simple-sparse-mapping.json \ - --output-path ./output \ - --total-size 1 -``` - -**Generated output**: - -Of the documents generated, a document with the `sparse_embedding` field might look like this: -```json -{ - "content": "Sample text content", - "sparse_embedding": { - "1000": 0.3421, - "1100": 0.5234, - "1200": 0.7821, - "1300": 0.1523, - "1400": 0.9102, - "1500": 0.4567, - "1600": 0.2341, - "1700": 0.6789, - "1800": 0.8123, - "1900": 0.3456 - } -} -``` - -With just an OpenSearch index mapping, OSB can generate synthetic dense and sparse vectors. However, the output is the most basic form of synthetic vectors. To achieve more realistic distributions and clusterings, we recommend using parameters outlined in the following section. - ---- - -## Dense Vectors (KNN Vector) Parameters - -The following are parameters that users can add to their SDG Config (YAML Config) to fine-tune generation of dense vectors. - -#### `dimension` (required) - -Specifies the number of dimensions in the vector. - -**Where to specify**: In the mapping (required) or in config params (optional override). 
- -**Impact**: -- **Memory**: Higher dimensions = more storage - - 128D ≈ 0.5 KB per vector - - 768D ≈ 3 KB per vector - - 1536D ≈ 6 KB per vector -- **Performance**: More dimensions = slower indexing and search -- **Quality**: Must match your actual embedding model's output - -**Common values**: - -| Dimension | Use Case | Example Models | -|-----------|----------|----------------| -| 128 | Lightweight, custom models | Custom embeddings, fast search | -| 384 | General purpose | sentence-transformers/all-MiniLM-L6-v2 | -| 768 | Standard NLP | BERT-Base, DistilBERT, MPNet | -| 1024 | High quality NLP | BERT-Large | -| 1536 | OpenAI standard | text-embedding-ada-002, text-embedding-3-small | -| 3072 | OpenAI premium | text-embedding-3-large | - -**Example**: -```yaml -field_overrides: - my_embedding: - generator: generate_knn_vector - params: - dimension: 768 # Override mapping dimension if needed -``` - -**Best practice**: Always match your production embedding model's dimension. - ---- - -#### `sample_vectors` (optional, highly recommended) - -Provides base vectors that the generator will add noise to, creating realistic variations and clusters. Without sample vectors, OSB's synthetic data generator will generate random uniform vectors across entire space, which is unrealistic and offers poor search quality. - -Providing sample vectors allows OSB's synthetic data generator to create more realistic and natural clusters. - -After you have a list of sample vectors, insert them as a **list of lists**, where each inner list is a complete vector. See the following example of a how sample vectors are provided to the SDG config. - -```yaml -field_overrides: - product_embedding: - generator: generate_knn_vector - params: - dimension: 768 - sample_vectors: - - [0.12, -0.34, 0.56, ..., 0.23] # Vector 1 (768 values) - - [-0.23, 0.45, -0.12, ..., -0.15] # Vector 2 (768 values) - - [0.34, 0.21, -0.45, ..., 0.42] # Vector 3 (768 values) -``` - -**How many sample vectors should be provided?** - -- **Minimum**: 3-5 for basic clustering -- **Recommended**: 5-10 for realistic distribution -- **Maximum**: 20+ for complex multi-cluster scenarios - -**How to obtain sample vectors**: - -**Option 1: Using actual embeddings from your domain (Recommended)**: Use actual embeddings from your domain, representing different semantic clusters. Random generation without sample vectors produces unrealistic data unsuitable for search quality testing. - -**Option 2: Using sentence-transformers** in Python: -```python -from sentence_transformers import SentenceTransformer - -model = SentenceTransformer('all-MiniLM-L6-v2') - -# Create representative texts from different categories -texts = [ - "Electronics and gadgets", - "Clothing and fashion", - "Home and kitchen appliances", - "Books and literature", - "Sports and outdoor equipment" -] - -embeddings = model.encode(texts) -print(embeddings.tolist()) # Copy to your SDG config (YAML config) -``` - ---- - -#### `noise_factor` (default: 0.1) - -Controls the amount of noise added to base vectors. 
-- For **gaussian**: Standard deviation of normal distribution -- For **uniform**: Range of uniform distribution (±noise_factor) - -**Impact on data**: - -| noise_factor | Effect | Use Case | -|--------------|--------|----------| -| 0.01 - 0.05 | Tight clustering, minimal variation | Duplicate detection, near-exact matches | -| 0.1 - 0.2 | Natural variation within topic | General semantic search, recommendations | -| 0.3 - 0.5 | Wide dispersion, diverse concepts | Broad topic matching, discovery | -| > 0.5 | Very scattered, overlapping clusters | Testing edge cases, stress testing | - -**Configuration**: -```yaml -field_overrides: - tight_clustering: - generator: generate_knn_vector - params: - sample_vectors: [...] - noise_factor: 0.05 # Tight clusters - - diverse_results: - generator: generate_knn_vector - params: - sample_vectors: [...] - noise_factor: 0.2 # More variation -``` - -**Best practice**: Start with 0.1, then adjust based on search recall/precision requirements. - ---- - -#### `distribution_type` (default: "gaussian") - -Specifies the type of noise distribution. - -**Options**: -- **`gaussian`**: Normal distribution N(0, noise_factor) - - Most realistic (natural variation with occasional outliers) - - Produces smooth clusters - - Some values can extend beyond expected range - -- **`uniform`**: Uniform distribution [-noise_factor, +noise_factor] - - Bounded variation (no extreme outliers) - - More predictable results - - Flat probability across range - -**Configuration**: -```yaml -field_overrides: - realistic_embedding: - generator: generate_knn_vector - params: - sample_vectors: [...] - noise_factor: 0.1 - distribution_type: gaussian # More realistic - - controlled_embedding: - generator: generate_knn_vector - params: - sample_vectors: [...] - noise_factor: 0.1 - distribution_type: uniform # More predictable -``` - -**Best practice**: Use `gaussian` for production-like benchmarks. - ---- - -#### `normalize` (default: false) - -L2 normalizes vectors after noise addition, making their magnitude (length) exactly 1.0. - -**When to use normalize=true**: - -| Index Configuration | normalize Setting | Why | -|---------------------|-------------------|-----| -| `space_type: cosinesimil` | **true** | Cosine similarity only cares about direction; pre-normalizing makes search faster (dot product = cosine sim) | -| `space_type: l2` | **false** | L2 distance uses magnitude; normalizing loses information | -| `space_type: innerproduct` | **false** | Inner product uses magnitude as part of similarity | - -**Real-world models**: -- **OpenAI embeddings**: Already normalized (set true) -- **sentence-transformers**: Often normalized (check model docs, usually true) -- **BERT raw output**: Not normalized (set false, then normalize in index config) - -**Configuration**: -```yaml -field_overrides: - # For cosine similarity search - cosine_embedding: - generator: generate_knn_vector - params: - dimension: 384 - sample_vectors: [...] - normalize: true # Required for accurate cosine similarity - - # For L2 distance search - l2_embedding: - generator: generate_knn_vector - params: - dimension: 768 - sample_vectors: [...] - normalize: false # Keep original magnitudes -``` - -**Best practice**: Match your OpenSearch index's `space_type` setting. - ---- - -## Sparse Vectors Parameters - -The following are parameters that users can add to their SDG config to finetune how sparse vectors are generated. - -#### `num_tokens` (default: 10) - -Number of token-weight pairs to generate per vector. 
- -**Impact**: -- **Low (5-10)**: Very sparse, fast search, may miss some relevant docs -- **Medium (10-25)**: Balanced performance and recall -- **High (50-100)**: Dense sparse representation, comprehensive but slower - -**Typical values by model**: - -| Model/Approach | Typical num_tokens | Use Case | -|----------------|-------------------|----------| -| SPLADE v1 | 10-15 | Standard sparse neural search | -| SPLADE v2 | 15-25 | Improved recall | -| DeepImpact | 8-12 | Efficient sparse search | -| Custom/Hybrid | 20-50 | Rich representations | - -**Configuration**: -```yaml -field_overrides: - sparse_standard: - generator: generate_sparse_vector - params: - num_tokens: 15 # Standard SPLADE-like - - sparse_rich: - generator: generate_sparse_vector - params: - num_tokens: 30 # Richer representation -``` - -**Best practice**: Start with 10-15; increase if recall is insufficient. - ---- - -#### `min_weight` and `max_weight` (defaults: 0.01, 1.0) - -**What they do**: Define the range of token importance weights. - -**Impact**: -- **min_weight**: Filters weak signals (tokens below this are never generated) -- **max_weight**: Caps maximum importance (prevents single token dominance) - -**Common configurations**: - -| Configuration | min | max | Use Case | -|---------------|-----|-----|----------| -| Standard SPLADE | 0.01 | 1.0 | Default, balanced importance | -| Narrow range | 0.1 | 0.9 | More uniform importance | -| Wide range | 0.01 | 2.0 | Strong importance signals | -| High threshold | 0.05 | 1.0 | Filter low-confidence tokens | - -**Configuration**: -```yaml -field_overrides: - sparse_balanced: - generator: generate_sparse_vector - params: - num_tokens: 15 - min_weight: 0.01 - max_weight: 1.0 - - sparse_uniform: - generator: generate_sparse_vector - params: - num_tokens: 20 - min_weight: 0.2 # Higher minimum - max_weight: 0.8 # Lower maximum -``` - -**Constraints**: -- min_weight must be > 0.0 (OpenSearch requires positive weights) -- max_weight must be > min_weight -- Weights are rounded to 4 decimal places - -**Best practice**: Keep min_weight small (0.01-0.05) to allow nuanced weighting. 
- ---- - -#### `token_id_start` and `token_id_step` (defaults: 1000, 100) - -**What they do**: -- **token_id_start**: First token ID in the sequence -- **token_id_step**: Increment between consecutive token IDs - -**Generated sequence**: `start, start+step, start+2*step, ...` - -**Example** with start=1000, step=100, num_tokens=5: -```json -{ - "1000": 0.3421, // token_id_start - "1100": 0.5234, // start + 1*step - "1200": 0.7821, // start + 2*step - "1300": 0.1523, // start + 3*step - "1400": 0.9102 // start + 4*step -} -``` - -**Use cases**: - -| Configuration | start | step | Use Case | -|---------------|-------|------|----------| -| Default testing | 1000 | 100 | Easy visual separation | -| Realistic vocab | 0 | 1 | Match actual vocab indices | -| Multi-field | 1000, 5000, 10000 | 1 | Separate vocabularies per field | -| Large vocab | 0 | 1 | Simulate 50K+ vocabulary | - -**Configuration**: -```yaml -field_overrides: - # Default: easy debugging - sparse_debug: - generator: generate_sparse_vector - params: - num_tokens: 10 - token_id_start: 1000 - token_id_step: 100 - - # Realistic: actual vocab indices - sparse_realistic: - generator: generate_sparse_vector - params: - num_tokens: 15 - token_id_start: 0 - token_id_step: 1 - - # Multiple fields: separate ranges - sparse_field1: - generator: generate_sparse_vector - params: - token_id_start: 1000 - - sparse_field2: - generator: generate_sparse_vector - params: - token_id_start: 5000 -``` - -**Note**: Token IDs are currently sequential in the generated data. Real sparse vectors have non-sequential IDs based on actual vocabulary, but this doesn't affect OpenSearch indexing or search functionality. - -**Best practice**: Use large step (100) for debugging; use step=1 for production-like data. - ---- - -### When to Use Simple vs Complex Approaches - -| Scenario | Approach | Why | -|----------|----------|-----| -| Learning / Quick Test | Simple (no config) | Fastest setup, good enough for basic testing | -| Load Testing | Simple | Volume matters more than realism | -| Realistic Benchmarks | Complex (with config) | Need realistic clustering and distributions | -| Production Simulation | Complex | Must match actual embedding model behavior | -| Search Quality Testing | Complex | Need proper vector clusters for recall/precision testing | - -**Rule of thumb**: If you're testing search quality or comparing algorithms, use complex configuration with sample vectors. diff --git a/_benchmark/features/synthetic-data-generation/advanced/index.md b/_benchmark/features/synthetic-data-generation/advanced/index.md deleted file mode 100644 index b01f10a6cc1..00000000000 --- a/_benchmark/features/synthetic-data-generation/advanced/index.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -layout: default -title: Advanced -nav_order: 30 -parent: Synthetic Data Generation -has_children: true -has_toc: false -redirect_from: - - /benchmark/features/synthetic-data-generation/advanced/ - - /benchmark/features/synthetic-data-generation/advanced/index -more_cards: - - heading: "Generating vectors" - description: "Generating synthetic dense and sparse vectors" - link: "/benchmark/features/synthetic-data-generation/advanced/generating-vectors" ---- - -# Advanced - -This documentation is for advanced options related to synthetic data generation. Read more in the following sections. 
- -{% include cards.html cards=page.more_cards %} \ No newline at end of file diff --git a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md index aac360ec87d..a1243695c65 100644 --- a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md +++ b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md @@ -1,34 +1,65 @@ --- layout: default -title: Custom Logic Approach +title: Generating data using custom logic nav_order: 35 -parent: Synthetic Data Generation -grand_parent: Features +parent: Synthetic data generation +grand_parent: Additional features --- -# Generating Data with Custom Logic +# Generating data using custom logic -To invoke synthetic data generation, you'll need to provide either one of the two required input files: -* OpenSearch index mappings -* Custom logic (via Python module) +You can generate synthetic data using custom logic defined in a Python module. This approach offers you the most granular control over how synthetic data is produced in OpenSearch Benchmark. This is especially useful if you understand the distribution of your data and the relationship between different fields. -This document explores using custom logic to generate synthetic data. +## The generate_synthetic_document function -### Prerequisites +Every custom module provided to OpenSearch Benchmark must define the `generate_synthetic_document(providers, **custom_lists)` function. This function defines how OpenSearch Benchmark generates each synthetic document. -* **Required**: Custom logic defined in Python module -* **Optional**: [Synthetic Data Generation Config](https://github.com/opensearch-project/opensearch-benchmark/blob/main/osbenchmark/resources/sdg-config.yml) +### Function parameters -Python module with custom logic **must include** `generate_synthetic_data(providers, **custom_lists)` within. -{: .important} +| Parameter | Required/Optional | Description | +|---|---|---| +| `providers` | Required | A dictionary containing data generation tools. Available providers are `generic` (Mimesis [Generic provider](https://mimesis.name/master/api.html#generic-providers)) and `random` (Mimesis [Random class](https://mimesis.name/master/random_and_seed.html)). To add custom providers, see [Advanced configuration](#advanced-configuration). | +| `**custom_lists` | Optional | Keyword arguments containing predefined lists of values that you can use in your data generation logic. These are defined in your YAML configuration file under `custom_lists` and allow you to separate data values from your Python code. For example, if you define `dog_names: [Buddy, Max, Luna]` in YAML, you can access it as `custom_lists['dog_names']` in your function. This makes it easy to modify data values without changing your Python code. | -### Overview +### Basic function template -This approach offers the most granular control over how synthetic data is produced in OpenSearch Benchmark. This is especially useful for users who understand the distribution of their data and the relationship between different fields. 
+```python +def generate_synthetic_document(providers, **custom_lists): + # Access the available providers + generic = providers['generic'] + random_provider = providers['random'] -An example of what a valid Python module with custom logic that can be provided is shown below: + # Generate a document using the providers + document = { + 'name': generic.person.full_name(), + 'age': random_provider.randint(18, 80), + 'email': generic.person.email(), + 'timestamp': generic.datetime.datetime() + } -```shell + # Optionally, use custom lists if provided + if 'categories' in custom_lists: + document['category'] = random_provider.choice(custom_lists['categories']) + + return document +``` +{% include copy.html %} + +For more information, see the [Mimesis documentation](https://mimesis.name/master/api.html). + +## Python module example + +The following example Python module demonstrates custom logic for generating documents about dog drivers for a fictional ride-sharing company, *Pawber*, which uses OpenSearch to store and search large volumes of ride-sharing data. + +This example showcases several advanced concepts: +- **[Custom provider classes](#advanced-configuration)** (`NumericString`, `MultipleChoices`) that extend Mimesis functionality +- **[Custom lists](#advanced-configuration)** for data values like dog names, breeds, and treats (referenced as `custom_lists['dog_names']`) +- **Geographic clustering** logic for realistic location data +- **Complex document structures** with nested objects and relationships + +Save this code to a file called `pawber.py` in your desired directory (for example, `~/pawber.py`): + +```python from mimesis.providers.base import BaseProvider from mimesis.enums import TimestampFormat @@ -135,32 +166,22 @@ def generate_synthetic_document(providers, **custom_lists): return document ``` -This example Python module has custom logic to generate documents related to dog drivers for a fictional ride-sharing company called *Pawber*, who uses OpenSearch to store and search across large volumes of ride-sharing data. +{% include copy.html %} -In the module above, notice that there's function called `generate_synthetic_data(providers, **custom_lists)`. OpenSearch Benchmark expects that all custom modules provided must have this function defined along with its parameters. This function informs OpenSearch Benchmark on how to generate a synthetic document. -{: .important} +## Generating data -Next, we'll see how we can use this to generate documents. - -### Command Parameters +To generate synthetic data using custom logic, use the `generate-data` subcommand and provide the required custom Python module, index name, output path, and total amount of data to generate: ```shell -osb generate-data --custom-module ~/Desktop/http-logs.py --index-name http-logs-regenerated --output-path ~/Desktop/sdg_outputs/ --total-size 2 +osb generate-data --custom-module ~/pawber.py --index-name pawber-data --output-path ~/Desktop/sdg_outputs/ --total-size 2 ``` +{% include copy.html %} -* `generate-data` (required): sub-command that activates synthetic data generation in OpenSearch Benchmark -* `--custom-module` or `-m` (required): Path to Python logic that includes custom logic - -For `--custom-module` parameter, the custom Python module provided must include `generate_synthetic_data(providers, **custom_lists)`. 
-{: .important} +For a complete list of available parameters and their descriptions, see the [`generate-data` command reference]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/generate-data/). -* `--index-name` or `-n` (required): Name of data corpora generated -* `--output-path` or `-p` (required): Path where data should be generated in -* `--total-size` or `-s` (required): Total amount of data that should be generated in GB -* `--custom-config` or `-c` (optional): Path to YAML config defining rules for how data should be generated. This is further explored in the subsequent section -* `--test-document` or `-t` (optional): When flag is present, OSB generates a single synthetic document and outputs to the console. Provides users a way to verify that the example document generated is aligned with expectations. When the flag is not present, the entire data corpora will be generated +## Example output -### Example Output +The following is an example output of generating 100 GB of data: ``` ____ _____ __ ____ __ __ @@ -174,7 +195,7 @@ For `--custom-module` parameter, the custom Python module provided must include [NOTE] ✨ Dashboard link to monitor processes and task streams: [http://127.0.0.1:8787/status] [NOTE] ✨ For users who are running generation on a virtual machine, consider SSH port forwarding (tunneling) to localhost to view dashboard. [NOTE] Example of localhost command for SSH port forwarding (tunneling) from an AWS EC2 instance: -ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ +ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ Total GB to generate: [1] Average document size in bytes: [412] @@ -189,39 +210,48 @@ Generated 24271844660 docs in 12000 seconds. Total dataset size is 100.21GB. [INFO] ✅ SUCCESS (took 272 seconds) ----------------------------------- ``` -This is an example output of what it might look like if you generated 100GB. +## Advanced configuration -### Using synthetic data generation config +You can optionally create a YAML configuration file to store custom data and providers. The configuration file must define a `CustomGenerationValues` parameter. -Using a synthetic data generation config is not necessary for this approach unless users prefer to store custom logic in the config file for organizational purposes. +The following parameters are available in `CustomGenerationValues`. Both parameters are optional. -To store custom logic in the config file, the synthetic data generation config must have *CustomGenerationValues* defined and can have *custom_lists* and *custom_providers* defined. +| Parameter | Required/Optional | Description | +|---|---|---| +| `custom_lists` | Optional | Predefined arrays of values that you can reference in your Python module using `custom_lists['list_name']`. This allows you to separate data values from your code logic, making it easy to modify data values without changing your Python file. For example, `dog_names: [Buddy, Max, Luna]` becomes accessible as `custom_lists['dog_names']`. | +| `custom_providers` | Optional | Custom data generation classes that extend Mimesis functionality. These should be defined as classes in your Python module (like `NumericString` or `MultipleChoices` in the [example](#python-module-example)) and then listed in this parameter by name. This allows you to create specialized data generators beyond what Mimesis provides by default. | -* **custom_lists** → Key, value pair mapping. Keys are names of lists and values are list of values. 
-* **custom_providers** → “Custom Providers” from Mimesis. Synthetic data generation in OpenSearch Benchmark uses Mimesis under the hood. These should be defined in the same file as the custom Python module supplied. +### Example configuration file -Example of synthetic data generation config with *CustomGenerationValues* defined: +Save your configuration in a YAML file: ```yml CustomGenerationValues: - # For users who want to generate data via a custom Python module + # Generate data using a custom Python module custom_lists: - # Custom lists for users who are using a custom Python module and want to consolidate all values in this YAML file + # Custom lists to consolidate all values in this YAML file dog_names: [Hana, Youpie, Charlie, Lucy, Cooper, Luna, Rocky, Daisy, Buddy, Molly] dog_breeds: [Jindo, Labrador, German Shepherd, Golden Retriever, Bulldog, Poodle, Beagle, Rottweiler, Boxer, Dachshund, Chihuahua] treats: [cookies, pup_cup, jerky] custom_providers: - # OSB's synthetic data generator uses mimesis and custom providers are essentially custom Python classes that adds more functionality to Mimesis + # OSB's synthetic data generator uses Mimesis; custom providers are essentially custom Python classes that adds more functionality to Mimesis - NumericString - MultipleChoices ``` +{% include copy.html %} + + +### Using the configuration -To use the synthetic data generation config with CustomGenerationValues defined, supply the following parameter to the generate-data command: +To use your configuration file, add the `--custom-config` parameter to the `generate-data` command: ```shell ---custom-config ~/Desktop/sdg-config.yml +osb generate-data --custom-module ~/pawber.py --index-name pawber-data --output-path ~/Desktop/sdg_outputs/ --total-size 2 --custom-config ~/Desktop/sdg-config.yml ``` +{% include copy.html %} -OpenSearch Benchmark will now be using those custom_lists and custom_providers defined when generating synthetic data. +## Related documentation +- [`generate-data` command reference]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/generate-data/) +- [Generating data using index mappings]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/mapping-sdg/) diff --git a/_benchmark/features/synthetic-data-generation/generating-vectors.md b/_benchmark/features/synthetic-data-generation/generating-vectors.md new file mode 100644 index 00000000000..0165e99d5e4 --- /dev/null +++ b/_benchmark/features/synthetic-data-generation/generating-vectors.md @@ -0,0 +1,543 @@ +--- +layout: default +title: Generating vectors +nav_order: 40 +parent: Synthetic data generation +grand_parent: Additional features +--- + +# Generating vectors + +You can generate synthetic dense and sparse vectors from mappings using OpenSearch Benchmark's synthetic data generator. + +## Dense vectors + +Dense vectors (represented by the [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/) field type in OpenSearch) are numerical representations of data, such as text or images, in which most or all dimensions have non-zero values. These vectors typically contain floating-point numbers between -1.0 and 1.0, with each dimension contributing to the overall meaning. 
+ +Example embedding for the word "dog": + +```json +{ + "embedding": [0.234, -0.567, 0.123, 0.891, -0.234, 0.456, ..., 0.789] +} +``` + +## Sparse vectors + +Sparse vectors (represented by the [`sparse_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/sparse-vector/) field type in OpenSearch) are vectors in which most dimensions are zero, represented as key-value pairs of non-zero token IDs and their weights. Think of sparse vectors as a dictionary of important words with their importance scores, in which only significant terms are stored. + +Example text: "Korean jindos are hunting dogs that have a reputation for being loyal, independent, and confident." + +Sparse vector representation of example text: + +```json +{ + "5432": 0.85, // "korean" - very important (specific descriptor) + "7821": 0.78, // "jindos" - very important (breed name) + "2": 0.45, // "dog" - moderately important (general category) + "9999": 0.32, // "loyal" - somewhat important (characteristic) + "1111": 0.12 // "things" - less important (common word) +} +``` +--- + +## Basic usage + +The following examples show how to generate vectors with minimal configuration using only OpenSearch index mappings. + +### Generating dense vectors + +Generate random 128-dimensional vectors with minimal configuration. + +**1. Create a mapping file** (`simple-knn-mapping.json`): + +```json +{ + "settings": { + "index.knn": true + }, + "mappings": { + "properties": { + "title": {"type": "text"}, + "my_embedding": { + "type": "knn_vector", + "dimension": 128 + } + } + } +} +``` +{% include copy.html %} + +**2. Generate data**: + +```bash +opensearch-benchmark generate-data \ + --index-name my-vectors \ + --index-mappings simple-knn-mapping.json \ + --output-path ./output \ + --total-size 1 +``` +{% include copy.html %} + +#### Generated output + +In each of the generated documents, the `my_embedding` field might appear as follows: + +```json +{ + "title": "Sample text 42", + "my_embedding": [0.234, -0.567, 0.123, ..., 0.891] // 128 random floats [-1.0, 1.0] +} +``` + +### Generating sparse vectors + +Generate sparse vectors with default configuration (10 tokens). + +**1. Create a mapping file** (`simple-sparse-mapping.json`): + +```json +{ + "mappings": { + "properties": { + "content": {"type": "text"}, + "sparse_embedding": { + "type": "sparse_vector" + } + } + } +} +``` +{% include copy.html %} + +**2. Generate data** (same command pattern): + +```bash +opensearch-benchmark generate-data \ + --index-name my-sparse \ + --index-mappings simple-sparse-mapping.json \ + --output-path ./output \ + --total-size 1 +``` +{% include copy.html %} + +#### Generated output + +In each of the generated documents, the `sparse_embedding` field might appear as follows: + +```json +{ + "content": "Sample text content", + "sparse_embedding": { + "1000": 0.3421, + "1100": 0.5234, + "1200": 0.7821, + "1300": 0.1523, + "1400": 0.9102, + "1500": 0.4567, + "1600": 0.2341, + "1700": 0.6789, + "1800": 0.8123, + "1900": 0.3456 + } +} +``` + +Using only an OpenSearch index mapping, OSB can generate synthetic dense and sparse vectors. However, this produces basic synthetic vectors. For more realistic distributions and clusterings, we recommend configuring the parameters described in the following section. + +--- + +## Dense vector (k-NN vector) parameters + +The following are parameters that you can add to your synthetic data generation configuration file (YAML Config) to fine-tune generation of dense vectors. 
These parameters are used in the `field_overrides` section with the `generate_knn_vector` generator. For complete configuration details, see [Advanced configuration](/benchmark/features/synthetic-data-generation/mapping-sdg/#advanced-configuration). + +#### dimension + +This parameter specifies the number of dimensions in the vector. Optional. + +**How to specify**: The `dimension` must be defined in your OpenSearch index mapping file. You can optionally override this value in your YAML configuration using the `dimension` parameter in `field_overrides`. + +**Impact**: +- **Memory**: Higher dimensions = more storage + - 128D ≈ 0.5 KB per vector + - 768D ≈ 3 KB per vector + - 1536D ≈ 6 KB per vector +- **Performance**: More dimensions = slower indexing and search +- **Quality**: Must match your actual embedding model's output + +The following table shows common dimension values and their typical use cases. + +| Dimension | Use Case | Example Models | +|-----------|----------|----------------| +| 128 | Lightweight, custom models | Custom embeddings, fast search | +| 384 | General purpose | sentence-transformers/all-MiniLM-L6-v2 | +| 768 | Standard NLP | BERT-Base, DistilBERT, MPNet | +| 1024 | High quality NLP | BERT-Large | +| 1536 | OpenAI standard | text-embedding-ada-002, text-embedding-3-small | +| 3072 | OpenAI premium | text-embedding-3-large | + +**Example**: + +```yaml +field_overrides: + my_embedding: + generator: generate_knn_vector + params: + dimension: 768 # Override mapping dimension if needed +``` +{% include copy.html %} + +**Best practice**: This parameter must match your embedding model's dimension. + +--- + +#### sample_vectors + +This parameter provides base vectors to which the generator adds noise, creating realistic variations and clusters. Optional, but highly recommended. + +Without sample vectors, OSB's synthetic data generator generates random uniform vectors across the entire space, which is unrealistic and offers poor search quality. Providing sample vectors allows OSB's synthetic data generator to create more realistic and natural clusters. + +After you prepare a list of sample vectors, insert them as a **list of lists**, in which each inner list is a complete vector. The following example provides sample vectors in the synthetic data generation configuration file: + +```yaml +field_overrides: + product_embedding: + generator: generate_knn_vector + params: + dimension: 768 + sample_vectors: + - [0.12, -0.34, 0.56, ..., 0.23] # Vector 1 (768 values) + - [-0.23, 0.45, -0.12, ..., -0.15] # Vector 2 (768 values) + - [0.34, 0.21, -0.45, ..., 0.42] # Vector 3 (768 values) +``` +{% include copy.html %} + +Use the following guidelines to determine the number of vectors that you provide: + +- **Minimum**: 3--5 for basic clustering +- **Recommended**: 5--10 for realistic distribution +- **Maximum**: 20+ for complex multi-cluster scenarios + +**How to obtain sample vectors**: + +**Option 1: Using actual embeddings from your domain (Recommended)**: Use actual embeddings from your domain, representing different semantic clusters. Random generation without sample vectors produces unrealistic data unsuitable for search quality testing. 
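+
+For example, if you already store real embeddings in an existing cluster, you can export a few of them and paste them into your configuration. The following is a minimal sketch rather than an OSB feature: it uses the `opensearch-py` client and assumes an existing index named `products` with embeddings stored in a `product_embedding` field, so adjust the index name, field name, and connection details to match your environment.
+
+```python
+from opensearchpy import OpenSearch
+
+# Connect to a cluster that already contains real embeddings (adjust host and auth as needed)
+client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
+
+# Fetch a handful of documents, keeping only the embedding field
+response = client.search(
+    index="products",                      # assumed existing index
+    body={
+        "size": 5,
+        "_source": ["product_embedding"],  # assumed embedding field name
+        "query": {"match_all": {}}
+    },
+)
+
+# Collect the vectors so they can be pasted under sample_vectors in the SDG config
+sample_vectors = [hit["_source"]["product_embedding"] for hit in response["hits"]["hits"]]
+print(sample_vectors)
+```
+{% include copy.html %}
+
+Copy the printed list into the `sample_vectors` parameter of your configuration, trimming or rounding the values if you want a smaller file.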
+ +**Option 2: Using sentence-transformers** in Python: + +```python +from sentence_transformers import SentenceTransformer + +model = SentenceTransformer('all-MiniLM-L6-v2') + +# Create representative texts from different categories +texts = [ + "Electronics and gadgets", + "Clothing and fashion", + "Home and kitchen appliances", + "Books and literature", + "Sports and outdoor equipment" +] + +embeddings = model.encode(texts) +print(embeddings.tolist()) # Copy to your synthetic data generation configuration file (YAML config) +``` +{% include copy.html %} + +--- + +#### distribution_type + +This parameter specifies the type of noise distribution. Optional. Default is `gaussian`. + +**Valid values**: +- `gaussian`: Normal distribution N(0, `noise_factor`) + - Most realistic (natural variation with occasional outliers) + - Produces smooth clusters + - Some values can extend beyond expected range + +- `uniform`: Uniform distribution [-`noise_factor`, +`noise_factor`] + - Bounded variation (no extreme outliers) + - More predictable results + - Flat probability across range + +**Configuration**: +```yaml +field_overrides: + realistic_embedding: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.1 + distribution_type: gaussian # More realistic + + controlled_embedding: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.1 + distribution_type: uniform # More predictable +``` +{% include copy.html %} + +**Best practice**: Use `gaussian` for production-like benchmarks. + +--- + +#### noise_factor + +This parameter controls the amount of noise added to base vectors: +- For `gaussian`: Standard deviation of normal distribution +- For `uniform`: Range of uniform distribution (±`noise_factor`) + +Optional. Default is `0.1`. + +The following table shows how different `noise_factor` values impact the generated data. + +| `noise_factor` | Effect | Use Case | +|--------------|--------|----------| +| 0.01--0.05 | Tight clustering, minimal variation | Duplicate detection, near-exact matches | +| 0.1--0.2 | Natural variation within topic | General semantic search, recommendations | +| 0.3--0.5 | Wide dispersion, diverse concepts | Broad topic matching, discovery | +| > 0.5 | Very scattered, overlapping clusters | Testing edge cases, stress testing | + +**Configuration**: + +```yaml +field_overrides: + tight_clustering: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.05 # Tight clusters + + diverse_results: + generator: generate_knn_vector + params: + sample_vectors: [...] + noise_factor: 0.2 # More variation +``` +{% include copy.html %} + +**Best practice**: Start with `0.1`, then adjust based on search recall or precision requirements. + +--- + +#### normalize + +This parameter normalizes vectors after noise addition, making their magnitude (length) exactly `1.0`. Optional. Default is `false`. + +The following table shows when to set `normalize` to `true` based on your index configuration. + +| `space_type` in the index mapping | `normalize` value | Explanation | +| --------------------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `cosinesimil` | `true` | Cosine similarity depends only on vector direction. Pre-normalizing improves performance because the dot product directly represents cosine similarity. 
| +| `l2` | `false` | L2 distance relies on vector magnitude. Normalizing removes magnitude information and reduces accuracy. | +| `innerproduct` | `false` | Inner product incorporates vector magnitude into the similarity score, so normalization would change the intended scoring behavior. | + +**Real-world model guidance**: + +* **OpenAI embeddings**: These vectors are pre-normalized, so set `normalize` to `true`. +* **sentence-transformers**: Many models output normalized vectors. Review the model documentation; in most cases, `normalize` should be set to `true`. +* **BERT (raw output)**: Raw BERT embeddings are not normalized. Set `normalize` to `false` and rely on the index configuration to perform normalization if needed. + + +**Configuration**: + +```yaml +field_overrides: + # For cosine similarity search + cosine_embedding: + generator: generate_knn_vector + params: + dimension: 384 + sample_vectors: [...] + normalize: true # Required for accurate cosine similarity + + # For L2 distance search + l2_embedding: + generator: generate_knn_vector + params: + dimension: 768 + sample_vectors: [...] + normalize: false # Keep original magnitudes +``` +{% include copy.html %} + +**Best practice**: Match your OpenSearch index's `space_type` setting. + +--- + +## Sparse vectors parameters + +The following are parameters that you can add to your synthetic data generation configuration file to finetune how sparse vectors are generated. These parameters are used in the `field_overrides` section with the `generate_sparse_vector` generator. For complete configuration details, see [Advanced configuration](/benchmark/features/synthetic-data-generation/mapping-sdg/#advanced-configuration). + +#### num_tokens + +This parameter specifies the number of token-weight pairs to generate per vector. Optional. Default is `10`. + +**Impact**: +- **Low (5--10)**: Very sparse, fast search, may miss some relevant documents +- **Medium (10--25)**: Balanced performance and recall +- **High (50--100)**: Dense sparse representation, comprehensive but slower + +The following table shows typical `num_tokens` values for different models and approaches. + +| Model/Approach | Typical `num_tokens` | Use Case | +|----------------|-------------------|----------| +| SPLADE v1 | 10--15 | Standard sparse neural search | +| SPLADE v2 | 15--25 | Improved recall | +| DeepImpact | 8--12 | Efficient sparse search | +| Custom/Hybrid | 20--50 | Rich representations | + +**Configuration**: + +```yaml +field_overrides: + sparse_standard: + generator: generate_sparse_vector + params: + num_tokens: 15 # Standard SPLADE-like + + sparse_rich: + generator: generate_sparse_vector + params: + num_tokens: 30 # Richer representation +``` +{% include copy.html %} + +**Best practice**: Start with `10--15`; increase if recall is insufficient. + +--- + +#### min_weight and max_weight + +These parameters define the range of token importance weights. Optional. Default `min_weight` is `0.01`; default `max_weight` is `1.0`. + +**Impact**: +- `min_weight`: Excludes low-importance tokens from generation. Tokens with weights below this value are not included. +- `max_weight`: Limits the upper bound of token influence to prevent any single token from dominating the vector. + +The following table shows common weight range configurations and their use cases. 
+ +| Configuration | `min_weight` | `max_weight` | Use case | +|---------------|-----|-----|----------| +| Standard SPLADE | `0.01` | `1.0` | Default, balanced importance | +| Narrow range | `0.1` | `0.9` | More uniform importance | +| Wide range | `0.01` | `2.0` | Strong importance signals | +| High threshold | `0.05` | `1.0` | Filters low-confidence tokens | + +**Configuration**: + +```yaml +field_overrides: + sparse_balanced: + generator: generate_sparse_vector + params: + num_tokens: 15 + min_weight: 0.01 + max_weight: 1.0 + + sparse_uniform: + generator: generate_sparse_vector + params: + num_tokens: 20 + min_weight: 0.2 # Higher minimum + max_weight: 0.8 # Lower maximum +``` +{% include copy.html %} + +**Constraints**: +- `min_weight` must be > `0.0` (OpenSearch requires positive weights) +- `max_weight` must be > `min_weight` +- Weights are rounded to `4` decimal places + +**Best practice**: Keep `min_weight` small (`0.01--0.05`) to allow nuanced weighting. + +--- + +#### token_id_start and token_id_step + +These parameters define how token IDs are assigned during vector generation: + +- `token_id_start`: Sets the starting token ID in the generated sequence. Default is `1000`. + +- `token_id_step`: Specifies the increment applied between each consecutive token ID. Default is `100`. + +**Generated sequence**: `start, start+step, start+2*step, ...` + +**Example** with `start=1000`, `step=100`, `num_tokens=5`: + +```json +{ + "1000": 0.3421, // token_id_start + "1100": 0.5234, // start + 1*step + "1200": 0.7821, // start + 2*step + "1300": 0.1523, // start + 3*step + "1400": 0.9102 // start + 4*step +} +``` +{% include copy.html %} + +The following table shows different token ID configurations and their use cases. + +| Configuration | `token_id_start` | `token_id_step` | Use case | +| --------------------------- | ----------------- | --------------- | ------------------------------------------------------------------ | +| Default testing | `1000` | `100` | Helps visually distinguish generated token ranges. | +| Realistic vocabulary | `0` | `1` | Aligns token IDs with a real model's vocabulary indices. | +| Multi-field generation | `1000`, `5000`, `10000` | `1` | Keeps token ID ranges separate across different fields. | +| Large vocabulary simulation | `0` | `1` | Supports generation scenarios with vocabularies of `50,000`+ tokens. | + +**Configuration**: + +```yaml +field_overrides: + # Default: easy debugging + sparse_debug: + generator: generate_sparse_vector + params: + num_tokens: 10 + token_id_start: 1000 + token_id_step: 100 + + # Realistic: actual vocab indices + sparse_realistic: + generator: generate_sparse_vector + params: + num_tokens: 15 + token_id_start: 0 + token_id_step: 1 + + # Multiple fields: separate ranges + sparse_field1: + generator: generate_sparse_vector + params: + token_id_start: 1000 + + sparse_field2: + generator: generate_sparse_vector + params: + token_id_start: 5000 +``` +{% include copy.html %} + +**Note**: Token IDs in the generated data are sequential. In real sparse vectors, IDs may be non-sequential based on the actual vocabulary. This difference does not impact OpenSearch indexing or search functionality. + +**Best practice**: Use a larger `token_id_step` (for example, `100`) for debugging, and set `token_id_step` to `1` for production-like data. + +--- + +## Choosing simple or complex generation approaches + +The following table outlines when to use simple generation versus a more complex, configurable approach based on your testing goals. 
+ +| Scenario | Recommended approach | Rationale | +| ------------------------- | ----------------------------------------------- | --------------------------------------------------------------------------------------------- | +| Learning or quick testing | Simple generation (no additional configuration) | Provides the fastest setup and is sufficient for basic validation. | +| Load testing | Simple generation | Prioritizes data volume and throughput over vector realism. | +| Realistic benchmarks | Complex generation (with configuration) | Requires realistic vector clustering and distributions to reflect real-world behavior. | +| Production simulation | Complex generation | Needs vector characteristics that closely match those produced by the actual embedding model. | +| Search quality testing | Complex generation | Requires meaningful vector clusters to evaluate recall and precision accurately. | + + +**Recommendation**: For search quality testing or algorithm comparisons, use complex configuration with sample vectors to ensure realistic data distributions. diff --git a/_benchmark/features/synthetic-data-generation/index.md b/_benchmark/features/synthetic-data-generation/index.md index 6b6135fea7a..326c3fdcb2f 100644 --- a/_benchmark/features/synthetic-data-generation/index.md +++ b/_benchmark/features/synthetic-data-generation/index.md @@ -1,44 +1,48 @@ --- layout: default -title: Synthetic Data Generation +title: Synthetic data generation nav_order: 5 has_children: true -parent: Features +parent: Additional features has_toc: false redirect_from: - /benchmark/features/synthetic-data-generation/ - - /benchmark/features/synthetic-data-generation/index/ -more_cards: - - heading: "OpenSearch Index Mappings Approach" - description: "Generating synthetic data with OpenSearch Index Mappings" +cards: + - heading: "Generate data using index mappings" + description: "Create synthetic data based on your OpenSearch index mappings." link: "/benchmark/features/synthetic-data-generation/mapping-sdg/" - - heading: "Custom Logic Approach" - description: "Generate synthetic data with custom logic" + - heading: "Generate data using custom logic" + description: "Build synthetic data using your own scripts or domain-specific rules." link: "/benchmark/features/synthetic-data-generation/custom-logic-sdg/" - - heading: "Advanced" - description: "Advanced guide on generating data" - link: "/benchmark/features/synthetic-data-generation/advanced/" - - heading: "Tips and Tricks" - description: "Advice to help you generate data efficiently" +more_cards: + - heading: "Generating vectors" + description: "Generate synthetic dense and sparse vectors with configurable parameters for realistic AI/ML benchmarking scenarios." + link: "/benchmark/features/synthetic-data-generation/generating-vectors/" +tip_cards: + - heading: "Tips and best practices" + description: "Learn practical guidance and best practices to optimize your synthetic data generation workflows." link: "/benchmark/features/synthetic-data-generation/tips/" --- -# Synthetic Data Generation +# Synthetic data generation +**Introduced 2.0** +{: .label .label-purple } + +OpenSearch Benchmark provides a built-in synthetic data generator that can create datasets for any use case at any scale. It currently supports two generation methods: -## Overview +* **Random data generation** produces fields with randomized values. This is useful for stress testing and evaluating system performance under load. 
+* **Rule-based data generation** creates data according to user-defined rules. This is helpful for testing specific scenarios, benchmarking query behavior, or simulating domain-specific patterns. -Starting in 2.0, OpenSearch Benchmark comes with its own synthetic data generator that can generate any dataset, for any use case, at any scale. OpenSearch Benchmark’s synthetic data generator currently uses Random data generation and Rule-based generation to generate synthetic data. +## Data generation methods -* Random data generation is data generated with random values. This is useful for testing stress testing systems. -* Rule-based synthesized data is data generated by specific rules defined by users. This is useful for stress testing systems and benchmarking use-cases. +OpenSearch Benchmark currently supports the following data generation methods. -However, there are plans to support other techniques such as: +{% include cards.html cards=page.cards %} -* **Data masking and anonymization**: altering existing data’s specific fields to ensure privacy -* **Data transformation**: taking existing data and using statistical methods to generate new data +For advanced synthetic data generation capabilities, explore vector generation. -## Usage +{% include cards.html cards=page.more_cards %} -Users can generate synthetic data with the subcommand generate-data. There are currently two ways to generate synthetic data in OpenSearch — with an OpenSearch index mapping or a custom Python module. The following pages explore generating data with OpenSearch index mappings, generating data with a custom Python module, and general tips and tricks users can use. +## Tips and best practices -{% include cards.html cards=page.more_cards %} \ No newline at end of file +{% include cards.html cards=page.tip_cards %} diff --git a/_benchmark/features/synthetic-data-generation/mapping-sdg.md b/_benchmark/features/synthetic-data-generation/mapping-sdg.md index c160b937209..fdbd894eac2 100644 --- a/_benchmark/features/synthetic-data-generation/mapping-sdg.md +++ b/_benchmark/features/synthetic-data-generation/mapping-sdg.md @@ -1,26 +1,16 @@ --- layout: default -title: Mapping Approach +title: Generating data using index mappings nav_order: 15 -parent: Synthetic Data Generation -grand_parent: Features +parent: Synthetic data generation +grand_parent: Additional features --- -# Generating Data with OpenSearch Index Mappings +# Generating data using index mappings -To invoke synthetic data generation, you'll need to provide either one of the two required input files: -* OpenSearch index mappings -* Custom logic (via Python module) +You can use OpenSearch index mappings to generate synthetic data. This approach offers a balance between automation and customization. -This document explores using OpenSearch index mappings to generate synthetic data. - -### Prerequisites - -* **Required**: OpenSearch Index Mapping -* **Optional**: Synthetic Data Generation Config - -### Overview -This approach offers a balance between automation and customization. Synthetic data generation in OpenSearch Benchmark can use basic OpenSearch index mappings like this: +To use this method, save your OpenSearch index mappings to a JSON file: ```json { @@ -64,239 +54,245 @@ This approach offers a balance between automation and customization. Synthetic d } ``` -or complex OpenSearch index mappings like this: +OpenSearch Benchmark works with any valid index mappings, regardless of complexity. 
You can provide more complex mappings similar to the following: + +
+ + Mappings + + {: .text-delta} ```json { - "mappings": { - "dynamic": "strict", - "properties": { - "user": { - "type": "object", - "properties": { - "id": { - "type": "keyword" - }, - "email": { - "type": "keyword" - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - }, - "completion": { - "type": "completion" - } + "mappings": { + "dynamic": "strict", + "properties": { + "user": { + "type": "object", + "properties": { + "id": { + "type": "keyword" + }, + "email": { + "type": "keyword" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 }, - "analyzer": "standard" - }, - "address": { - "type": "object", - "properties": { - "street": { - "type": "text" - }, - "city": { - "type": "keyword" - }, - "state": { - "type": "keyword" - }, - "zip": { - "type": "keyword" - }, - "location": { - "type": "geo_point" - } + "completion": { + "type": "completion" } }, - "preferences": { - "type": "object", - "dynamic": true + "analyzer": "standard" + }, + "address": { + "type": "object", + "properties": { + "street": { + "type": "text" + }, + "city": { + "type": "keyword" + }, + "state": { + "type": "keyword" + }, + "zip": { + "type": "keyword" + }, + "location": { + "type": "geo_point" + } } + }, + "preferences": { + "type": "object", + "dynamic": true } - }, - "orders": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "date": { - "type": "date", - "format": "strict_date_optional_time||epoch_millis" - }, - "amount": { - "type": "scaled_float", - "scaling_factor": 100 - }, - "status": { - "type": "keyword" - }, - "items": { - "type": "nested", - "properties": { - "product_id": { - "type": "keyword" - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } + } + }, + "orders": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "date": { + "type": "date", + "format": "strict_date_optional_time||epoch_millis" + }, + "amount": { + "type": "scaled_float", + "scaling_factor": 100 + }, + "status": { + "type": "keyword" + }, + "items": { + "type": "nested", + "properties": { + "product_id": { + "type": "keyword" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" } - }, - "quantity": { - "type": "short" - }, - "price": { - "type": "float" - }, - "categories": { - "type": "keyword" } + }, + "quantity": { + "type": "short" + }, + "price": { + "type": "float" + }, + "categories": { + "type": "keyword" } - }, - "shipping_address": { - "type": "object", - "properties": { - "street": { - "type": "text" - }, - "city": { - "type": "keyword" - }, - "state": { - "type": "keyword" - }, - "zip": { - "type": "keyword" - }, - "location": { - "type": "geo_point" - } + } + }, + "shipping_address": { + "type": "object", + "properties": { + "street": { + "type": "text" + }, + "city": { + "type": "keyword" + }, + "state": { + "type": "keyword" + }, + "zip": { + "type": "keyword" + }, + "location": { + "type": "geo_point" } } } - }, - "activity_log": { - "type": "nested", - "properties": { - "timestamp": { - "type": "date" - }, - "action": { - "type": "keyword" - }, - "ip_address": { - "type": "ip" - }, - "details": { - "type": "object", - "enabled": false - } + } + }, + "activity_log": { + "type": "nested", + "properties": { + "timestamp": { + "type": "date" + }, + "action": { + "type": "keyword" + }, + "ip_address": { + "type": "ip" + }, + "details": { + "type": "object", + "enabled": false } - }, - 
"metadata": { - "type": "object", - "properties": { - "created_at": { - "type": "date" - }, - "updated_at": { - "type": "date" - }, - "tags": { - "type": "keyword" - }, - "source": { - "type": "keyword" - }, - "version": { - "type": "integer" - } + } + }, + "metadata": { + "type": "object", + "properties": { + "created_at": { + "type": "date" + }, + "updated_at": { + "type": "date" + }, + "tags": { + "type": "keyword" + }, + "source": { + "type": "keyword" + }, + "version": { + "type": "integer" } - }, - "description": { - "type": "text", - "analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256 - }, - "standard": { - "type": "text", - "analyzer": "standard" - } + } + }, + "description": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + }, + "standard": { + "type": "text", + "analyzer": "standard" } - }, - "ranking_scores": { - "type": "object", - "properties": { - "popularity": { - "type": "float" - }, - "relevance": { - "type": "float" - }, - "quality": { - "type": "float" - } + } + }, + "ranking_scores": { + "type": "object", + "properties": { + "popularity": { + "type": "float" + }, + "relevance": { + "type": "float" + }, + "quality": { + "type": "float" } - }, - "permissions": { - "type": "nested", - "properties": { - "user_id": { - "type": "keyword" - }, - "role": { - "type": "keyword" - }, - "granted_at": { - "type": "date" - } + } + }, + "permissions": { + "type": "nested", + "properties": { + "user_id": { + "type": "keyword" + }, + "role": { + "type": "keyword" + }, + "granted_at": { + "type": "date" } } } - }, - "settings": { - "number_of_shards": 3, - "number_of_replicas": 2, - "analysis": { - "analyzer": { - "email_analyzer": { - "type": "custom", - "tokenizer": "uax_url_email", - "filter": ["lowercase", "stop"] - } + } + }, + "settings": { + "number_of_shards": 3, + "number_of_replicas": 2, + "analysis": { + "analyzer": { + "email_analyzer": { + "type": "custom", + "tokenizer": "uax_url_email", + "filter": ["lowercase", "stop"] } } } } +} ``` -In the next section, we'll use the example index mappings (or your own) to generate synthetic documents. +
+ +## Generating data + +To generate synthetic data using index mappings, use the `generate-data` subcommand and provide the required index mappings file, index name, output path, and total amount of data to generate: -## Command Parameters -A basic command that activates synthetic data generation with an OpenSearch index mapping: ```shell -osb generate-data --index-name --index-mappings --output-path --total-size +osb generate-data --index-name --index-mappings --output-path --total-size ``` -* `generate-data` (required): sub-command that activates synthetic data generation in OpenSearch Benchmark -* `--index-mappings` or `-i` (required): Path to OpenSearch index mappings -* `--index-name` or `-n` (required): Name of data corpora generated -* `--output-path` or `-p` (required): Path where data should be generated in -* `--total-size` or `-s` (required): Total amount of data that should be generated in GB -* `--custom-config` or `-c` (optional): Path to YAML config defining rules for how data should be generated. This is further explored in the subsequent section -* `--test-document` or `-t` (optional): When flag is present, OSB generates a single synthetic document and outputs to the console. Provides users a way to verify that the example document generated is aligned with expectations. When the flag is not present, the entire data corpora will be generated +{% include copy.html %} + +For a complete list of available parameters and their descriptions, see the [`generate-data` command reference]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/generate-data/). + +## Example output -### Example Output +The following is an example output of generating 100 GB of data: ``` ____ _____ __ ____ __ __ @@ -310,7 +306,7 @@ osb generate-data --index-name --index-mappings -N -L localhost:8787:localhost:8787 ec2-user@ +ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ Total GB to generate: [1] Average document size in bytes: [412] @@ -325,22 +321,11 @@ Generated 24271844660 docs in 12000 seconds. Total dataset size is 100.21GB. [INFO] ✅ SUCCESS (took 272 seconds) ----------------------------------- ``` -This is an example output of what it might look like if you generated 100GB. -## Using synthetic data generation config +## Advanced configuration -Users can have more control over how data is generated with the help of the synthetic data generation config. When generating synthetic data with an OpenSearch index mappings, the synthetic data generation config should have *MappingGenerationValues* defined and either *generator_overrides*, *field_overrides*, or both defined. +You can control how synthetic data is generated by creating a YAML configuration file. The following is an example configuration file that defines custom rules in the `MappingGenerationValues` parameter: -* **MappingGenerationValues** → When synthetic data generator is using OpenSearch index mappings to generate synthetic data, it looks for this section for additional instructions -* **generator_overrides** → For each type of generator defined in this section, the synthetic data generator uses these rules to generate synthetic data for that OpenSearch mapping field type. -* **field_overrides** → For each field defined, the synthetic data generator uses these rules to generate synthetic data for that specific field - -If both generator_overrides and field_overrides are defined, field_overrides have a higher precedence than generator_overrides. 
-{: .important} - -### Example sdg-config.yml - -Example of MappingGenerationValues defined: ```yml MappingGenerationValues: # For users who want more granular control over how data is generated when providing an OpenSearch mapping @@ -386,11 +371,61 @@ MappingGenerationValues: params: choices: ["Python", "English"] ``` +{% include copy.html %} + +`MappingGenerationValues` supports the following parameters. -To use this synthetic data generation config, append the following parameter and path to the YAML config to the `generate-data` command: +| Parameter | Description | +|---|---| +| `generator_overrides` | Defines custom generator rules for specific OpenSearch field types. Any field that uses the corresponding type will follow these rules. See [Generator overrides parameters](#generator-overrides-parameters). | +| `field_overrides` | Defines generator rules for individual fields by field name. These apply only to the fields explicitly listed. For nested fields, use dot notation (for example, `orders.items.product_id`). See [Field overrides parameters](#field-overrides-parameters). | + +If both `generator_overrides` and `field_overrides` are present, `field_overrides` take precedence. +{: .important} + +#### Generator overrides parameters + +The following parameters are available for each OpenSearch field type in `generator_overrides`. + +| Field type | Parameters | +|---|---| +| `integer`, `long`, `short`, `byte` | `min`, `max` | +| `float`, `double` | `min`, `max`, `round` (the number of decimal places to round to) | +| `date` | `start_date`, `end_date`, `format` | +| `text` | `must_include` (array of terms to include in generated text) | +| `keyword` | `choices` (array of keywords to randomly select from) | + +#### Field overrides parameters + +The following generators and their parameters are available for use in `field_overrides`. + +| Generator | Parameters | +|---|---| +| `generate_text` | `must_include` (array of terms to include in generated text) | +| `generate_keyword` | `choices` (array of keywords to randomly select from) | +| `generate_integer` | `min`, `max` | +| `generate_long` | `min`, `max` | +| `generate_short` | `min`, `max` | +| `generate_byte` | `min`, `max` | +| `generate_float` | `min`, `max`, `round` (the number of decimal places to round to) | +| `generate_double` | `min`, `max` | +| `generate_boolean` | N/A| +| `generate_date` | `format`, `start_date`, `end_date` | +| `generate_ip` | N/A| +| `generate_geo_point` | N/A| +| `generate_knn_vector` | `dimension`, `sample_vectors`, `noise_factor`, `distribution_type`, `normalize`. See [Advanced techniques](/benchmark/features/synthetic-data-generation/advanced/). | +| `generate_sparse_vector` | `num_tokens`, `min_weight`, `max_weight`, `token_id_start`, `token_id_step`. See [Advanced techniques](/benchmark/features/synthetic-data-generation/advanced/). | + +### Using the configuration + +To use your configuration file, provide its full path in the `--custom-config` parameter: ```shell ---custom-config ~/Desktop/sdg-config.yml +osb generate-data --index-name --index-mappings --output-path --total-size --custom-config ~/Desktop/sdg-config.yml ``` +{% include copy.html %} + +## Related documentation -OpenSearch Benchmark should now be generating synthetic data with these rules in mind. 
+- [`generate-data` command reference]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/generate-data/) +- [Generating data using custom logic]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/custom-logic-sdg/) \ No newline at end of file diff --git a/_benchmark/features/synthetic-data-generation/tips.md b/_benchmark/features/synthetic-data-generation/tips.md index e59b2d148c0..e93e621e4f0 100644 --- a/_benchmark/features/synthetic-data-generation/tips.md +++ b/_benchmark/features/synthetic-data-generation/tips.md @@ -1,15 +1,23 @@ --- layout: default -title: Tips & Tricks +title: Tips and best practices nav_order: 45 -parent: Synthetic Data Generation -grand_parent: Features +parent: Synthetic data generation +grand_parent: Additional features --- -# Tips & tricks +# Tips and best practices -### Visualizing Generation -The URL outputted takes users to a Dask Dashboard that visualizes the generation process. Users can keep track of CPU and memory of each worker as well as obtain a CPU flamegraph of the generation process. This is helpful for monitoring the load generation's resources when generating data or optimizing generation when using a custom python module. +The following tips help you efficiently generate synthetic data and monitor performance during the process. -### Use Default Settings -We recommend using the default settings that come with Synthetic Data Generation. Workers should be no more than the CPU count on the load generation host and chunk sizes should be 10,000 docs per chunk. However, users are encouraged to change the `max_file_size_gb` field as needed. This just changes how much data should be stored in each file generated. +### Visualizing generation + +The generated URL opens a [Dask dashboard](https://docs.dask.org/en/latest/dashboard.html) that visualizes the data generation process. You can monitor CPU and memory usage for each worker and view a CPU flamegraph of the generation workflow. This helps track resource usage and optimize performance, especially when using a [custom Python module]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/custom-logic-sdg/). + +### Use default settings + +We recommend starting with the default synthetic data generation settings. These guidelines help you choose appropriate settings for efficient and reliable synthetic data generation: + +* Set the number of workers to **no more than the CPU count** on the load generation host. +* Use a **chunk size of 10,000 documents** per chunk. +* Adjust the `max_file_size_gb` setting as needed to control how much data is written to each generated file. 
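If you do decide to tune the number of workers, the preceding guideline caps it at the CPU count of the load generation host. You can check that count with standard operating system utilities (these are not OpenSearch Benchmark commands):

```shell
# Number of logical CPUs on a Linux load generation host
nproc

# Equivalent command on macOS
sysctl -n hw.ncpu
```
{% include copy.html %}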
diff --git a/_benchmark/reference/commands/aggregate.md b/_benchmark/reference/commands/aggregate.md index a891bf3edf1..39dfadedaa1 100644 --- a/_benchmark/reference/commands/aggregate.md +++ b/_benchmark/reference/commands/aggregate.md @@ -1,7 +1,7 @@ --- layout: default title: aggregate -nav_order: 85 +nav_order: 10 parent: Command reference grand_parent: OpenSearch Benchmark Reference redirect_from: diff --git a/_benchmark/reference/commands/command-flags.md b/_benchmark/reference/commands/command-flags.md index 12837cc98bf..50fd9e9390a 100644 --- a/_benchmark/reference/commands/command-flags.md +++ b/_benchmark/reference/commands/command-flags.md @@ -1,7 +1,7 @@ --- layout: default title: Command flags -nav_order: 51 +nav_order: 150 parent: Command reference redirect_from: - /benchmark/commands/command-flags/ diff --git a/_benchmark/reference/commands/compare.md b/_benchmark/reference/commands/compare.md index 35bafe07047..aa9e6ac1a7a 100644 --- a/_benchmark/reference/commands/compare.md +++ b/_benchmark/reference/commands/compare.md @@ -1,7 +1,7 @@ --- layout: default title: compare -nav_order: 55 +nav_order: 20 parent: Command reference grand_parent: OpenSearch Benchmark Reference redirect_from: diff --git a/_benchmark/reference/commands/download.md b/_benchmark/reference/commands/download.md index 34e93551dbb..82a1fd26120 100644 --- a/_benchmark/reference/commands/download.md +++ b/_benchmark/reference/commands/download.md @@ -1,7 +1,7 @@ --- layout: default title: download -nav_order: 60 +nav_order: 30 parent: Command reference grand_parent: OpenSearch Benchmark Reference redirect_from: diff --git a/_benchmark/reference/commands/execute-test.md b/_benchmark/reference/commands/execute-test.md index 4dbfa41da6f..50d823038b7 100644 --- a/_benchmark/reference/commands/execute-test.md +++ b/_benchmark/reference/commands/execute-test.md @@ -1,7 +1,7 @@ --- layout: default title: execute-test -nav_order: 65 +nav_order: 40 parent: Command reference grand_parent: OpenSearch Benchmark Reference redirect_from: diff --git a/_benchmark/reference/commands/generate-data.md b/_benchmark/reference/commands/generate-data.md index 0f12b69c4e4..1ee74252b69 100644 --- a/_benchmark/reference/commands/generate-data.md +++ b/_benchmark/reference/commands/generate-data.md @@ -1,7 +1,7 @@ --- layout: default title: generate-data -nav_order: 75 +nav_order: 50 parent: Command reference grand_parent: OpenSearch Benchmark Reference redirect_from: @@ -10,32 +10,52 @@ redirect_from: # generate-data -### Usage +The `generate-data` command creates synthetic datasets for benchmarking and testing. OpenSearch Benchmark supports two methods for data generation: using OpenSearch index mappings or custom Python modules with user-defined logic. For more information, see [Synthetic data generation]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/). -Users can generate synthetic data with the subcommand generate-data. There are currently two ways to generate synthetic data in OpenSearch — with an OpenSearch index mapping or a custom Python module. The following pages explore generating data with OpenSearch index mappings, generating data with a custom Python module, and general tips and tricks users can use. 
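As a quick sanity check before generating a full corpus, you can combine the mapping-based invocation shown below with the `--test-document` flag (described in the options table) to print a single generated document to the console. The index name and file paths in this sketch are placeholders:

```shell
osb generate-data --index-name my-index --index-mappings mapping.json --output-path ./data --total-size 1 --test-document
```
{% include copy.html %}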
+## Usage -For more information and examples, see [Synthetic Data Generation Guide]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/) +```shell +osb generate-data --index-name --output-path --total-size [OPTIONS] +``` + +**Requirements**: + +- Either `--index-mappings` or `--custom-module` must be specified, but not both. +- When using `--custom-module`, your Python module must include the `generate_synthetic_document(providers, **custom_lists)` function. + +## Data generation methods + +Choose one of the following approaches: + +**Method 1: Using index mappings**: -### Options +```shell +osb generate-data --index-name my-index --index-mappings mapping.json --output-path ./data --total-size 1 +``` + +**Method 2: Using custom Python module**: ```shell -osb generate-data --custom-module ~/Desktop/http-logs.py --index-name http-logs-regenerated --output-path ~/Desktop/sdg_outputs/ --total-size 2 +osb generate-data --index-name my-index --custom-module custom.py --output-path ./data --total-size 1 ``` -* `generate-data` (required): sub-command that activates synthetic data generation in OpenSearch Benchmark -* `--index-mappings` or `-i` (required): Path to OpenSearch index mappings. If present, `--custom-module` cannot be used. -* `--custom-module` or `-m` (required): Path to Python logic that includes custom logic. If present, `--index-mappings` cannot be used. +## Options + +Use the following options with the `generate-data` command. -Custom Python module must include generate_synthetic_data(providers, **custom_lists) -{: .important} +| Option | Required/Optional | Description | +|---|---|---| +| `--index-name` or `-n` | Required | The name of the data corpora you want to generate. | +| `--output-path` or `-p` | Required | The path where you want the data to be generated. | +| `--total-size` or `-s` | Required | The total amount of data you want to generate, in GB. | +| `--index-mappings` or `-i` | Conditional (Either `--index-mappings` or `--custom-module` must be specified)| The path to the OpenSearch index mappings you want to use. Required when using mapping-based generation. Cannot be used with `--custom-module`. | +| `--custom-module` or `-m` | Conditional (Either `--index-mappings` or `--custom-module` must be specified)| The path to the Python module that includes your custom logic. Required when using custom logic generation. Cannot be used with `--index-mappings`. The Python module must include the `generate_synthetic_document(providers, **custom_lists)` function. | +| `--custom-config` or `-c` | Optional | The path to a YAML configuration file defining rules for how you want data to be generated. | +| `--test-document` or `-t` | Optional | When this flag is present, OSB generates a single synthetic document and outputs it to the console. This provides you with a way to verify that the example document generated aligns with your expectations. When the flag is not present, the entire data corpora will be generated. | -* `--index-name` or `-n` (required): Name of data corpora generated -* `--output-path` or `-p` (required): Path where data should be generated in -* `--total-size` or `-s` (required): Total amount of data that should be generated in GB -* `--custom-config` or `-c` (optional): Path to YAML config defining rules for how data should be generated. This is further explored in the subsequent section -* `--test-document` or `-t` (optional): When flag is present, OSB generates a single synthetic document and outputs to the console. 
Provides users a way to verify that the example document generated is aligned with expectations. When the flag is not present, the entire data corpora will be generated +## Example output -### Example Output +The following is an example output when generating synthetic data: ``` ____ _____ __ ____ __ __ @@ -49,7 +69,7 @@ Custom Python module must include generate_synthetic_data(providers, **custom_li [NOTE] ✨ Dashboard link to monitor processes and task streams: [http://127.0.0.1:8787/status] [NOTE] ✨ For users who are running generation on a virtual machine, consider SSH port forwarding (tunneling) to localhost to view dashboard. [NOTE] Example of localhost command for SSH port forwarding (tunneling) from an AWS EC2 instance: -ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ +ssh -i -N -L localhost:8787:localhost:8787 ec2-user@ Total GB to generate: [1] Average document size in bytes: [412] @@ -63,4 +83,9 @@ Generated 24271844660 docs in 12000 seconds. Total dataset size is 100.21GB. ----------------------------------- [INFO] ✅ SUCCESS (took 272 seconds) ----------------------------------- -``` \ No newline at end of file +``` + +## Related documentation + +- [Generating data using index mappings]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/mapping-sdg/) +- [Generating data using custom logic]({{site.url}}{{site.baseurl}}/benchmark/features/synthetic-data-generation/custom-logic-sdg/) \ No newline at end of file diff --git a/_benchmark/reference/commands/index.md b/_benchmark/reference/commands/index.md index 12276d17132..88011d6f1a4 100644 --- a/_benchmark/reference/commands/index.md +++ b/_benchmark/reference/commands/index.md @@ -3,19 +3,23 @@ layout: default title: Command reference nav_order: 50 has_children: true +has_toc: false parent: OpenSearch Benchmark Reference redirect_from: /benchmark/commands/index/ --- # OpenSearch Benchmark command reference -This section provides a list of commands supported by OpenSearch Benchmark, including commonly used commands such as `execute-test` and `list`. +OpenSearch Benchmark supports the following commands: -- [compare]({{site.url}}{{site.baseurl}}/benchmark/commands/compare/) -- [download]({{site.url}}{{site.baseurl}}/benchmark/commands/download/) -- [execute-test]({{site.url}}{{site.baseurl}}/benchmark/commands/execute-test/) -- [info]({{site.url}}{{site.baseurl}}/benchmark/commands/info/) -- [list]({{site.url}}{{site.baseurl}}/benchmark/commands/list/) +- [aggregate]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/aggregate/) +- [compare]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/compare/) +- [download]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/download/) +- [execute-test]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/execute-test/) +- [generate-data]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/generate-data/) +- [info]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/info/) +- [list]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/list/) +- [redline-test]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/redline-test/) ## List of common options @@ -25,3 +29,4 @@ All OpenSearch Benchmark commands support the following options: - `--quiet`: Hides as much of the results output as possible. Default is `false`. - `--offline`: Indicates whether OpenSearch Benchmark has a connection to the internet. Default is `false`. 
+For more information about command options, see [Command flags]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/command-flags/). \ No newline at end of file diff --git a/_benchmark/reference/commands/info.md b/_benchmark/reference/commands/info.md index 3bfefabe99e..cfd3be0b937 100644 --- a/_benchmark/reference/commands/info.md +++ b/_benchmark/reference/commands/info.md @@ -1,7 +1,7 @@ --- layout: default title: info -nav_order: 75 +nav_order: 70 parent: Command reference grand_parent: OpenSearch Benchmark Reference redirect_from: From 6dc22087718b289ad24774c4dba0157924889c94 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Fri, 21 Nov 2025 16:58:52 -0500 Subject: [PATCH 6/9] Update _benchmark/features/synthetic-data-generation/custom-logic-sdg.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- .../features/synthetic-data-generation/custom-logic-sdg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md index a1243695c65..96292aa758e 100644 --- a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md +++ b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md @@ -19,7 +19,7 @@ Every custom module provided to OpenSearch Benchmark must define the `generate_s | Parameter | Required/Optional | Description | |---|---|---| | `providers` | Required | A dictionary containing data generation tools. Available providers are `generic` (Mimesis [Generic provider](https://mimesis.name/master/api.html#generic-providers)) and `random` (Mimesis [Random class](https://mimesis.name/master/random_and_seed.html)). To add custom providers, see [Advanced configuration](#advanced-configuration). | -| `**custom_lists` | Optional | Keyword arguments containing predefined lists of values that you can use in your data generation logic. These are defined in your YAML configuration file under `custom_lists` and allow you to separate data values from your Python code. For example, if you define `dog_names: [Buddy, Max, Luna]` in YAML, you can access it as `custom_lists['dog_names']` in your function. This makes it easy to modify data values without changing your Python code. | +| `custom_lists` | Optional | Keyword arguments containing predefined lists of values that you can use in your data generation logic. These are defined in your YAML configuration file under `custom_lists` and allow you to separate data values from your Python code. For example, if you define `dog_names: [Buddy, Max, Luna]` in YAML, you can access it as `custom_lists['dog_names']` in your function. This makes it easy to modify data values without changing your Python code. 
| ### Basic function template From 44094818b514f5cb05431d48a10551754d2e6cc2 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Fri, 21 Nov 2025 17:15:07 -0500 Subject: [PATCH 7/9] Fix links Signed-off-by: Fanit Kolchina --- _benchmark/features/synthetic-data-generation/mapping-sdg.md | 4 ++-- _benchmark/quickstart.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/_benchmark/features/synthetic-data-generation/mapping-sdg.md b/_benchmark/features/synthetic-data-generation/mapping-sdg.md index fdbd894eac2..cea19e0f2cf 100644 --- a/_benchmark/features/synthetic-data-generation/mapping-sdg.md +++ b/_benchmark/features/synthetic-data-generation/mapping-sdg.md @@ -413,8 +413,8 @@ The following generators and their parameters are available for use in `field_ov | `generate_date` | `format`, `start_date`, `end_date` | | `generate_ip` | N/A| | `generate_geo_point` | N/A| -| `generate_knn_vector` | `dimension`, `sample_vectors`, `noise_factor`, `distribution_type`, `normalize`. See [Advanced techniques](/benchmark/features/synthetic-data-generation/advanced/). | -| `generate_sparse_vector` | `num_tokens`, `min_weight`, `max_weight`, `token_id_start`, `token_id_step`. See [Advanced techniques](/benchmark/features/synthetic-data-generation/advanced/). | +| `generate_knn_vector` | `dimension`, `sample_vectors`, `noise_factor`, `distribution_type`, `normalize`. See [Generating vectors](/benchmark/features/synthetic-data-generation/generating-vectors/). | +| `generate_sparse_vector` | `num_tokens`, `min_weight`, `max_weight`, `token_id_start`, `token_id_step`. See [Generating vectors](/benchmark/features/synthetic-data-generation/generating-vectors/). | ### Using the configuration diff --git a/_benchmark/quickstart.md b/_benchmark/quickstart.md index 928aae59805..5b0505a1a10 100644 --- a/_benchmark/quickstart.md +++ b/_benchmark/quickstart.md @@ -114,9 +114,9 @@ You can now run your first benchmark. The following benchmark uses the [percolat ### Understanding workload command flags -Benchmarks are run using the [`run`]({{site.url}}{{site.baseurl}}/benchmark/commands/run/) command with the following command flags: +Benchmarks are run using the [`run`]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/run/) command with the following command flags: -For additional `run` command flags, see the [run]({{site.url}}{{site.baseurl}}/benchmark/commands/run/) reference. Some commonly used options are `--workload-params`, `--exclude-tasks`, and `--include-tasks`. +For additional `run` command flags, see the [run]({{site.url}}{{site.baseurl}}/benchmark/reference/commands/run/) reference. Some commonly used options are `--workload-params`, `--exclude-tasks`, and `--include-tasks`. {: .tip} * `--pipeline=benchmark-only` : Informs OSB that users wants to provide their own OpenSearch cluster. 
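To make the preceding flags concrete, the following is a representative `run` invocation against an existing cluster, using the percolator workload from this quickstart. The endpoint is a placeholder, and secured clusters typically require additional client options that are not shown here:

```shell
osb run --pipeline=benchmark-only --workload=percolator --target-hosts=localhost:9200
```
{% include copy.html %}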
From 994f74cf3d7b2538560ddf44f0668b9ddf2c6b0f Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Tue, 25 Nov 2025 06:40:25 -0500 Subject: [PATCH 8/9] Update _benchmark/features/synthetic-data-generation/custom-logic-sdg.md Signed-off-by: Nathan Bower --- .../features/synthetic-data-generation/custom-logic-sdg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md index 96292aa758e..c595ed6f8dc 100644 --- a/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md +++ b/_benchmark/features/synthetic-data-generation/custom-logic-sdg.md @@ -181,7 +181,7 @@ For a complete list of available parameters and their descriptions, see the [`ge ## Example output -The following is an example output of generating 100 GB of data: +The following is an example output when generating 100 GB of data: ``` ____ _____ __ ____ __ __ From 7843f90664dba3586644e8e4da269e250fefa1e5 Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Tue, 25 Nov 2025 07:23:37 -0500 Subject: [PATCH 9/9] Apply suggestions from code review Signed-off-by: Nathan Bower --- .../generating-vectors.md | 45 +++++++++---------- .../synthetic-data-generation/mapping-sdg.md | 2 +- .../reference/commands/generate-data.md | 4 +- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/_benchmark/features/synthetic-data-generation/generating-vectors.md b/_benchmark/features/synthetic-data-generation/generating-vectors.md index 0165e99d5e4..35ef2d2d3e3 100644 --- a/_benchmark/features/synthetic-data-generation/generating-vectors.md +++ b/_benchmark/features/synthetic-data-generation/generating-vectors.md @@ -24,7 +24,7 @@ Example embedding for the word "dog": ## Sparse vectors -Sparse vectors (represented by the [`sparse_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/sparse-vector/) field type in OpenSearch) are vectors in which most dimensions are zero, represented as key-value pairs of non-zero token IDs and their weights. Think of sparse vectors as a dictionary of important words with their importance scores, in which only significant terms are stored. +Sparse vectors (represented by the [`sparse_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/sparse-vector/) field type in OpenSearch) are vectors in which most dimensions are zero, represented as key-value pairs of non-zero token IDs and their weights. Example text: "Korean jindos are hunting dogs that have a reputation for being loyal, independent, and confident." @@ -93,7 +93,7 @@ In each of the generated documents, the `my_embedding` field might appear as fol ### Generating sparse vectors -Generate sparse vectors with default configuration (10 tokens). +Generate sparse vectors with the default configuration (10 tokens). **1. Create a mapping file** (`simple-sparse-mapping.json`): @@ -144,13 +144,13 @@ In each of the generated documents, the `sparse_embedding` field might appear as } ``` -Using only an OpenSearch index mapping, OSB can generate synthetic dense and sparse vectors. However, this produces basic synthetic vectors. For more realistic distributions and clusterings, we recommend configuring the parameters described in the following section. +Using only an OpenSearch index mapping, OpenSearch Benchmark can generate synthetic dense and sparse vectors. However, this produces basic synthetic vectors. 
For more realistic distributions and clusterings, we recommend configuring the parameters described in the following section. --- ## Dense vector (k-NN vector) parameters -The following are parameters that you can add to your synthetic data generation configuration file (YAML Config) to fine-tune generation of dense vectors. These parameters are used in the `field_overrides` section with the `generate_knn_vector` generator. For complete configuration details, see [Advanced configuration](/benchmark/features/synthetic-data-generation/mapping-sdg/#advanced-configuration). +The following are parameters that you can add to your synthetic data generation configuration file (YAML configuration) to fine-tune the generation of dense vectors. These parameters are used in the `field_overrides` section with the `generate_knn_vector` generator. For complete configuration details, see [Advanced configuration](/benchmark/features/synthetic-data-generation/mapping-sdg/#advanced-configuration). #### dimension @@ -168,14 +168,14 @@ This parameter specifies the number of dimensions in the vector. Optional. The following table shows common dimension values and their typical use cases. -| Dimension | Use Case | Example Models | +| Dimension | Use case | Example models | |-----------|----------|----------------| | 128 | Lightweight, custom models | Custom embeddings, fast search | | 384 | General purpose | sentence-transformers/all-MiniLM-L6-v2 | | 768 | Standard NLP | BERT-Base, DistilBERT, MPNet | -| 1024 | High quality NLP | BERT-Large | -| 1536 | OpenAI standard | text-embedding-ada-002, text-embedding-3-small | -| 3072 | OpenAI premium | text-embedding-3-large | +| 1,024 | High-quality NLP | BERT-Large | +| 1,536 | OpenAI standard | text-embedding-ada-002, text-embedding-3-small | +| 3,072 | OpenAI premium | text-embedding-3-large | **Example**: @@ -194,9 +194,9 @@ field_overrides: #### sample_vectors -This parameter provides base vectors to which the generator adds noise, creating realistic variations and clusters. Optional, but highly recommended. +This parameter provides base vectors to which the generator adds noise, creating realistic variations and clusters. Optional but highly recommended. -Without sample vectors, OSB's synthetic data generator generates random uniform vectors across the entire space, which is unrealistic and offers poor search quality. Providing sample vectors allows OSB's synthetic data generator to create more realistic and natural clusters. +Without sample vectors, OpenSearch Benchmark's synthetic data generator generates random uniform vectors across the entire space, which is unrealistic and offers poor search quality. Providing sample vectors allows OpenSearch Benchmark's synthetic data generator to create more realistic and natural clusters. After you prepare a list of sample vectors, insert them as a **list of lists**, in which each inner list is a complete vector. The following example provides sample vectors in the synthetic data generation configuration file: @@ -221,7 +221,7 @@ Use the following guidelines to determine the number of vectors that you provide **How to obtain sample vectors**: -**Option 1: Using actual embeddings from your domain (Recommended)**: Use actual embeddings from your domain, representing different semantic clusters. Random generation without sample vectors produces unrealistic data unsuitable for search quality testing. 
+**Option 1 (Recommended): Using actual embeddings from your domain**: Use actual embeddings from your domain, representing different semantic clusters. Random generation without sample vectors produces unrealistic data unsuitable for search quality testing. **Option 2: Using sentence-transformers** in Python: @@ -294,7 +294,7 @@ Optional. Default is `0.1`. The following table shows how different `noise_factor` values impact the generated data. -| `noise_factor` | Effect | Use Case | +| `noise_factor` | Effect | Use case | |--------------|--------|----------| | 0.01--0.05 | Tight clustering, minimal variation | Duplicate detection, near-exact matches | | 0.1--0.2 | Natural variation within topic | General semantic search, recommendations | @@ -341,7 +341,6 @@ The following table shows when to set `normalize` to `true` based on your index * **sentence-transformers**: Many models output normalized vectors. Review the model documentation; in most cases, `normalize` should be set to `true`. * **BERT (raw output)**: Raw BERT embeddings are not normalized. Set `normalize` to `false` and rely on the index configuration to perform normalization if needed. - **Configuration**: ```yaml @@ -368,22 +367,22 @@ field_overrides: --- -## Sparse vectors parameters +## Sparse vector parameters -The following are parameters that you can add to your synthetic data generation configuration file to finetune how sparse vectors are generated. These parameters are used in the `field_overrides` section with the `generate_sparse_vector` generator. For complete configuration details, see [Advanced configuration](/benchmark/features/synthetic-data-generation/mapping-sdg/#advanced-configuration). +The following are parameters that you can add to your synthetic data generation configuration file to fine-tune how sparse vectors are generated. These parameters are used in the `field_overrides` section with the `generate_sparse_vector` generator. For complete configuration details, see [Advanced configuration](/benchmark/features/synthetic-data-generation/mapping-sdg/#advanced-configuration). #### num_tokens This parameter specifies the number of token-weight pairs to generate per vector. Optional. Default is `10`. **Impact**: -- **Low (5--10)**: Very sparse, fast search, may miss some relevant documents +- **Low (5--10)**: Very sparse, fast search; may miss some relevant documents - **Medium (10--25)**: Balanced performance and recall -- **High (50--100)**: Dense sparse representation, comprehensive but slower +- **High (50--100)**: Dense sparse representation; comprehensive but slower The following table shows typical `num_tokens` values for different models and approaches. -| Model/Approach | Typical `num_tokens` | Use Case | +| Model/Approach | Typical `num_tokens` | Use case | |----------------|-------------------|----------| | SPLADE v1 | 10--15 | Standard sparse neural search | | SPLADE v2 | 15--25 | Improved recall | @@ -448,9 +447,9 @@ field_overrides: {% include copy.html %} **Constraints**: -- `min_weight` must be > `0.0` (OpenSearch requires positive weights) -- `max_weight` must be > `min_weight` -- Weights are rounded to `4` decimal places +- `min_weight` must be > `0.0` (OpenSearch requires positive weights). +- `max_weight` must be > `min_weight`. +- Weights are rounded to `4` decimal places. **Best practice**: Keep `min_weight` small (`0.01--0.05`) to allow nuanced weighting. @@ -484,7 +483,7 @@ The following table shows different token ID configurations and their use cases. 
| Configuration | `token_id_start` | `token_id_step` | Use case | | --------------------------- | ----------------- | --------------- | ------------------------------------------------------------------ | | Default testing | `1000` | `100` | Helps visually distinguish generated token ranges. | -| Realistic vocabulary | `0` | `1` | Aligns token IDs with a real model's vocabulary indices. | +| Realistic vocabulary | `0` | `1` | Aligns token IDs with a real model's vocabulary indexes. | | Multi-field generation | `1000`, `5000`, `10000` | `1` | Keeps token ID ranges separate across different fields. | | Large vocabulary simulation | `0` | `1` | Supports generation scenarios with vocabularies of `50,000`+ tokens. | @@ -540,4 +539,4 @@ The following table outlines when to use simple generation versus a more complex | Search quality testing | Complex generation | Requires meaningful vector clusters to evaluate recall and precision accurately. | -**Recommendation**: For search quality testing or algorithm comparisons, use complex configuration with sample vectors to ensure realistic data distributions. +**Recommendation**: For search quality testing or algorithm comparisons, use a complex configuration with sample vectors to ensure realistic data distributions. diff --git a/_benchmark/features/synthetic-data-generation/mapping-sdg.md b/_benchmark/features/synthetic-data-generation/mapping-sdg.md index cea19e0f2cf..9a333569d57 100644 --- a/_benchmark/features/synthetic-data-generation/mapping-sdg.md +++ b/_benchmark/features/synthetic-data-generation/mapping-sdg.md @@ -292,7 +292,7 @@ For a complete list of available parameters and their descriptions, see the [`ge ## Example output -The following is an example output of generating 100 GB of data: +The following is an example output when generating 100 GB of data: ``` ____ _____ __ ____ __ __ diff --git a/_benchmark/reference/commands/generate-data.md b/_benchmark/reference/commands/generate-data.md index 1ee74252b69..764a8e62f81 100644 --- a/_benchmark/reference/commands/generate-data.md +++ b/_benchmark/reference/commands/generate-data.md @@ -33,7 +33,7 @@ Choose one of the following approaches: osb generate-data --index-name my-index --index-mappings mapping.json --output-path ./data --total-size 1 ``` -**Method 2: Using custom Python module**: +**Method 2: Using a custom Python module**: ```shell osb generate-data --index-name my-index --custom-module custom.py --output-path ./data --total-size 1 @@ -51,7 +51,7 @@ Use the following options with the `generate-data` command. | `--index-mappings` or `-i` | Conditional (Either `--index-mappings` or `--custom-module` must be specified)| The path to the OpenSearch index mappings you want to use. Required when using mapping-based generation. Cannot be used with `--custom-module`. | | `--custom-module` or `-m` | Conditional (Either `--index-mappings` or `--custom-module` must be specified)| The path to the Python module that includes your custom logic. Required when using custom logic generation. Cannot be used with `--index-mappings`. The Python module must include the `generate_synthetic_document(providers, **custom_lists)` function. | | `--custom-config` or `-c` | Optional | The path to a YAML configuration file defining rules for how you want data to be generated. | -| `--test-document` or `-t` | Optional | When this flag is present, OSB generates a single synthetic document and outputs it to the console. 
This provides you with a way to verify that the generated example document aligns with your expectations. When the flag is not present, the entire data corpus is generated. |

## Example output