# Benchmarking Script for Large Language Models

This script provides a unified approach to estimating performance for Large Language Models (LLMs). It leverages pipelines provided by Optimum-Intel and allows performance estimation for PyTorch and OpenVINO models with nearly identical code and pre-collected models.

### 1. Prepare Python Virtual Environment for LLM Benchmarking

```bash
python3 -m venv ov-llm-bench-env
source ov-llm-bench-env/bin/activate
pip install --upgrade pip

git clone https://github.com/openvinotoolkit/openvino.genai.git
cd openvino.genai/llm_bench/python/
pip install -r requirements.txt
```

> Note:
> For existing Python environments, run the following command to ensure that all dependencies are installed with the latest versions:
> `pip install -U --upgrade-strategy eager -r requirements.txt`
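
Optionally, you can confirm the environment is usable before moving on. This quick check is not part of the original setup steps; it simply verifies that the installed packages import correctly:

```bash
# Optional sanity check: print the installed OpenVINO version and confirm optimum-intel imports
python -c "from openvino.runtime import get_version; print(get_version())"
python -c "import optimum.intel; print('optimum-intel is installed')"
```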

#### (Optional) Hugging Face Login

Log in to Hugging Face if you want to use non-public models:

```bash
huggingface-cli login
```
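
For non-interactive environments (for example, CI), the token can be passed directly instead of entering it at the prompt; `HF_TOKEN` below is an assumed environment variable holding your access token:

```bash
# Non-interactive login using a pre-generated access token (HF_TOKEN is assumed to be set)
huggingface-cli login --token "$HF_TOKEN"
```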

### 2. Convert Model to OpenVINO IR Format

The `optimum-cli` tool simplifies converting Hugging Face models to OpenVINO IR format.
- Detailed documentation can be found in the [Optimum-Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export).
- To learn more about weight compression, see the [NNCF Weight Compression Guide](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html).
- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html).

**Usage:**

```bash
optimum-cli export openvino --model <MODEL_ID> --weight-format <PRECISION> <OUTPUT_DIR>

optimum-cli export openvino -h # For detailed information
```

* `--model <MODEL_ID>`: model ID for downloading from the [Hugging Face Hub](https://huggingface.co/models), or a path to a local directory containing the PyTorch model.
* `--weight-format <PRECISION>`: precision for model conversion. Available options: `fp32, fp16, int8, int4, mxfp4`.
* `<OUTPUT_DIR>`: output directory for saving the generated OpenVINO model.

**NOTE:**
- Models larger than 1 billion parameters are exported to the OpenVINO format with 8-bit weights by default. You can disable this with `--weight-format fp32`.

**Example:**
```bash
optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 models/llama-2-7b-chat
```
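
Weight compression can be requested the same way. A sketch using the `int4` option listed above; the output directory name here is chosen for illustration only:

```bash
# 4-bit weight compression; the output directory name is illustrative
optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format int4 models/llama-2-7b-chat-int4
```
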
**Resulting file structure (for the `fp16` export above):**

```console
models
└── llama-2-7b-chat
    ├── config.json
    ├── generation_config.json
    ├── openvino_detokenizer.bin
    ├── openvino_detokenizer.xml
    ├── openvino_model.bin
    ├── openvino_model.xml
    ├── openvino_tokenizer.bin
    ├── openvino_tokenizer.xml
    ├── special_tokens_map.json
    ├── tokenizer_config.json
    ├── tokenizer.json
    └── tokenizer.model
```

### 3. Benchmark LLM Model

To benchmark the performance of the LLM, use the following command:

```bash
python benchmark.py -m <model> -d <device> -r <report_csv> -f <framework> -p <prompt text> -n <num_iters>
# e.g.
python benchmark.py -m models/llama-2-7b-chat/ -n 2
python benchmark.py -m models/llama-2-7b-chat/ -p "What is openvino?" -n 2
python benchmark.py -m models/llama-2-7b-chat/ -pf prompts/llama-2-7b-chat_l.jsonl -n 2
```

**Parameters:**
- `-m`: Path to the model.
- `-d`: Inference device (default: CPU).
- `-r`: Path to the CSV report.
- `-f`: Framework (default: ov).
- `-p`: Interactive prompt text.
- `-pf`: Path to a JSONL file containing prompts.
- `-n`: Number of iterations (default: 0; the first iteration is excluded).
- `-ic`: Limit on the number of output tokens (default: 512) for text generation and code generation models.

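For instance, several of these options can be combined in a single run; the report filename below is arbitrary:

```bash
# Run 2 iterations on CPU, cap generation at 256 output tokens, and write a CSV report
python benchmark.py -m models/llama-2-7b-chat/ -d CPU -n 2 -ic 256 -r report.csv
```
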
**Additional options:**
```bash
python ./benchmark.py -h # for more information
```

#### Benchmarking the Original PyTorch Model

To benchmark the original PyTorch model, first download the model locally, then run the benchmark with PyTorch as the framework by passing `-f pt`:

```bash
# Download PyTorch Model
huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch
# Benchmark with PyTorch Framework
python benchmark.py -m models/llama-2-7b-chat/pytorch -n 2 -f pt
```

> **Note:** If needed, you can install a specific OpenVINO version using pip:
> ```bash
> # e.g.
> pip install openvino==2024.4.0
> # Optionally, install the OpenVINO nightly package instead.
> # OpenVINO nightly is pre-release software and has not undergone full release validation or qualification.
> pip uninstall openvino
> pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
> ```

### 4. Benchmark LLM with `torch.compile()`

The `--torch_compile_backend` option enables you to use `torch.compile()` to accelerate PyTorch models by compiling them into optimized kernels with a specified backend.

Before benchmarking, download the original PyTorch model locally:

```bash
huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch
```

To run the benchmarking script with `torch.compile()`, use the `--torch_compile_backend` option to specify the backend. You can choose between `pytorch` and `openvino` (default). Example:

```bash
python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --torch_compile_backend openvino
```
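
For comparison, the same run can target the stock `pytorch` backend instead of the default `openvino` backend:

```bash
# Same benchmark, compiled with the pytorch backend instead
python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --torch_compile_backend pytorch
```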

> **Note:** To use `torch.compile()` with CUDA GPUs, you need to install the nightly version of PyTorch:
>
> ```bash
> pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
> ```

### 5. Running on 2-Socket Platforms

The benchmarking script sets `openvino.properties.streams.num(1)` by default. For multi-socket platforms, use `numactl` on Linux or the `--load_config` option to modify this behavior.

| OpenVINO Version | Behavior |
|:-----------------|:-----------------------------------------------------------------|
| Before 2024.0.0  | `streams.num(1)`<br>executes on both sockets.                     |
| 2024.0.0         | `streams.num(1)`<br>executes on the same socket as the application is running on. |

For example, passing `--load_config config.json` with the following content results in `streams.num(1)` and execution on both sockets:
```json
{
    "INFERENCE_NUM_THREADS": <NUMBER>
}
```
`<NUMBER>` is the total number of physical cores across both sockets.
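
A minimal sketch of both approaches on Linux; the NUMA node IDs are illustrative, and `config.json` is the file shown above:

```bash
# Option A: pin the benchmark to a single socket with numactl
numactl --cpunodebind=0 --membind=0 python benchmark.py -m models/llama-2-7b-chat/ -d CPU -n 2

# Option B: let inference span both sockets via the thread-count config shown above
python benchmark.py -m models/llama-2-7b-chat/ -d CPU -n 2 --load_config config.json
```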

### 6. Execution on CPU Device

OpenVINO is built with the [oneTBB](https://github.com/oneapi-src/oneTBB/) threading library by default, while Torch uses [OpenMP](https://www.openmp.org/). Both threading libraries use ['busy-wait spin'](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fSPINCOUNT.html) by default. When running the LLM pipeline on a CPU device, this causes threading overhead when switching between inference with OpenVINO (oneTBB) and postprocessing (for example, greedy search or beam search) with Torch (OpenMP).

**Alternative solutions:**
1. Use the `--genai` option, which uses the OpenVINO GenAI API instead of the Optimum-Intel API. In this case, postprocessing is executed with the OpenVINO GenAI API.
2. Without the `--genai` option (i.e., using the Optimum-Intel API), set the environment variable [OMP_WAIT_POLICY](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fWAIT_005fPOLICY.html) to `PASSIVE` to disable the OpenMP 'busy-wait'; `benchmark.py` will also limit the number of Torch threads to avoid using CPU cores kept in 'busy-wait' by OpenVINO inference.
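
As an illustration of the two options above (the model path is reused from the earlier examples):

```bash
# Option 1: benchmark through the OpenVINO GenAI API
python benchmark.py -m models/llama-2-7b-chat/ -d CPU -n 2 --genai

# Option 2: keep the Optimum-Intel path, but stop OpenMP threads from busy-waiting
OMP_WAIT_POLICY=PASSIVE python benchmark.py -m models/llama-2-7b-chat/ -d CPU -n 2
```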

### 7. Additional Resources

- **Error Troubleshooting:** Check [NOTES.md](./doc/NOTES.md) for solutions to known issues.
- **Image Generation Configuration:** Refer to [IMAGE_GEN.md](./doc/IMAGE_GEN.md) for setting parameters for image generation models.

> [!IMPORTANT]
> The LLM bench code has been moved to the [tools](../../tools/llm_bench/) directory. Please use the tool from its new location.