* Updated source to support MPS and CUDA
* More changes.
* More changes.
* Current point in time.
* Missed a change.
* Updated files and README
* Committing changes before applying stash
* .gitignore.. whatever.
* Update the requirements.txt for llama.cpp and gguf
* Update to requirements for ctransformers
* Update requirements.txt
* Bumped llama-cpp-python
* Handle GGML files for model loading
* Checkpoint
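The commits about handling GGML files and updating the requirements for llama.cpp and gguf imply that the loader has to tell legacy GGML files apart from the newer GGUF format. Below is a minimal, hypothetical sketch of such a check based on the files' magic bytes; the helper name and the example path are illustrative and not taken from this repository.

```python
from pathlib import Path

# Hypothetical helper; not the repository's actual loader code.
# GGUF files start with the ASCII magic "GGUF". Older llama.cpp GGML-family
# files start with a little-endian uint32 magic ("ggml", "ggmf", or "ggjt"),
# which appears byte-reversed on disk.
GGUF_MAGIC = b"GGUF"
LEGACY_GGML_MAGICS = {b"lmgg", b"fmgg", b"tjgg"}

def model_file_format(path: str) -> str:
    """Classify a model file as 'gguf', 'ggml', or 'unknown' from its first four bytes."""
    with Path(path).open("rb") as f:
        magic = f.read(4)
    if magic == GGUF_MAGIC:
        return "gguf"
    if magic in LEGACY_GGML_MAGICS:
        return "ggml"
    return "unknown"

if __name__ == "__main__":
    # Placeholder path for illustration only.
    fmt = model_file_format("models/llama-2-7b.ggmlv3.q4_0.bin")
    if fmt == "ggml":
        print("Legacy GGML file: convert it to GGUF before loading with llama-cpp-python 0.1.81.")
```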
README.md: 36 additions & 22 deletions
@@ -1,6 +1,26 @@
-# OLD VERSION - 1.3.1 Patched for macOS and Apple Silicon
+# MERGED 1.5 Version. macOS TEST VERSION

-Patched and working with macOS and Apple Silicon M1/M2 GPU now.
+This is a development version; I have not yet added many of the changes I had planned. Please use it at your own risk, as there may be bugs not yet found.
+
+Items added in this version:
+* "Stop Server" under the Session tab. Use with caution in multi-user mode; this will probably be disabled for multi-user setups, but it offers a cleaner shutdown than just killing the process on the server.
+* Added a Python class for handling diverse GPU/compute devices such as CUDA, CPU, or MPS. The code now uses a single `torch.device` once it is set initially, and falls back to CPU. (A minimal sketch of this idea follows this hunk.)
+
+Items working and tested on macOS:
+* More support for Apple Silicon M1/M2 processors.
+* Working with the new llama-cpp-python 0.1.81.
+* Works with Llama 2 models.
+* GGML models will need conversion to GGUF format if using llama-cpp-python 0.1.81.
+* Earlier versions of llama-cpp-python still work.
+* Have not concluded testing of library dependencies; the build instructions for oobabooga-macOS will be updated once that is done.
+* Still mainly supporting GGML, and now GGUF (GG-Universal Format) files. You will have to convert your GGML files to GGUF format.
+
+Removed from this version:
+* Tried to continue what was already started in removing FlexGen from the repo.
+* Removed Docker - if someone wants to help maintain it for macOS, let me know.
+* Slowly removing information on CUDA, as it is not relevant to macOS.
+
+**Updated Installation Instructions** for libraries in the [oobabooga-macOS Quickstart](https://github.com/unixwzrd/oobabooga-macOS/blob/main/macOS_Apple_Silicon_QuickStart.md) and the longer [Building Apple Silicon Support](https://github.com/unixwzrd/oobabooga-macOS/blob/main/macOS_Apple_Silicon_QuickStart.md)

GGML support is in this release, and has not been extensively tested. From the look of upstream commits, there are some changes which must be made before this will work with Llama2 models.
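The GPU/compute device class referenced in the hunk above does not appear in this diff. The following is a minimal sketch, assuming a hypothetical class name and interface rather than the fork's actual code, of how a CUDA/MPS/CPU choice with a CPU fallback can be pinned to a single `torch.device`:

```python
from typing import Optional

import torch


class DeviceManager:
    """Hypothetical sketch of a compute-device selector; not the fork's actual class."""

    def __init__(self, preferred: Optional[str] = None):
        # Resolve the device once and reuse it everywhere afterwards.
        self.device = self._resolve(preferred)

    @staticmethod
    def _resolve(preferred: Optional[str]) -> torch.device:
        # Honor an explicit request only if that backend is actually available.
        if preferred == "cuda" and torch.cuda.is_available():
            return torch.device("cuda")
        if preferred == "mps" and torch.backends.mps.is_available():
            return torch.device("mps")
        # Otherwise auto-detect: CUDA first, then Apple Silicon MPS, then CPU.
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")


# Usage: pick the device once, then move models/tensors with .to(device).
dm = DeviceManager()
x = torch.ones(4, device=dm.device)
print(f"Running on {dm.device}")
```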
@@ -13,15 +33,15 @@ Otherwise, use these instructions I have on putting together the macOS Python en

I will be updating this README file with new information specifically regarding macOS and Apple Silicon.

-I would like to work closely with the oobaboogs team and try to implement simkilar solutions so the web UI can have a similar look and feel.
+I would like to work closely with the oobabooga team and try to implement similar solutions so the web UI can have a similar look and feel.

Maintaining and improving support for macOS and Apple Silicon in this project has required significant research, debugging, and development effort. If you find my contributions helpful and want to show your appreciation, you can Buy Me a Coffee, sponsor this project, or consider me for job opportunities.

While the focus of this branch is to enhance macOS and Apple Silicon support, I aim to maintain compatibility with Linux and POSIX operating systems. Contributions and feedback related to Linux compatibility are always welcome.

Anyone who would like to assist with supporting Apple Silicon, let me know. There is much to do and I can only do so much by myself.

-- [OLD VERSION - 1.3.1 Patched for macOS and Apple Silicon](#old-version---131-patched-for-macos-and-apple-silicon)
+- [MERGED 1.5 Version. macOS TEST VERSION](#merged-15-version--macos-test-version)
- [Features](#features)
- [Installation](#installation)
- [Downloading models](#downloading-models)
@@ -36,9 +56,9 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther
- [AutoGPTQ](#autogptq)
- [ExLlama](#exllama)
- [GPTQ-for-LLaMa](#gptq-for-llama)
-- [FlexGen](#flexgen)
- [DeepSpeed](#deepspeed)
- [RWKV](#rwkv)
+- [RoPE (for llama.cpp and ExLlama only)](#rope-for-llamacpp-and-exllama-only)
- [Gradio](#gradio)
- [API](#api)
- [Multimodal](#multimodal)
@@ -47,7 +67,6 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther
- [Community](#community)
- [Credits](#credits)

-
## Features

* 3 interface modes: default, notebook, and chat
@@ -56,7 +75,7 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther
* LoRA: load and unload LoRAs on the fly, load multiple LoRAs at the same time, train a new LoRA
* Precise instruction templates for chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
* [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal)
-* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon M1/M2 processors**
+* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon GPU**
* CPU mode for transformers models
* [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md)
* [Extensions](docs/Extensions.md)
@@ -165,7 +184,7 @@ Optionally, you can use the following command-line flags:
-|`--loader LOADER`| Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen |
+|`--loader LOADER`| Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |

#### Accelerate/transformers
@@ -203,8 +222,8 @@ Optionally, you can use the following command-line flags:
|`--n_batch`| Maximum number of prompt tokens to batch together when calling llama_eval. |
|`--no-mmap`| Prevent mmap from being used. |
|`--mlock`| Force the system to keep the model in RAM. |
-|`--cache-capacity CACHE_CAPACITY`| Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-|`--n-gpu-layers N_GPU_LAYERS`| Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
+|`--cache-capacity CACHE_CAPACITY`| Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. Does not apply to the Apple Silicon GPU, since it uses unified memory. |
+|`--n-gpu-layers N_GPU_LAYERS`| Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with Apple Silicon GPU support for BLAS and llama.cpp using Metal. Load the model and look for **llama_model_load_internal: n_layer** in STDERR; this shows the number of layers in the model. Set this value to that number, or possibly n + 2. This setting is very sensitive now and can overrun your data area or tensor cache, causing a segmentation fault. |
|`--n_ctx N_CTX`| Size of the prompt context. |
|`--llama_cpp_seed SEED`| Seed for llama-cpp models. Default 0 (random). |
|`--n_gqa N_GQA`| grouped-query attention. Must be 8 for llama2 70b. |
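For reference, the same layer-offload setting can be exercised directly through llama-cpp-python. This is a minimal sketch with a placeholder model path and layer count, not configuration taken from this repository; the layer count should not exceed what the model reports.

```python
from llama_cpp import Llama

# Minimal sketch using llama-cpp-python's Llama API; the path and the layer
# count below are placeholders.
llm = Llama(
    model_path="models/llama-2-7b.Q4_0.gguf",  # hypothetical GGUF file
    n_ctx=2048,        # size of the prompt context (--n_ctx)
    n_gpu_layers=32,   # layers offloaded to Metal; compare with the
                       # "llama_model_load_internal: n_layer" value in STDERR
)
out = llm("Q: What is the capital of France? A:", max_tokens=16)
print(out["choices"][0]["text"])
```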
@@ -226,8 +245,6 @@ Optionally, you can use the following command-line flags:
|------------------|-------------|
|`--gpu-split`| Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` |
|`--max_seq_len MAX_SEQ_LEN`| Maximum sequence length. |
-|`--compress_pos_emb COMPRESS_POS_EMB`| Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
-|`--alpha_value ALPHA_VALUE`| Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both. |

#### GPTQ-for-LLaMa
@@ -243,14 +260,6 @@ Optionally, you can use the following command-line flags:
-|`--percent PERCENT [PERCENT ...]`| FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
-|`--compress-weight`| FlexGen: Whether to compress weight (default: False).|
-|`--pin-weight [PIN_WEIGHT]`| FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). |
-

#### DeepSpeed

| Flag | Description |
@@ -266,6 +275,13 @@ Optionally, you can use the following command-line flags:
|`--rwkv-strategy RWKV_STRATEGY`| RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
|`--rwkv-cuda-on`| RWKV: Compile the CUDA kernel for better performance. |

+#### RoPE (for llama.cpp and ExLlama only)
+
+| Flag | Description |
+|------------------|-------------|
+|`--compress_pos_emb COMPRESS_POS_EMB`| Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
+|`--alpha_value ALPHA_VALUE`| Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. |
+

#### Gradio

| Flag | Description |
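As a worked example of the compression factor in the RoPE table above: a model trained with a native 2048-token context and run at `--max_seq_len 4096` would typically use `--compress_pos_emb 2` (4096 / 2048 = 2); if NTK RoPE scaling is preferred instead, `--alpha_value` would be raised on its own and `--compress_pos_emb` left unset.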
@@ -293,8 +309,6 @@ Optionally, you can use the following command-line flags: