
Commit e24a01a

Dev merge (#11)

* Updated source to support MPS and CUDA
* More changes
* More changes
* Current point in time
* Missed a change
* Updated files and README
* Committing changes before applying stash
* .gitignore.. whatever
* Update the requirements.txt for llama.cpp and GGUF
* Update to requirements for ctransformers
* Update requirements.txt: bumped llama-cpp-python
* Handle GGML files for model loading
* Checkpoint
1 parent d71f674 commit e24a01a


60 files changed: +1306 −857 lines

.env.default

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# This is for overriding the setup_*.sh, webui.py, server.py, and other
+# oobabooga environment variables. If they are not set here, they will
+# default to what the one-click-installers use as their defaults.
+#
+export OOBABOOGA_INST="${OOBABOOGA_INST:=${OOBABOOGA_BASE}/oobabooga_install}"
+export OOBABOOGA_BASE="${OOBABOOGA_BASE:=${PWD}}"
+export OOBABOOGA_WEBUI="${OOBABOOGA_BASE}/text-generation-webui"
+export OOBABOOGA_OPTS="--chat"
+
+# This file will be overwritten by updating with a pull if new variables
+# are added to support the environment.
+#
+# The following was generated by the following command line in my login
+# shell environment. I already have miniconda/conda installed.
+#
+# conda info --all | \
+#     grep '^CONDA' | \
+#     awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""}'
+#
+export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}"
+export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen}"
+
+export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda"
+export CONDA_EXE="${CONDA_EXE:=${OOBABOOGA_INST}/installer_files/conda/bin/conda}"
+export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/conda/envs/textgen"
+export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/conda/envs/textgen"
+export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python"
+export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python"
+export CONDA_ROOT="${CONDA_ROOT:=${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda}"
+export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda"

.env.local

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+#!/bin/bash
+# This is for overriding the setup_*.sh, webui.py, server.py, and other
+# oobabooga environment variables. If they are not set here, they will
+# default to what the one-click-installers use as their defaults.
+#
+export OOBABOOGA_INST="oobabooga"
+export OOBABOOGA_BASE="${HOME}/${OOBABOOGA_BASE}"
+export OOBABOOGA_WEBUI="${OOBABOOGA_BASE}/text-generation-webui-macos"
+export OOBABOOGA_OPTS="--chat --verbose "
+
+# This file will be overwritten by updating with a pull if new variables
+# are added to support the environment.
+#
+# export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}"
+# export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen}"
+# export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda"
+# export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/conda/envs/textgen"
+# export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python"
+# export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda"
+
+# The following was generated by the following command line in my login
+# shell environment. I already have miniconda/conda installed.
+#
+# conda info --all | grep '^CONDA' | awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""}'
+#
+export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}"
+export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen.00}"
+export CONDA_EXE="${CONDA_EXE:=/Users/unixwzrd/miniconda3/bin/conda}"
+export CONDA_PREFIX="${CONDA_PREFIX:=/Users/unixwzrd/miniconda3}"
+export CONDA_PROMPT_MODIFIER="${CONDA_PROMPT_MODIFIER:=(base) }"
+export CONDA_PYTHON_EXE="${CONDA_PYTHON_EXE:=/Users/unixwzrd/miniconda3/bin/python}"
+export CONDA_ROOT="${CONDA_ROOT:=/Users/unixwzrd/miniconda3}"
+
+# This is for overriding the setup_*.sh, webui.py, server.py, and other
+# oobabooga environment variables. If they are not set here, they will
+# default to what the one-click-installers use as their defaults.
+#
+# Set the default Conda environment and the environment for the Web GUI
+#
+export CONDA_DEFAULT_ENV="base"
+export CONDA_OOBABOOGA_ENV="textgen"
+
+OOBABOOGA_BASE="oobabooga_macos"
+
+OOBABOOGA_INST="${HOME}/${OOBABOOGA_BASE}"
+
+export CONDA_DEFAULT_ENV="textgen"
+export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda"
+export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/conda/envs/textgen"
+export CONDA_PROMPT_MODIFIER="() "
+export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python"
+export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda"  #!/bin/bash
+
+# This is for overriding the setup_*.sh, webui.py, server.py, and other
+# oobabooga environment variables. If they are not set here, they will
+# default to what the one-click-installers use as their defaults.
+#
+export OOBABOOGA_INST="oobabooga-test-install"
+export OOBABOOGA_BASE="${HOME}/${OOBABOOGA_BASE}"
+export OOBABOOGA_WEBUI="${OOBABOOGA_BASE}/text-generation-webui"
+export OOBABOOGA_OPTS="--chat --verbose "
+
+# This file will be overwritten by updating with a pull if new variables
+# are added to support the environment.
+#
+# export CONDA_DEFAULT_ENV="${CONDA_DEFAULT_ENV:=base}"
+# export CONDA_OOBABOOGA_ENV="${CONDA_OOBABOOGA_DEFAULT_ENV:=textgen}"
+# export CONDA_EXE="${OOBABOOGA_INST}/installer_files/conda/bin/conda"
+# export CONDA_PREFIX="${OOBABOOGA_INST}/installer_files/conda/envs/textgen"
+# export CONDA_PYTHON_EXE="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda/bin/python"
+# export CONDA_ROOT="${OOBABOOGA_INST}/oobabooga_macos/installer_files/conda"
+
+# The following was generated by the following command line in my login
+# shell environment. I already have miniconda/conda installed.
+#
+# conda info --all | \
+#     grep '^CONDA' | \
+#     awk -F': ' '{print "export " $1 "=\"${"$1":="$2"}\""}'
+
+export CONDA_ROOT="/Users/unixwzrd/miniconda3"
+export CONDA_PREFIX="${CONDA_ROOT}"
+# export CONDA_PROMPT_MODIFIER=" (base)"
+export CONDA_PYTHON_EXE="${CONDA_ROOT}/bin/python"
+export CONDA_EXE="${CONDA_DEFAULT_ENV}/bin/conda"
+
.gitignore

Lines changed: 6 additions & 0 deletions
@@ -37,3 +37,9 @@ Thumbs.db
 
 *.swp
 .*.un~
+*.prof
+
+# Coming soon
+.env.local
+oobstart
+oobainst

README.md

Lines changed: 36 additions & 22 deletions
@@ -1,6 +1,26 @@
-# OLD VERSION - 1.3.1 Patched for macOS and Apple Silicon
+# MERGED 1.5 Version. macOS TEST VERSION
 
-Patched and working with macOS and Apple Silicon M1/M2 GPU now.
+This is a development version and does not yet include many of the changes I have planned. Please use at your own risk, as there may be bugs not yet found.
+
+Items added in this version:
+* "Stop Server" under the Session tab. Use with caution in multi-user mode; it will probably be disabled there, but it offers a cleaner shutdown than simply killing the server process.
+* A Python class for handling diverse GPU/compute devices such as CUDA, CPU, or MPS. The code now uses a single "torch device" once it is set initially, and falls back to CPU when no GPU is available (see the sketch after this list).
+
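The device-handling class itself is not shown in this commit view. A minimal sketch of the fallback order described above, assuming PyTorch's standard MPS/CUDA availability checks (the class and method names here are illustrative, not the project's actual code):

```python
from typing import Optional

import torch


class ComputeDevice:
    """Illustrative selector: prefer MPS on Apple Silicon, then CUDA, then CPU."""

    def __init__(self, preferred: Optional[str] = None):
        # Honor an explicit request only if that backend is actually available.
        if preferred == "cuda" and torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif preferred == "mps" and torch.backends.mps.is_available():
            self.device = torch.device("mps")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            # Fall back to CPU when no supported GPU backend is present.
            self.device = torch.device("cpu")

    def place(self, module: torch.nn.Module) -> torch.nn.Module:
        """Move a model onto the selected device once, then reuse it."""
        return module.to(self.device)


# Set the device once and reuse it everywhere afterwards.
dev = ComputeDevice()
print(f"Using torch device: {dev.device}")
```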
+Items working and tested on macOS:
+* More support for Apple Silicon M1/M2 processors.
+* Working with the new llama-cpp-python 0.1.81.
+* Works with LLaMa2 models.
+* GGML models will need conversion to GGUF format if using llama-cpp-python 0.1.81.
+* Earlier versions of llama-cpp-python still work.
+* Testing of library dependencies has not concluded; the build instructions for oobabooga-macOS will be updated once it has.
+* Still mainly supporting GGML, and now GGUF (GG-Universal Format) files. You will have to convert your GGML files to GGUF format (see the example after this list).
+
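As a minimal sketch of loading a converted GGUF model through llama-cpp-python with Metal offload; the model path and parameter values below are placeholders, not files shipped with this repo:

```python
from llama_cpp import Llama

# Placeholder path: point this at your own GGUF file after converting it
# from GGML with the conversion script that ships with llama.cpp.
MODEL_PATH = "models/llama-2-7b-chat.Q4_K_M.gguf"

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,       # prompt context size (matches --n_ctx)
    n_gpu_layers=1,   # a value > 0 enables offload to the Apple Silicon GPU via Metal
)

out = llm("Q: Name the planets in the solar system. A:", max_tokens=64)
print(out["choices"][0]["text"])
```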
+Removed in this version:
+* Continued the work already started on removing FlexGen from the repo.
+* Removed Docker - if someone wants to help maintain it for macOS, let me know.
+* Slowly removing information on CUDA, as it is not relevant to macOS.
+
+**Updated Installation Instructions** for the libraries are in the [oobabooga-macOS Quickstart](https://github.com/unixwzrd/oobabooga-macOS/blob/main/macOS_Apple_Silicon_QuickStart.md) and the longer [Building Apple Silicon Support](https://github.com/unixwzrd/oobabooga-macOS/blob/main/macOS_Apple_Silicon_QuickStart.md)
 
 GGML support is in this release, and has not been extensively tested. From the look of upstream commits, there are some changes which must be made before this will work with Llama2 models.
 
@@ -13,15 +33,15 @@ Otherwise, use these instructions I have on putting together the macOS Python en
 
 I will be updating this README file with new information specifically regarding macOS and Apple Silicon.
 
-I would like to work closely with the oobaboogs team and try to implement simkilar solutions so the web UI can have a similar look and feel.
+I would like to work closely with the oobabooga team and try to implement similar solutions so the web UI can have a similar look and feel.
 
 Maintaining and improving support for macOS and Apple Silicon in this project has required significant research, debugging, and development effort. If you find my contributions helpful and want to show your appreciation, you can Buy Me a Coffee, sponsor this project, or consider me for job opportunities.
 
 While the focus of this branch is to enhance macOS and Apple Silicon support, I aim to maintain compatibility with Linux and POSIX operating systems. Contributions and feedback related to Linux compatibility are always welcome.
 
 Anyone who would like to assist with supporting Apple Silicon, let me know. There is much to do and I can only do so much by myself.
 
-- [OLD VERSION - 1.3.1 Patched for macOS and Apple Silicon](#old-version---131-patched-for-macos-and-apple-silicon)
+- [MERGED 1.5 Version. macOS TEST VERSION](#merged-15-version--macos-test-version)
 - [Features](#features)
 - [Installation](#installation)
 - [Downloading models](#downloading-models)
@@ -36,9 +56,9 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther
 - [AutoGPTQ](#autogptq)
 - [ExLlama](#exllama)
 - [GPTQ-for-LLaMa](#gptq-for-llama)
-- [FlexGen](#flexgen)
 - [DeepSpeed](#deepspeed)
 - [RWKV](#rwkv)
+- [RoPE (for llama.cpp and ExLlama only)](#rope-for-llamacpp-and-exllama-only)
 - [Gradio](#gradio)
 - [API](#api)
 - [Multimodal](#multimodal)
@@ -47,7 +67,6 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther
 - [Community](#community)
 - [Credits](#credits)
 
-
 ## Features
 
 * 3 interface modes: default, notebook, and chat
@@ -56,7 +75,7 @@ Anyone who would like to assist with supporting Apple Silicon, let me know. Ther
 * LoRA: load and unload LoRAs on the fly, load multiple LoRAs at the same time, train a new LoRA
 * Precise instruction templates for chat mode, including Alpaca, Vicuna, Open Assistant, Dolly, Koala, ChatGLM, MOSS, RWKV-Raven, Galactica, StableLM, WizardLM, Baize, Ziya, Chinese-Vicuna, MPT, INCITE, Wizard Mega, KoAlpaca, Vigogne, Bactrian, h2o, and OpenBuddy
 * [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal)
-* 8-bit and 4-bit inference through bitsandbytes **CPU only mode for macOS, bitsandbytes does not support Apple Silicon M1/M2 processors**
+* 8-bit and 4-bit inference through bitsandbytes **CPU-only mode for macOS; bitsandbytes does not support the Apple Silicon GPU**
 * CPU mode for transformers models
 * [DeepSpeed ZeRO-3 inference](docs/DeepSpeed.md)
 * [Extensions](docs/Extensions.md)
@@ -165,7 +184,7 @@ Optionally, you can use the following command-line flags:
 
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv |
 
 #### Accelerate/transformers
 
@@ -203,8 +222,8 @@ Optionally, you can use the following command-line flags:
 | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
 | `--no-mmap` | Prevent mmap from being used. |
 | `--mlock` | Force the system to keep the model in RAM. |
-| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
-| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
+| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. Does not apply to the Apple Silicon GPU, since it uses unified memory. |
+| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with Apple Silicon GPU support for BLAS and llama.cpp using Metal. Load the model and look for `llama_model_load_internal: n_layer` in STDERR; this shows the number of layers in the model. Set this value to that number, or possibly n + 2. This is very sensitive now and, if set too high, will overrun your data area or tensor cache and cause a segmentation fault. |
 | `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
 | `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama2 70b. |
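If you capture the server's STDERR to a file (for example `python server.py ... 2> server.log`, a hypothetical invocation and path), a quick way to pull that layer count back out might look like this:

```python
import re

# Hypothetical log file captured from the web UI's STDERR.
LOG_FILE = "server.log"

with open(LOG_FILE, errors="ignore") as fh:
    for line in fh:
        # llama.cpp prints a line such as "llama_model_load_internal: n_layer = 32"
        match = re.search(r"llama_model_load_internal:\s*n_layer\s*=?\s*(\d+)", line)
        if match:
            print(f"Model reports n_layer = {match.group(1)}")
            break
```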
@@ -226,8 +245,6 @@ Optionally, you can use the following command-line flags:
 |------------------|-------------|
 |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` |
 |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
-|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
-|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both. `
 
 #### GPTQ-for-LLaMa
 
@@ -243,14 +260,6 @@ Optionally, you can use the following command-line flags:
 | `--warmup_autotune` | (triton) Enable warmup autotune. |
 | `--fused_mlp` | (triton) Enable fused mlp. |
 
-#### FlexGen
-
-| Flag | Description |
-|------------------|-------------|
-| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
-| `--compress-weight` | FlexGen: Whether to compress weight (default: False).|
-| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). |
-
 #### DeepSpeed
 
 | Flag | Description |
@@ -266,6 +275,13 @@ Optionally, you can use the following command-line flags:
 | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
 | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |
 
+#### RoPE (for llama.cpp and ExLlama only)
+
+| Flag | Description |
+|------------------|-------------|
+|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048 (see the worked example below this table). |
+|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Scaling is not identical to embedding compression. Use either this or compress_pos_emb, not both. |
+
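For example, stretching a model trained at a 2048-token context to an 8192-token context corresponds to a compression factor of 4; a quick arithmetic check (not project code, values are illustrative):

```python
# compress_pos_emb is typically max_seq_len / 2048, per the table above.
max_seq_len = 8192                      # example target context length
compress_pos_emb = max_seq_len / 2048   # -> 4.0
print(f"--max_seq_len {max_seq_len} --compress_pos_emb {int(compress_pos_emb)}")
```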
 #### Gradio
 
 | Flag | Description |
@@ -293,8 +309,6 @@ Optionally, you can use the following command-line flags:
 |---------------------------------------|-------------|
 | `--multimodal-pipeline PIPELINE` | The multimodal pipeline to use. Examples: `llava-7b`, `llava-13b`. |
 
-Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md).
-
 ## Presets
 
 Inference settings presets can be created under `presets/` as yaml files. These files are detected automatically at startup.
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 user: ""
 bot: ""
 turn_template: "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> </s><s>[INST] "
-context: "[INST] <<SYS>>\nAnswer the questions.\n<</SYS>>\n"
+context: "[INST] <<SYS>>\nAnswer the questions.\n<</SYS>>\n\n"
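The diff above tweaks the `context` field of a Llama-2 style instruction template (the file name is collapsed in this view). As a rough sketch of how such a template might expand into a prompt, assuming the placeholders are substituted in the obvious way (the web UI's real substitution logic may differ):

```python
# Values taken from the template shown above.
context = "[INST] <<SYS>>\nAnswer the questions.\n<</SYS>>\n\n"
turn_template = "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> </s><s>[INST] "
user, bot = "", ""  # this template uses no speaker-name prefixes


def render_turn(user_message: str, bot_message: str) -> str:
    """Substitute one user/bot exchange into the turn template."""
    return (turn_template
            .replace("<|user|>", user)
            .replace("<|user-message|>", user_message)
            .replace("<|bot|>", bot)
            .replace("<|bot-message|>", bot_message))


prompt = context + render_turn(
    "What is GGUF?",
    "GGUF is the successor to the GGML file format.",
)
print(prompt)
```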

convert-to-flexgen.py

Lines changed: 0 additions & 63 deletions
This file was deleted.

docker/.dockerignore

Lines changed: 0 additions & 9 deletions
This file was deleted.

docker/.env.example

Lines changed: 0 additions & 30 deletions
This file was deleted.
