Skip to content

Commit a8be6a7

Browse files
committed
DLPXECO-11971 Custom ollama changes for delphix
1 parent 238aa11 commit a8be6a7

File tree

6 files changed

+181
-8
lines changed

6 files changed

+181
-8
lines changed

build_for_cpu.sh

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/bin/bash
# Builds the CPU image described by Dockerfile-cpu with `docker buildx`.
#
# By default the image is built for the host platform and loaded into the
# local Docker daemon. Set PUSH to any non-empty value to build for every
# platform in PLATFORMS and push the result instead.
#
# Environment overrides (all optional):
#   ORG, IMAGE_NAME, VERSION          image reference: ORG/IMAGE_NAME:VERSION
#   DOCKER_USERNAME, DOCKER_PASSWORD  if both are set, log in before building
#   PLATFORMS                         comma-separated buildx platform list
#   PUSH                              non-empty => --push, empty => --load
set -eu

# Set your organization and image name
ORG=${ORG:-"arunskurian"}
IMAGE_NAME=${IMAGE_NAME:-"ollama-cpu"}
VERSION=${VERSION:-"latest"}

# Docker Hub credentials (can be set via environment variables)
DOCKER_USERNAME=${DOCKER_USERNAME:-""}
DOCKER_PASSWORD=${DOCKER_PASSWORD:-""}

# Target platforms - same as Ollama's defaults
PLATFORMS=${PLATFORMS:-"linux/arm64,linux/amd64"}

# Silent login if credentials are provided
if [ -n "$DOCKER_USERNAME" ] && [ -n "$DOCKER_PASSWORD" ]; then
    echo "Logging in to Docker Hub as $DOCKER_USERNAME..."
    # The login output is suppressed, so a failure must be reported here;
    # otherwise `set -e` would end the script without any message.
    if ! printf '%s' "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin >/dev/null 2>&1; then
        echo "ERROR: Docker login failed for user '$DOCKER_USERNAME'" >&2
        exit 1
    fi
    echo "Login successful"

    # If login successful, use the provided username as the org
    # NOTE(review): ORG defaults to "arunskurian" above, so this branch only
    # runs when ORG=yourorg is passed in explicitly - confirm the intended
    # placeholder value.
    if [ "$ORG" = "yourorg" ]; then
        ORG=$DOCKER_USERNAME
        echo "Using Docker username '$ORG' as organization"
    fi
else
    echo "Docker credentials not provided, assuming you're already logged in"
fi

# ORG is final from here on, so the full image reference can be fixed.
IMAGE_REF="${ORG}/${IMAGE_NAME}:${VERSION}"

# Ensure QEMU is installed for cross-platform builds
echo "Setting up QEMU for cross-platform builds..."
docker run --privileged --rm tonistiigi/binfmt --install all

# Set up buildx if needed
BUILDER_NAME="multiarch-builder"
if ! docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
    echo "Creating new buildx builder: ${BUILDER_NAME}"
    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
else
    docker buildx use "${BUILDER_NAME}"
fi
docker buildx inspect --bootstrap

# Set PUSH to a non-empty string to trigger push instead of load
PUSH=${PUSH:-""}
if [ -z "${PUSH}" ]; then
    echo "Building ${IMAGE_REF} locally. Set PUSH=1 to push"
    # Note: --load only works for single platform, so if building locally, adjust PLATFORMS
    if [[ "${PLATFORMS}" == *","* ]]; then
        # Map the kernel's machine name onto Docker's architecture names.
        NATIVE_PLATFORM="linux/$(uname -m | sed -e 's/x86_64/amd64/' -e 's/aarch64/arm64/')"
        echo "WARNING: --load only works for single platform. Setting platform to ${NATIVE_PLATFORM}"
        PLATFORMS="${NATIVE_PLATFORM}"
    fi
    LOAD_OR_PUSH="--load"
else
    echo "Will be pushing ${IMAGE_REF}"
    LOAD_OR_PUSH="--push"
fi

# Build and push/load the multi-arch image
echo "Building for platforms: ${PLATFORMS}"
docker buildx build \
    --network=host \
    "${LOAD_OR_PUSH}" \
    --platform="${PLATFORMS}" \
    -f Dockerfile-cpu \
    -t "${IMAGE_REF}" \
    .

echo "Build completed successfully!"
if [ -n "${PUSH}" ]; then
    echo "Image pushed to: ${IMAGE_REF}"
    echo "To pull: docker pull ${IMAGE_REF}"
fi

cmd/cmd.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1571,6 +1571,7 @@ func NewCLI() *cobra.Command {
15711571
envVars["OLLAMA_LLM_LIBRARY"],
15721572
envVars["OLLAMA_GPU_OVERHEAD"],
15731573
envVars["OLLAMA_LOAD_TIMEOUT"],
1574+
envVars["OLLAMA_SKIP_MEMORY_CHECK"],
15741575
})
15751576
default:
15761577
appendEnvDocs(cmd, envs)

docs/faq.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,23 @@ The currently available K/V cache quantization types are:
333333
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
334334

335335
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
336+
337+
338+
339+
## How do I bypass the available memory check before loading a model?
340+
341+
By default, Ollama checks if your system has sufficient available memory before loading a model to prevent out-of-memory errors that could crash your system or cause instability.
342+
You can bypass this safety check by setting the `OLLAMA_SKIP_MEMORY_CHECK` environment variable to `1`.
343+
344+
### When to use this option
345+
346+
- You have swap space configured and accept slower performance
347+
- You're running on a system with non-standard memory reporting
348+
- You're debugging memory-related issues
349+
- You understand the risks and have adequate system monitoring
350+
351+
### Important Warnings
352+
353+
- System instability: Loading models without sufficient memory can cause system freezes or crashes
354+
- Performance degradation: Your system may become unresponsive due to excessive swapping
355+
- Data loss risk: System crashes could result in unsaved work being lost

envconfig/config.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ var (
226226
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
227227
)
228228

229+
var (
230+
// Bypass the memory check during model load. This is an expert-only setting, to be used in situations where the system is guaranteed
231+
// to have enough memory or is able to procure it at runtime by evicting blocks from caches, e.g. the ZFS ARC cache.
232+
AvailableMemoryCheckOverride = Uint("OLLAMA_SKIP_MEMORY_CHECK", 0)
233+
)
234+
229235
func Uint64(key string, defaultValue uint64) func() uint64 {
230236
return func() uint64 {
231237
if s := Var(key); s != "" {
@@ -275,6 +281,9 @@ func AsMap() map[string]EnvVar {
275281
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
276282
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
277283
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
284+
285+
//Overrides
286+
"OLLAMA_SKIP_MEMORY_CHECK": {"OLLAMA_SKIP_MEMORY_CHECK", AvailableMemoryCheckOverride(), "Bypass checking for available memory before loading models. (e.g. OLLAMA_SKIP_MEMORY_CHECK=1)"},
278287
}
279288

280289
if runtime.GOOS != "windows" {

llm/memory.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package llm
22

33
import (
4+
"bufio"
45
"fmt"
56
"log/slog"
67
"os"
@@ -438,3 +439,55 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
438439

439440
return weights
440441
}
442+
443+
// GetZFSReclaimableMemory returns max(0, size - c_min) computed from the ZFS
// ARC statistics file, i.e. the part of the ARC above its configured minimum.
//
// It reads the first of /proc/spl/kstat/zfs/arcstats and /proc/zfs/arcstats
// that can be opened. An error is returned when neither file can be opened
// (for example when ZFS is not in use), when reading the file fails, or when
// the "size" or "c_min" row is missing. The value is 0 whenever an error is
// returned.
func GetZFSReclaimableMemory() (uint64, error) {
	var f *os.File
	for _, path := range []string{"/proc/spl/kstat/zfs/arcstats", "/proc/zfs/arcstats"} {
		file, err := os.Open(path)
		if err == nil {
			f = file
			break
		}
	}
	if f == nil {
		return 0, fmt.Errorf("no ZFS ARC stats found")
	}
	defer f.Close()

	var (
		size, cmin         uint64
		haveSize, haveCmin bool
	)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		// A usable row has the stat name in column 0 and its value in
		// column 2; anything shorter or non-numeric is skipped.
		cols := strings.Fields(scanner.Text())
		if len(cols) < 3 {
			continue
		}
		val, err := strconv.ParseUint(cols[2], 10, 64)
		if err != nil {
			continue
		}
		switch cols[0] {
		case "size":
			size, haveSize = val, true
		case "c_min":
			cmin, haveCmin = val, true
		}
	}
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("reading ZFS ARC stats: %w", err)
	}
	// Track presence explicitly: a reported value of 0 is valid data and must
	// not be mistaken for a missing row.
	if !haveSize || !haveCmin {
		return 0, fmt.Errorf("failed to read ZFS ARC stats")
	}
	if size > cmin {
		return size - cmin, nil
	}
	return 0, nil
}

llm/server.go

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -161,14 +161,30 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
161161
}
162162
}
163163

164-
// On linux and windows, over-allocating CPU memory will almost always result in an error
165-
// Darwin has fully dynamic swap so has no direct concept of free swap space
166-
if runtime.GOOS != "darwin" {
167-
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
168-
available := systemFreeMemory + systemSwapFreeMemory
169-
if systemMemoryRequired > available {
170-
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
171-
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
164+
// Env variable to bypass ollama's memory check guardrail.
165+
if envconfig.AvailableMemoryCheckOverride() == 1 {
166+
slog.Warn("OLLAMA_SKIP_MEMORY_CHECK set; bypassing memory checks")
167+
} else {
168+
// On linux and windows, over-allocating CPU memory will almost always result in an error
169+
// Darwin has fully dynamic swap so has no direct concept of free swap space
170+
slog.Debug("OLLAMA_SKIP_MEMORY_CHECK not set; running memory checks")
171+
if runtime.GOOS != "darwin" {
172+
systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
173+
available := systemFreeMemory + systemSwapFreeMemory
174+
175+
// On Linux, reclaim ZFS ARC (size – c_min)
176+
if runtime.GOOS == "linux" {
177+
if reclaim, err := GetZFSReclaimableMemory(); err == nil {
178+
slog.Info("reclaiming ZFS Arc cache size:", "size", format.HumanBytes2(reclaim))
179+
available += reclaim
180+
} else {
181+
slog.Warn("failure while computing ZFS Arc cache size:", "error", err)
182+
}
183+
}
184+
if systemMemoryRequired > available {
185+
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
186+
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
187+
}
172188
}
173189
}
174190

0 commit comments

Comments
 (0)