DLPXECO-11971 Custom ollama changes for delphix

arunskurian · arunskurian · commit a8be6a78dca1 · 2025-07-09T23:15:02.000-04:00
diff --git a/build_for_cpu.sh b/build_for_cpu.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+set -eu
+
+# Image coordinates and target platforms (same as Ollama's defaults); all can be overridden via the environment
+DEFAULT_ORG="arunskurian"
+ORG=${ORG:-"$DEFAULT_ORG"}
+IMAGE_NAME=${IMAGE_NAME:-"ollama-cpu"}
+VERSION=${VERSION:-"latest"}
+PLATFORMS=${PLATFORMS:-"linux/arm64,linux/amd64"}
+
+# Docker Hub credentials (can be set via environment variables)
+DOCKER_USERNAME=${DOCKER_USERNAME:-""}
+DOCKER_PASSWORD=${DOCKER_PASSWORD:-""}
+
+# Silent login if credentials are provided
+if [ -n "$DOCKER_USERNAME" ] && [ -n "$DOCKER_PASSWORD" ]; then
+  echo "Logging in to Docker Hub as $DOCKER_USERNAME..."
+  # Login output is discarded, so report a failure explicitly instead of letting `set -e` exit without a message
+  echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin >/dev/null 2>&1 ||
+    { echo "ERROR: docker login failed for user '$DOCKER_USERNAME'" >&2; exit 1; }
+  echo "Login successful"
+  # If ORG was left at its default, use the logged-in username as the org
+  if [ "$ORG" = "$DEFAULT_ORG" ]; then
+    ORG=$DOCKER_USERNAME
+    echo "Using Docker username '$ORG' as organization"
+  fi
+else
+  echo "Docker credentials not provided, assuming you're already logged in"
+fi
+
+# Ensure QEMU is installed for cross-platform builds
+echo "Setting up QEMU for cross-platform builds..."
+docker run --privileged --rm tonistiigi/binfmt --install all
+
+# Set up buildx if needed
+BUILDER_NAME="multiarch-builder"
+if ! docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
+    echo "Creating new buildx builder: ${BUILDER_NAME}"
+    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
+else
+    docker buildx use "${BUILDER_NAME}"
+fi
+docker buildx inspect --bootstrap
+
+# Set PUSH to a non-empty string to trigger push instead of load
+PUSH=${PUSH:-""}
+if [ -z "${PUSH}" ] ; then
+    echo "Building ${ORG}/${IMAGE_NAME}:${VERSION} locally. Set PUSH=1 to push"
+    # Note: --load only works for single platform, so a multi-platform list is narrowed to the host's platform
+    if [[ "${PLATFORMS}" == *","* ]]; then
+        PLATFORMS="linux/$(uname -m | sed -e 's/x86_64/amd64/' -e 's/aarch64/arm64/')"
+        echo "WARNING: --load only works for single platform. Setting platform to ${PLATFORMS}"
+    fi
+    LOAD_OR_PUSH="--load"
+else
+    echo "Will be pushing ${ORG}/${IMAGE_NAME}:${VERSION}"
+    LOAD_OR_PUSH="--push"
+fi
+
+# Build and push/load the multi-arch image
+echo "Building for platforms: ${PLATFORMS}"
+docker buildx build \
+    --network=host \
+    "${LOAD_OR_PUSH}" \
+    --platform="${PLATFORMS}" \
+    -f Dockerfile-cpu \
+    -t "${ORG}/${IMAGE_NAME}:${VERSION}" \
+    .
+
+echo "Build completed successfully!"
+if [ -n "${PUSH}" ]; then
+    echo "Image pushed to: ${ORG}/${IMAGE_NAME}:${VERSION}"
+    echo "To pull: docker pull ${ORG}/${IMAGE_NAME}:${VERSION}"
+fi
diff --git a/cmd/cmd.go b/cmd/cmd.go
@@ -1571,6 +1571,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
+				envVars["OLLAMA_SKIP_MEMORY_CHECK"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
diff --git a/docs/faq.md b/docs/faq.md
@@ -333,3 +333,23 @@ The currently available K/V cache quantization types are:
 How much the cache quantization impacts the model's response quality will depend on the model and the task.  Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
 
 You may need to experiment with different quantization types to find the best balance between memory usage and quality.
+
+
+
+## How do I bypass the available memory check before loading a model?
+
+By default, Ollama checks if your system has sufficient available memory before loading a model to prevent out-of-memory errors that could crash your system or cause instability.
+You can bypass this safety check by setting the `OLLAMA_SKIP_MEMORY_CHECK` environment variable to `1`.
+
+### When to use this option
+
+- You have swap space configured and accept slower performance
+- You're running on a system with non-standard memory reporting
+- You're debugging memory-related issues
+- You understand the risks and have adequate system monitoring
+
+### Important Warnings
+
+- System instability: Loading models without sufficient memory can cause system freezes or crashes
+- Performance degradation: Your system may become unresponsive due to excessive swapping
+- Data loss risk: System crashes could result in unsaved work being lost
diff --git a/envconfig/config.go b/envconfig/config.go
@@ -226,6 +226,12 @@ var (
 	MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
 )
 
+var (
+	// AvailableMemoryCheckOverride bypasses the memory check during model load when OLLAMA_SKIP_MEMORY_CHECK is set to 1.
+	// Expert-only: use it when the system is guaranteed to have enough memory, or can procure it at runtime by evicting blocks from caches (e.g. the ZFS ARC).
+	AvailableMemoryCheckOverride = Uint("OLLAMA_SKIP_MEMORY_CHECK", 0)
+)
+
 func Uint64(key string, defaultValue uint64) func() uint64 {
 	return func() uint64 {
 		if s := Var(key); s != "" {
@@ -275,6 +281,9 @@ func AsMap() map[string]EnvVar {
 		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
 		"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
 		"NO_PROXY":    {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
+
+		//Overrides
+		"OLLAMA_SKIP_MEMORY_CHECK": {"OLLAMA_SKIP_MEMORY_CHECK", AvailableMemoryCheckOverride(), "Bypass checking for available memory before loading models. (e.g. OLLAMA_SKIP_MEMORY_CHECK=1)"},
 	}
 
 	if runtime.GOOS != "windows" {
diff --git a/llm/memory.go b/llm/memory.go
@@ -1,6 +1,7 @@
 package llm
 
 import (
+	"bufio"
 	"fmt"
 	"log/slog"
 	"os"
@@ -438,3 +439,55 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
 
 	return weights
 }
+
+// GetZFSReclaimableMemory reports how many bytes the ZFS ARC could give back
+// to the system, computed as max(0, size - c_min) from the ARC stats file.
+//
+// It returns an error when no arcstats file can be opened (the last open
+// error is wrapped, so errors.Is(err, os.ErrNotExist) identifies hosts without
+// ZFS) or when the size or c_min row is absent or not an unsigned integer.
+func GetZFSReclaimableMemory() (uint64, error) {
+	var f *os.File
+	var openErr error
+	for _, path := range []string{"/proc/spl/kstat/zfs/arcstats", "/proc/zfs/arcstats"} {
+		if f, openErr = os.Open(path); openErr == nil {
+			break
+		}
+	}
+	if openErr != nil {
+		return 0, fmt.Errorf("no ZFS ARC stats found: %w", openErr)
+	}
+	defer f.Close()
+
+	// Each row carries its name in column 0 and its value in column 2. Presence is
+	// tracked separately from the value so a reported 0 is not treated as missing.
+	var size, cmin uint64
+	var haveSize, haveCmin bool
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		cols := strings.Fields(scanner.Text())
+		if len(cols) < 3 {
+			continue
+		}
+		switch cols[0] {
+		case "size":
+			if v, err := strconv.ParseUint(cols[2], 10, 64); err == nil {
+				size, haveSize = v, true
+			}
+		case "c_min":
+			if v, err := strconv.ParseUint(cols[2], 10, 64); err == nil {
+				cmin, haveCmin = v, true
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return 0, err
+	}
+	if !haveSize || !haveCmin {
+		return 0, fmt.Errorf("failed to read ZFS ARC stats")
+	}
+	if size > cmin {
+		return size - cmin, nil
+	}
+	return 0, nil
+}
diff --git a/llm/server.go b/llm/server.go
@@ -161,14 +161,30 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}
 
-	// On linux and windows, over-allocating CPU memory will almost always result in an error
-	// Darwin has fully dynamic swap so has no direct concept of free swap space
-	if runtime.GOOS != "darwin" {
-		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
-		available := systemFreeMemory + systemSwapFreeMemory
-		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+	// Env variable to bypass ollama's memory check guardrail.
+	if envconfig.AvailableMemoryCheckOverride() == 1 {
+		slog.Warn("OLLAMA_SKIP_MEMORY_CHECK set; bypassing memory checks")
+	} else {
+		// On linux and windows, over-allocating CPU memory will almost always result in an error
+		// Darwin has fully dynamic swap so has no direct concept of free swap space
+		slog.Debug("OLLAMA_SKIP_MEMORY_CHECK not set; running memory checks")
+		if runtime.GOOS != "darwin" {
+			systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
+			available := systemFreeMemory + systemSwapFreeMemory
+
+			// On Linux, reclaim ZFS ARC (size – c_min)
+			if runtime.GOOS == "linux" {
+				if reclaim, err := GetZFSReclaimableMemory(); err == nil {
+					slog.Info("reclaiming ZFS Arc cache size:", "size", format.HumanBytes2(reclaim))
+					available += reclaim
+				} else {
+					slog.Warn("failure while computing ZFS Arc cache size:", "error", err)
+				}
+			}
+			if systemMemoryRequired > available {
+				slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory))
+				return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+			}
 		}
 	}