Move loadModel methods to dedicated model classes

orionpapadakis · orionpapadakis · commit 8e63862ef156 · 2025-06-12T18:17:06.000+03:00
diff --git a/src/main/java/com/example/aot/AOT.java b/src/main/java/com/example/aot/AOT.java
@@ -46,7 +46,7 @@ private static PartialModel preLoadGGUF(String modelPath) {
             try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
                 return new PartialModel(
                         path.getFileName().toString(),
-                        ModelLoader.loadLlamaModel(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false),
+                        Llama.loadModel(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false), // TODO: needs proper handling for AOT
                         gguf.getTensorDataOffset(),
                         gguf.getTensorInfos()
                 );
diff --git a/src/main/java/com/example/loader/weights/ModelLoader.java b/src/main/java/com/example/loader/weights/ModelLoader.java
@@ -1,7 +1,6 @@
 package com.example.loader.weights;
 
 import com.example.LlamaApp;
-import com.example.auxiliary.Timer;
 import com.example.core.model.GGMLType;
 import com.example.core.model.GGUF;
 import com.example.core.model.tensor.F16FloatTensor;
@@ -70,89 +69,10 @@ public static Model loadModel(Path ggufPath, int contextLength, boolean loadWeig
         // initial load of metadata from gguf file
         GGUF gguf = GGUF.loadModel(ggufPath);
         FileChannel fileChannel = FileChannel.open(ggufPath, StandardOpenOption.READ);
-
         // detect model type
         ModelType modelType = detectModelType(gguf.getMetadata());
-        System.out.println("Detected model type: " + modelType);
-
-        // load model (vocabulary, tokenizer, configuration, tensors, weights)
-        return switch (modelType) {
-            case LLAMA_3 -> loadLlamaModel(fileChannel, gguf, contextLength, loadWeights);
-            case MISTRAL -> loadMistralModel(fileChannel, gguf, contextLength, loadWeights);
-            default -> throw new UnsupportedOperationException("Unsupported model type: " + modelType);
-        };
-    }
-
-    public static Llama loadLlamaModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) throws IOException {
-        try (var ignored = Timer.log("Load LlaMa model")) {
-            Map<String, Object> metadata = gguf.getMetadata();
-
-            Vocabulary vocabulary = Vocabulary.loadLlamaVocabulary(metadata);
-            Tokenizer tokenizer = createLlama3Tokenizer(metadata, vocabulary);
-
-            LlamaConfiguration config = new LlamaConfiguration(
-                    (int) metadata.get("llama.embedding_length"),
-                    (int) metadata.get("llama.feed_forward_length"),
-                    (int) metadata.get("llama.block_count"),
-                    (int) metadata.get("llama.attention.head_count"),
-
-                    metadata.containsKey("llama.attention.head_count_kv") ?
-                            (int) metadata.get("llama.attention.head_count_kv") :
-                            (int) metadata.get("llama.attention.head_count"),
-
-                    vocabulary.size(),
-                    (int) metadata.get("llama.context_length"),
-                    (float) metadata.getOrDefault("llama.attention.layer_norm_rms_epsilon", 1e-5f),
-                    (float) metadata.getOrDefault("llama.rope.freq_base", 10000f)
-            ).withContextLength(contextLength);
-
-            Weights weights = null;
-            if (loadWeights) {
-                Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
-                weights = loadWeights(tensorEntries, config);
-            }
-            return new Llama(config, tokenizer, weights);
-        }
-    }
-
-    public static Mistral loadMistralModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) {
-        try (var ignored = Timer.log("Load Mistral model")) {
-            Map<String, Object> metadata = gguf.getMetadata();
-
-            Vocabulary vocabulary = Vocabulary.loadMistralVocabulary(metadata);
-            Tokenizer tokenizer = createMistralTokenizer(metadata, vocabulary);
-
-            int modelContextLength = (int) metadata.get("llama.context_length");
-            if (contextLength < 0 || modelContextLength < contextLength) {
-                contextLength = modelContextLength;
-            }
-
-            MistralConfiguration config = new MistralConfiguration(
-                    (int) metadata.get("llama.embedding_length"),
-                    (int) metadata.get("llama.feed_forward_length"),
-                    (int) metadata.get("llama.block_count"),
-                    (int) metadata.get("llama.attention.head_count"),
-
-                    metadata.containsKey("llama.attention.head_count_kv")
-                            ? (int) metadata.get("llama.attention.head_count_kv")
-                            : (int) metadata.get("llama.attention.head_count"),
-
-                    vocabulary.size(),
-                    contextLength,
-                    false,
-                    (float) metadata.getOrDefault("llama.attention.layer_norm_rms_epsilon", 1e-5f),
-                    (float) metadata.getOrDefault("llama.rope.freq_base", 10000f)
-            );
-
-            Weights weights = null;
-            if (loadWeights) {
-                Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
-                weights = loadWeights(tensorEntries, config);
-            }
-            return new Mistral(config, tokenizer, weights);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
+        // delegate loading to the detected model type's own loader
+        return modelType.loadModel(fileChannel, gguf, contextLength, loadWeights);
     }
 
     public static Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, Configuration config) {
diff --git a/src/main/java/com/example/model/ModelType.java b/src/main/java/com/example/model/ModelType.java
@@ -1,11 +1,33 @@
 package com.example.model;
 
 import com.example.core.model.GGUF;
+import com.example.model.llama.Llama;
+import com.example.model.mistral.Mistral;
 
 import java.nio.channels.FileChannel;
 
 public enum ModelType {
-    LLAMA_3,
-    MISTRAL,
-    UNKNOWN;
+    LLAMA_3 {
+        @Override
+        public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) {
+            return Llama.loadModel(fileChannel, gguf, contextLength, loadWeights);
+        }
+    },
+
+    MISTRAL {
+        @Override
+        public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) {
+            return Mistral.loadModel(fileChannel, gguf, contextLength, loadWeights);
+        }
+    },
+
+    UNKNOWN {
+        @Override
+        public Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) {
+            throw new UnsupportedOperationException("Cannot load unknown model type");
+        }
+    };
+
+    // Loads the model for this type; every enum constant must provide its own implementation
+    public abstract Model loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights);
 }
diff --git a/src/main/java/com/example/model/llama/Llama.java b/src/main/java/com/example/model/llama/Llama.java
@@ -1,11 +1,21 @@
 package com.example.model.llama;
 
+import com.example.auxiliary.Timer;
+import com.example.core.model.GGUF;
+import com.example.core.model.tensor.GGMLTensorEntry;
 import com.example.model.Model;
 import com.example.loader.weights.State;
 import com.example.loader.weights.Weights;
 import com.example.model.ModelType;
 import com.example.tokenizer.impl.LlamaTokenizer;
 import com.example.tokenizer.impl.Tokenizer;
+import com.example.tokenizer.vocabulary.Vocabulary;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.util.Map;
+
+import static com.example.loader.weights.ModelLoader.loadWeights;
 
 public record Llama(LlamaConfiguration configuration, Tokenizer tokenizer, Weights weights) implements Model {
     private static final int BATCH_SIZE = Integer.getInteger("llama.BatchSize", 16);
@@ -32,5 +42,39 @@ public State createNewState(int batchsize) {
         return state;
     }
 
+    public static Llama loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) {
+        try (var ignored = Timer.log("Load LlaMa model")) {
+            Map<String, Object> metadata = gguf.getMetadata();
+
+            Vocabulary vocabulary = Vocabulary.loadLlamaVocabulary(metadata);
+            Tokenizer tokenizer = new LlamaTokenizer(metadata, vocabulary);
+
+            LlamaConfiguration config = new LlamaConfiguration(
+                    (int) metadata.get("llama.embedding_length"),
+                    (int) metadata.get("llama.feed_forward_length"),
+                    (int) metadata.get("llama.block_count"),
+                    (int) metadata.get("llama.attention.head_count"),
+
+                    metadata.containsKey("llama.attention.head_count_kv") ?
+                            (int) metadata.get("llama.attention.head_count_kv") :
+                            (int) metadata.get("llama.attention.head_count"),
+
+                    vocabulary.size(),
+                    (int) metadata.get("llama.context_length"),
+                    (float) metadata.getOrDefault("llama.attention.layer_norm_rms_epsilon", 1e-5f),
+                    (float) metadata.getOrDefault("llama.rope.freq_base", 10000f)
+            ).withContextLength(contextLength);
+
+            Weights weights = null;
+            if (loadWeights) {
+                Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
+                weights = loadWeights(tensorEntries, config);
+            }
+            return new Llama(config, tokenizer, weights);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
 }
 
diff --git a/src/main/java/com/example/model/mistral/Mistral.java b/src/main/java/com/example/model/mistral/Mistral.java
@@ -1,11 +1,21 @@
 package com.example.model.mistral;
 
+import com.example.auxiliary.Timer;
+import com.example.core.model.GGUF;
+import com.example.core.model.tensor.GGMLTensorEntry;
 import com.example.model.Model;
 import com.example.loader.weights.State;
 import com.example.loader.weights.Weights;
 import com.example.model.ModelType;
 import com.example.tokenizer.impl.MistralTokenizer;
 import com.example.tokenizer.impl.Tokenizer;
+import com.example.tokenizer.vocabulary.Vocabulary;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.util.Map;
+
+import static com.example.loader.weights.ModelLoader.loadWeights;
 
 public record Mistral(MistralConfiguration configuration, Tokenizer tokenizer, Weights weights) implements Model {
 
@@ -29,4 +39,44 @@ public State createNewState(int batchsize) {
         return state;
     }
 
+    public static Mistral loadModel(FileChannel fileChannel, GGUF gguf, int contextLength, boolean loadWeights) {
+        try (var ignored = Timer.log("Load Mistral model")) {
+            Map<String, Object> metadata = gguf.getMetadata();
+
+            Vocabulary vocabulary = Vocabulary.loadMistralVocabulary(metadata);
+            Tokenizer tokenizer = new MistralTokenizer(metadata, vocabulary);
+
+            int modelContextLength = (int) metadata.get("llama.context_length");
+            if (contextLength < 0 || modelContextLength < contextLength) {
+                contextLength = modelContextLength;
+            }
+
+            MistralConfiguration config = new MistralConfiguration(
+                    (int) metadata.get("llama.embedding_length"),
+                    (int) metadata.get("llama.feed_forward_length"),
+                    (int) metadata.get("llama.block_count"),
+                    (int) metadata.get("llama.attention.head_count"),
+
+                    metadata.containsKey("llama.attention.head_count_kv")
+                            ? (int) metadata.get("llama.attention.head_count_kv")
+                            : (int) metadata.get("llama.attention.head_count"),
+
+                    vocabulary.size(),
+                    contextLength,
+                    false,
+                    (float) metadata.getOrDefault("llama.attention.layer_norm_rms_epsilon", 1e-5f),
+                    (float) metadata.getOrDefault("llama.rope.freq_base", 10000f)
+            );
+
+            Weights weights = null;
+            if (loadWeights) {
+                Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, gguf.getTensorDataOffset(), gguf.getTensorInfos());
+                weights = loadWeights(tensorEntries, config);
+            }
+            return new Mistral(config, tokenizer, weights);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
 }