Refactor ModelLoader: make loader helpers public and remove dead code

mikepapadim · mikepapadim · commit 45bfbbe71c30 · 2025-05-22T17:55:38.000+03:00
diff --git a/src/main/java/com/example/loader/weights/ModelLoader.java b/src/main/java/com/example/loader/weights/ModelLoader.java
@@ -9,7 +9,6 @@
 import com.example.core.model.tensor.GGMLTensorEntry;
 import com.example.core.model.tensor.Q4_0FloatTensor;
 import com.example.core.model.tensor.Q8_0FloatTensor;
-import com.example.core.types.Float16;
 import com.example.core.types.Pair;
 import com.example.inference.engine.impl.Configuration;
 import com.example.inference.engine.impl.Llama;
@@ -22,7 +21,6 @@
 import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 
 import java.io.IOException;
-import java.lang.foreign.MemorySegment;
 import java.nio.ByteOrder;
 import java.nio.FloatBuffer;
 import java.nio.channels.FileChannel;
@@ -35,9 +33,6 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-import static com.example.core.model.tensor.FloatTensor.readByte;
-import static com.example.core.model.tensor.FloatTensor.readShort;
-
 public final class ModelLoader {
     private static final String TOKENIZER_LLAMA_3_MODEL = "gpt2";
 
@@ -105,8 +100,7 @@ private static Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tenso
             GGMLTensorEntry outputWeight) {
         return new Weights(
                 // Load directly to TornadoVM format
-                loadTensorAsFloatArray(tokenEmbeddings),
-                loadArrayAsFloatArrayFromBuffer(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")),
+                loadTensorAsFloatArray(tokenEmbeddings), loadArrayAsFloatArrayFromBuffer(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")),
                 loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
                 loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),
                 loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".attn_v.weight")),
@@ -115,7 +109,7 @@ private static Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tenso
                 loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_gate.weight")),
                 loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_down.weight")),
                 loadArrayAsHalfFloatArray(config.numberOfLayers, i -> tensorEntries.get("blk." + i + ".ffn_up.weight")), floatBufferToFloatArray(tensorEntries.get("output_norm.weight")),
-                FloatArray.fromArray(ropeFreqs.first()), FloatArray.fromArray(ropeFreqs.second()), createByteArrayFromTensor(outputWeight), outputWeight.ggmlType());
+                FloatArray.fromArray(ropeFreqs.first()), FloatArray.fromArray(ropeFreqs.second()), loadTensorAsHalfFloatArray(outputWeight), outputWeight.ggmlType());
     }
 
     /**
@@ -135,23 +129,51 @@ private static Weights createStandardWeights(Map<String, GGMLTensorEntry> tensor
                 FloatBuffer.wrap(ropeFreqs.first()), FloatBuffer.wrap(ropeFreqs.second()), loadQuantized(outputWeight), outputWeight.ggmlType());
     }
 
-    private static FloatArray[] loadArrayAsFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
+    /** Builds the tokenizer from the model metadata; vocabulary entries at index 128000 and above are registered as special tokens. */
+    private static Tokenizer createTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
+        String[] mergeLines = (String[]) metadata.get("tokenizer.ggml.merges");
+        // Each merge line is two space-separated tokens; orElseThrow() fails fast if either is missing from the vocabulary.
+        List<Pair<Integer, Integer>> merges = Arrays.stream(mergeLines).map(line -> line.split(" "))
+                .map(parts -> new Pair<>(vocabulary.getIndex(parts[0]).orElseThrow(), vocabulary.getIndex(parts[1]).orElseThrow())).toList();
+
+        int allTokens = vocabulary.size();
+        int baseTokens = 128000; // assume all tokens after the base ones are special.
+        List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
+
+        assert specialTokensList.stream().allMatch(token -> vocabulary.getIndex(token).isPresent());
+        // Position i in specialTokensList corresponds to absolute token id baseTokens + i.
+        Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed().collect(Collectors.toMap(specialTokensList::get, i -> baseTokens + i));
+        return new Tokenizer(vocabulary, merges, LLAMA_3_PATTERN, specialTokens);
+    }
+
+    /** Wraps the entry's memory segment in the FloatTensor implementation for its GGML type (F16, Q4_0 or Q8_0); any other type, F32 included, throws. */
+    public static FloatTensor loadQuantized(GGMLTensorEntry entry) {
+        final GGMLType format = entry.ggmlType();
+        return switch (format) {
+            case F16 -> new F16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q4_0 -> new Q4_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q8_0 -> new Q8_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            default -> throw new UnsupportedOperationException("Quantization format " + format);
+        };
+    }
+
+    public static FloatArray[] loadArrayAsFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
         FloatArray[] array = new FloatArray[size];
         for (int i = 0; i < size; i++) {
             array[i] = loadTensorAsFloatArray(getTensorEntry.apply(i));
         }
         return array;
     }
 
-    private static HalfFloatArray[] loadArrayAsHalfFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
+    public static HalfFloatArray[] loadArrayAsHalfFloatArray(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
         HalfFloatArray[] array = new HalfFloatArray[size];
         for (int i = 0; i < size; i++) {
             array[i] = loadTensorAsHalfFloatArray(getTensorEntry.apply(i));
         }
         return array;
     }
 
-    private static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
+    public static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
         if (tensorEntry.ggmlType() == GGMLType.F32) {
             FloatBuffer buffer = tensorEntry.memorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
             return FloatArray.fromFloatBuffer(buffer);
@@ -160,21 +182,20 @@ private static FloatArray floatBufferToFloatArray(GGMLTensorEntry tensorEntry) {
         }
     }
 
-
-    private static FloatArray[] loadArrayAsFloatArrayFromBuffer(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
+    public static FloatArray[] loadArrayAsFloatArrayFromBuffer(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
         FloatArray[] array = new FloatArray[size];
         for (int i = 0; i < size; i++) {
             array[i] = floatBufferToFloatArray(getTensorEntry.apply(i));
         }
         return array;
     }
 
-    private static ByteArray createByteArrayFromTensor(GGMLTensorEntry entry) {
+    public static ByteArray createByteArrayFromTensor(GGMLTensorEntry entry) {
         FloatTensor tensor = loadQuantized(entry);
         return ByteArray.fromSegment(tensor.asMemorySegment());
     }
 
-    private static FloatArray loadTensorAsFloatArray(GGMLTensorEntry entry) {
+    public static FloatArray loadTensorAsFloatArray(GGMLTensorEntry entry) {
         if (entry.ggmlType() == GGMLType.F32) {
             // For F32, we can directly create FloatArray from memory
             FloatBuffer buffer = entry.memorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
@@ -194,18 +215,10 @@ private static FloatArray loadTensorAsFloatArray(GGMLTensorEntry entry) {
         }
     }
 
-    private static HalfFloatArray loadTensorAsHalfFloatArray(GGMLTensorEntry entry) {
+    public static HalfFloatArray loadTensorAsHalfFloatArray(GGMLTensorEntry entry) {
         if (entry.ggmlType() == GGMLType.F32) {
-            // For F32, we can directly create FloatArray from memory
-//            FloatBuffer buffer = entry.memorySegment().asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
-//            FloatArray array = new FloatArray(buffer.remaining());
-//            for (int i = 0; i < buffer.remaining(); i++) {
-//                array.set(i, buffer.get());
-//            }
-//            return array
-            //           ;
             System.out.println("Loading F32 tensor as HalfFloatArray");
-            return  null;
+            return null;
         } else {
             // For quantized formats, we need to load through FloatTensor
             FloatTensor tensor = loadQuantized(entry);
@@ -218,52 +231,6 @@ private static HalfFloatArray loadTensorAsHalfFloatArray(GGMLTensorEntry entry)
         }
     }
 
-    public static float getFloat(int index, int size, MemorySegment memorySegment) {
-        assert 0 <= index && index < size;
-        int blockIndex = index / GGMLType.Q4_0.getBlockSize();
-        int blockOffset = blockIndex * GGMLType.Q4_0.getTypeSize();
-        float scale = Float.float16ToFloat(readShort(memorySegment, blockOffset));
-        byte quant;
-        int modIndex = index % GGMLType.Q4_0.getBlockSize();
-        if (modIndex < GGMLType.Q4_0.getBlockSize() / 2) {
-            quant = (byte) (readByte(memorySegment, blockOffset + Float16.BYTES + modIndex) & 0x0F);
-        } else {
-            quant = (byte) ((readByte(memorySegment, blockOffset + Float16.BYTES + modIndex - GGMLType.Q4_0.getBlockSize() / 2) >>> 4) & 0x0F);
-        }
-        quant -= 8;
-        return quant * scale;
-    }
-
-    private static Tokenizer createTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
-        String[] mergeLines = (String[]) metadata.get("tokenizer.ggml.merges");
-        List<Pair<Integer, Integer>> merges = Arrays.stream(mergeLines).map(line -> line.split(" "))
-                .map(parts -> new Pair<>(vocabulary.getIndex(parts[0]).orElseThrow(), vocabulary.getIndex(parts[1]).orElseThrow())).toList();
-
-        int allTokens = vocabulary.size();
-        int baseTokens = 128000; // assume all tokens after the base ones are special.
-        int reservedSpecialTokens = allTokens - baseTokens;
-        List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
-
-        assert specialTokensList.stream().allMatch(token -> vocabulary.getIndex(token).isPresent());
-
-        Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed().collect(Collectors.toMap(i -> specialTokensList.get(i), i -> baseTokens + i));
-
-        return new Tokenizer(vocabulary, merges, LLAMA_3_PATTERN, specialTokens);
-    }
-
-    public static FloatTensor loadQuantized(GGMLTensorEntry entry) {
-        GGMLType ggmlType = entry.ggmlType();
-//        System.out.println("Loading quantized tensor of type " + entry.name());
-        return switch (ggmlType) {
-            //            case F32 -> new F32FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            case Q8_0 -> new Q8_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            case Q4_0 -> new Q4_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            //            case BF16 ->  new BF16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            case F16 -> new F16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
-            default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
-        };
-    }
-
     public static FloatTensor[] loadArrayOfQuantized(int size, IntFunction<GGMLTensorEntry> getTensorEntry) {
         FloatTensor[] array = new FloatTensor[size];
         for (int i = 0; i < size; i++) {