
Commit 81b856f

Working AVX through explicit vectorization.

It's not ideal that LLVM can't figure this out for us, but it's better than not having it.
1 parent 7c1e277

4 files changed (+125, -5 lines)

typed_python/compiler/native_compiler/binary_shared_object.py

Lines changed: 3 additions & 1 deletion
@@ -81,7 +81,9 @@ def fromDisk(path, globalVariableDefinitions, functionNameToType, usedExternalFu
     def fromModule(module, globalVariableDefinitions, functionNameToType, usedExternalFunctions, functionDefinitions):
         target_triple = llvm.get_process_triple()
         target = llvm.Target.from_triple(target_triple)
-        target_machine_shared_object = target.create_target_machine(reloc='pic', codemodel='default')
+        features = llvm.get_host_cpu_features()
+
+        target_machine_shared_object = target.create_target_machine(reloc='pic', codemodel='default', features=features.flatten())
 
         # returns the contents of a '.o' file coming out of a c++ compiler like clang
         o_file_contents = target_machine_shared_object.emit_object(module)
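
For reference, a minimal sketch (assuming llvmlite is the llvm binding imported in this module, which the API used here suggests) of what the new features argument carries: get_host_cpu_features() returns a dict-like FeatureMap, and flatten() renders it as the "+feat,-feat" string that create_target_machine accepts.

import llvmlite.binding as llvm

llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()

# dict-like FeatureMap, e.g. {'avx': True, 'avx512f': False, ...}
features = llvm.get_host_cpu_features()

# the flattened form is what create_target_machine takes,
# e.g. "+64bit,+avx,+avx2,...,-avx512f" (exact contents vary by host)
print(features.flatten())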

typed_python/compiler/native_compiler/llvm_execution_engine.py

Lines changed: 5 additions & 2 deletions
@@ -25,10 +25,13 @@
 llvm.initialize_native_target()
 llvm.initialize_native_asmprinter()  # yes, even this one
 
+features = llvm.get_host_cpu_features()
+
+
 target_triple = llvm.get_process_triple()
 target = llvm.Target.from_triple(target_triple)
-target_machine = target.create_target_machine()
-target_machine_shared_object = target.create_target_machine(reloc='pic', codemodel='default')
+target_machine = target.create_target_machine(features=features.flatten())
+target_machine_shared_object = target.create_target_machine(reloc='pic', codemodel='default', features=features.flatten())
 
 ctypes.CDLL(_types.__file__, mode=ctypes.RTLD_GLOBAL)
 
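
A sketch of the JIT side of this change (illustrative, not the commit's code: it assumes llvmlite and builds a throwaway MCJIT engine the way llvmlite's docs do), showing the feature-enabled target_machine being handed to the execution engine so JIT-compiled code can use the host's vector instructions:

import llvmlite.binding as llvm

llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()

target = llvm.Target.from_triple(llvm.get_process_triple())
target_machine = target.create_target_machine(
    features=llvm.get_host_cpu_features().flatten()
)

# an empty backing module is enough to construct the engine
engine = llvm.create_mcjit_compiler(llvm.parse_assembly(""), target_machine)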

typed_python/compiler/tests/compilable_builtin_test.py

Lines changed: 2 additions & 2 deletions
@@ -14,10 +14,10 @@
 
 class InlineLlvmFunc(CompilableBuiltin):
     def __eq__(self, other):
-        return isinstance(other, inlineLlvmFunc)
+        return isinstance(other, InlineLlvmFunc)
 
     def __hash__(self):
-        return hash("inlineLlvmFunc")
+        return hash("InlineLlvmFunc")
 
     def convert_call(self, context, instance, args, kwargs):
         return context.pushPod(
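
The two-line fix above is worth spelling out: the old code referenced inlineLlvmFunc (lowercase i), a name that doesn't exist, so comparing an InlineLlvmFunc instance to anything raised NameError instead of returning a bool. A standalone sketch of the corrected contract (plain Python; no typed_python imports needed to see the bug):

class InlineLlvmFunc:
    def __eq__(self, other):
        # was: isinstance(other, inlineLlvmFunc) -- a NameError at call time
        return isinstance(other, InlineLlvmFunc)

    def __hash__(self):
        # must agree with __eq__: all instances compare equal, so they share a hash
        return hash("InlineLlvmFunc")

assert InlineLlvmFunc() == InlineLlvmFunc()
assert hash(InlineLlvmFunc()) == hash(InlineLlvmFunc())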
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+import time
+
+from typed_python import Entrypoint, ListOf
+from typed_python.compiler.type_wrappers.compilable_builtin import CompilableBuiltin
+from typed_python.compiler.type_wrappers.runtime_functions import externalCallTarget, Float64, Void
+
+
+tp_llvm_vecMultAdd = externalCallTarget("tp_llvm_vecMultAdd", Void, Float64.pointer(), Float64.pointer(), Float64.pointer(), inlineLlvmDefinition="""
+define external void @"tp_llvm_vecMultAdd"(double* %p1, double* %p2, double* %p3) {
+entry:
+    %p1_vec_ptr = bitcast double* %p1 to <8 x double>*
+    %p2_vec_ptr = bitcast double* %p2 to <8 x double>*
+    %p3_vec_ptr = bitcast double* %p3 to <8 x double>*
+
+    ; note that we have to use 'align 1' here because we don't make any guarantees
+    ; about alignment in TP internals at all, mostly due to laziness. As a result, you
+    ; can get a segfault if your memory is not aligned to the native alignment of the
+    ; vector type - this is never a problem when loading a primitive like int64, but
+    ; the AVX instructions generated by these loads will assume 64-byte alignment if
+    ; you leave off the annotation, which is not always the case, and the resulting
+    ; aligned processor read will crash.
+    %p1_vec = load <8 x double>, <8 x double>* %p1_vec_ptr, align 1
+    %p2_vec = load <8 x double>, <8 x double>* %p2_vec_ptr, align 1
+    %p3_vec = fmul <8 x double> %p1_vec, %p2_vec
+
+    store <8 x double> %p3_vec, <8 x double>* %p3_vec_ptr, align 1
+
+    ret void
+}
+""")
+
+
+class TpLlvmVecMultAdd(CompilableBuiltin):
+    def __eq__(self, other):
+        return isinstance(other, TpLlvmVecMultAdd)
+
+    def __hash__(self):
+        return hash("TpLlvmVecMultAdd")
+
+    def convert_call(self, context, instance, args, kwargs):
+        context.pushEffect(
+            tp_llvm_vecMultAdd.call(
+                args[0],
+                args[1],
+                args[2]
+            )
+        )
+        return context.constant(None)
+
+
+@Entrypoint
+def fmultAdd1(l, p1, p2, p3):
+    i = 0
+
+    while i < l:
+        p3[i] = p1[i] * p2[i]
+        i += 1
+
+
+@Entrypoint
+def fmultAdd2(l, p1, p2, p3):
+    i = 0
+
+    while i + 8 < l:
+        TpLlvmVecMultAdd()(p1 + i, p2 + i, p3 + i)
+        i += 8
+
+    while i < l:
+        p3[i] = p1[i] * p2[i]
+        i += 1
+
+
+@Entrypoint
+def fmultAdd1Times(ct, l, p1, p2, p3):
+    for i in range(ct):
+        fmultAdd1(l, p1, p2, p3)
+
+
+@Entrypoint
+def fmultAdd2Times(ct, l, p1, p2, p3):
+    for i in range(ct):
+        fmultAdd2(l, p1, p2, p3)
+
+
+def test_inline_vectorization_working():
+    l1 = ListOf(float)()
+    l2 = ListOf(float)()
+    l3 = ListOf(float)()
+
+    N = 1024
+
+    l1.resize(N)
+    l2.resize(N)
+    l3.resize(N)
+
+    fmultAdd1Times(1, N, l1.pointerUnsafe(0), l2.pointerUnsafe(0), l3.pointerUnsafe(0))
+    fmultAdd2Times(1, N, l1.pointerUnsafe(0), l2.pointerUnsafe(0), l3.pointerUnsafe(0))
+
+    t0 = time.time()
+    fmultAdd1Times(1000000, N, l1.pointerUnsafe(0), l2.pointerUnsafe(0), l3.pointerUnsafe(0))
+    t1 = time.time()
+
+    t2 = time.time()
+    fmultAdd2Times(1000000, N, l1.pointerUnsafe(0), l2.pointerUnsafe(0), l3.pointerUnsafe(0))
+    t3 = time.time()
+
+    print(t1 - t0)
+    print(t3 - t2)
+
+    speedup = (t1 - t0) / (t3 - t2)
+
+    # I get about 4x because of AVX instructions. LLVM can't figure out how to do this
+    # directly for whatever reason, but the inlined primitive does it. I don't get the
+    # same speedup on the macOS workers on GitLab, so the threshold is set quite low.
+    assert speedup > 1.25
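
To see why the align 1 annotation matters, here is a sketch (assuming llvmlite, on a build whose LLVM still accepts the typed-pointer IR used above) that emits host assembly for a trimmed copy of the vector body. On an x86-64 host with AVX, the align 1 loads should lower to unaligned moves (vmovupd) rather than aligned ones (vmovapd), which would fault on unaligned pointers. Note also that <8 x double> is 512 bits wide, so on a 256-bit AVX machine LLVM legalizes each operation into two instructions - consistent with the roughly 4x (rather than 8x) speedup reported above.

import llvmlite.binding as llvm

# a trimmed, standalone copy of the IR body from the test above
IR = '''
define void @vmul8(double* %p1, double* %p2, double* %p3) {
entry:
    %a = bitcast double* %p1 to <8 x double>*
    %b = bitcast double* %p2 to <8 x double>*
    %c = bitcast double* %p3 to <8 x double>*
    %va = load <8 x double>, <8 x double>* %a, align 1
    %vb = load <8 x double>, <8 x double>* %b, align 1
    %vc = fmul <8 x double> %va, %vb
    store <8 x double> %vc, <8 x double>* %c, align 1
    ret void
}
'''

llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()

target = llvm.Target.from_triple(llvm.get_process_triple())
tm = target.create_target_machine(features=llvm.get_host_cpu_features().flatten())

mod = llvm.parse_assembly(IR)
mod.verify()

# expect vmovupd / vmulpd on an AVX host; vmovapd here would mean
# the alignment annotation was dropped
print(tm.emit_assembly(mod))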
