Redo how fastmath functions are implemented

stijnh · stijnh · commit 4d3d35642cd0 · 2024-05-17T11:03:32.000+02:00
Previously, each fast math function was implemented by having a
corresponding `ops::fast_X<T>` functor for each `ops::X<T>` functor.

This commit rewrites this logic to instead use a special
`apply_fastmath_impl<...>` helper that falls back to `apply_impl<...>`
if there is no fast math version available. This means the normal
apply logic is used if there is no fast math version, which was
previously not the case.

Additionally, this commit adds a global `KERNEL_FLOAT_FAST_MATH`
define that can be used to turn on fast math mode.
diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h
@@ -152,6 +152,9 @@ struct apply_recur_impl<1> {
         result[0] = fun(inputs[0]...);
     }
 };
+
+// Dispatch point for fast-math implementations. This primary template simply inherits from
+// `apply_impl`, so unless a specialization of `apply_fastmath_impl` exists for the given `F` and
+// argument types, `call` resolves to the regular `apply_impl<F, N, Output, Args...>::call`.
+template<typename F, size_t N, typename Output, typename... Args>
+struct apply_fastmath_impl: apply_impl<F, N, Output, Args...> {};
 }  // namespace detail
 
 template<typename F, typename... Args>
@@ -174,7 +177,34 @@ KERNEL_FLOAT_INLINE map_type<F, Args...> map(F fun, const Args&... args) {
     using E = broadcast_vector_extent_type<Args...>;
     vector_storage<Output, E::value> result;
 
-    detail::apply_impl<F, E::value, Output, vector_value_type<Args>...>::call(
+    // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled
+#if KERNEL_FLOAT_FAST_MATH
+    using apply_impl = detail::apply_fastmath_impl<F, E::value, Output, vector_value_type<Args>...>;
+#else
+    using apply_impl = detail::apply_impl<F, E::value, Output, vector_value_type<Args>...>;
+#endif
+
+    apply_impl::call(
+        fun,
+        result.data(),
+        (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
+             into_vector_storage(args))
+             .data())...);
+
+    return result;
+}
+
+/**
+ * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This
+ * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`.
+ */
+template<typename F, typename... Args>
+KERNEL_FLOAT_INLINE map_type<F, Args...> fast_map(F fun, const Args&... args) {
+    using Output = result_t<F, vector_value_type<Args>...>;
+    using E = broadcast_vector_extent_type<Args...>;
+    vector_storage<Output, E::value> result;
+
+    detail::apply_fastmath_impl<F, E::value, Output, vector_value_type<Args>...>::call(
         fun,
         result.data(),
         (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h
@@ -89,6 +89,20 @@ struct extent<N> {
     static constexpr size_t size = N;
 };
 
+namespace detail {
+// Trait indicating that elements of type `T` offer no more precision than `float`, so
+// operations on elements of type `T` can be performed by upcasting them to `float`.
+// The primary template defaults to `false`; types opt in by specializing it.
+template<typename T>
+struct allow_float_fallback {
+    static constexpr bool value = false;
+};
+
+// `float` itself trivially qualifies.
+template<>
+struct allow_float_fallback<float> {
+    static constexpr bool value = true;
+};
+}  // namespace detail
+
 template<typename T>
 struct into_vector_impl {
     using value_type = T;
diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
@@ -72,11 +72,7 @@ KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt)
 KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin)
 KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt)
 KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc)
-
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_exp, ::hexp, ::h2exp)
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_log, ::hlog, ::h2log)
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
+KERNEL_FLOAT_BF16_UNARY_FUN(rcp, ::hrcp, ::h2rcp)
 #endif
 
 #if KERNEL_FLOAT_CUDA_ARCH >= 800
@@ -114,8 +110,6 @@ KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div)
 KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2)
 KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2)
 
-KERNEL_FLOAT_BF16_BINARY_FUN(fast_div, __hdiv, __h2div)
-
 KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2)
 KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __hneu, __hneu2)
 KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2)
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
@@ -7,9 +7,7 @@
 namespace kernel_float {
 
+// Result type of `zip(F, L, R)`; defined as the result type of `map` with the two inputs `L`, `R`.
 template<typename F, typename L, typename R>
-using zip_type = vector<
-    result_t<F, vector_value_type<L>, vector_value_type<R>>,
-    broadcast_vector_extent_type<L, R>>;
+using zip_type = map_type<F, L, R>;
 
 /**
  * Combines the elements from the two inputs (`left` and `right`)  element-wise, applying a provided binary
@@ -25,20 +23,7 @@ using zip_type = vector<
  */
 template<typename F, typename L, typename R>
 KERNEL_FLOAT_INLINE zip_type<F, L, R> zip(F fun, const L& left, const R& right) {
-    using A = vector_value_type<L>;
-    using B = vector_value_type<R>;
-    using O = result_t<F, A, B>;
-    using E = broadcast_vector_extent_type<L, R>;
-    vector_storage<O, E::value> result;
-
-    detail::apply_impl<F, E::value, O, A, B>::call(
-        fun,
-        result.data(),
-        detail::broadcast_impl<A, vector_extent_type<L>, E>::call(into_vector_storage(left)).data(),
-        detail::broadcast_impl<B, vector_extent_type<R>, E>::call(into_vector_storage(right))
-            .data());
-
-    return result;
+    // `zip` is `map` with exactly two inputs, so delegate instead of duplicating the apply logic.
+    return ::kernel_float::map(fun, left, right);
 }
 
 template<typename F, typename L, typename R>
@@ -67,7 +52,14 @@ KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, co
 
     vector_storage<O, E::value> result;
 
-    detail::apply_impl<F, E::value, O, T, T>::call(
+// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled
+#if KERNEL_FLOAT_FAST_MATH
+    using apply_impl = detail::apply_fastmath_impl<F, E::value, O, T, T>;
+#else
+    using apply_impl = detail::apply_impl<F, E::value, O, T, T>;
+#endif
+
+    apply_impl::call(
         fun,
         result.data(),
         detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
@@ -277,36 +269,17 @@ KERNEL_FLOAT_DEFINE_BINARY(
 #if KERNEL_FLOAT_IS_DEVICE
 KERNEL_FLOAT_DEFINE_BINARY(
     rhypot,
-    (T(1) / ops::hypot<T>()(left, right)),
+    (ops::rcp<T>(ops::hypot<T>()(left, right))),
     ::rhypot(left, right),
     ::rhypotf(left, right))
 #else
 KERNEL_FLOAT_DEFINE_BINARY(
     rhypot,
-    (T(1) / ops::hypot<T>()(left, right)),
+    (ops::rcp<T>(ops::hypot<T>()(left, right))),
     (double(1) / ::hypot(left, right)),
     (float(1) / ::hypotf(left, right)))
 #endif
 
-#if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \
-    KERNEL_FLOAT_DEFINE_BINARY(                                       \
-        FUN_NAME,                                                     \
-        ops::OP_NAME<T> {}(left, right),                              \
-        ops::OP_NAME<double> {}(left, right),                         \
-        ops::OP_NAME<float> {}(left, right))
-#else
-#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \
-    KERNEL_FLOAT_DEFINE_BINARY(                                       \
-        FUN_NAME,                                                     \
-        ops::OP_NAME<T> {}(left, right),                              \
-        ops::OP_NAME<double> {}(left, right),                         \
-        ops::OP_NAME<float> {}(left, right))
-#endif
-
-KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_div, divide, __fdividef)
-KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_pow, pow, __powf)
-
 namespace ops {
 template<>
 struct add<bool> {
@@ -323,6 +296,52 @@ struct multiply<bool> {
 };
 };  // namespace ops
 
+namespace detail {
+// Fast-math division for a generic element type `T`: `lhs[i] / rhs[i]` is computed as
+// `lhs[i] * rcp(rhs[i])`. Both steps go through `apply_fastmath_impl`, so any fast-math
+// specializations for `ops::rcp<T>` and `ops::multiply<T>` are picked up.
+template<typename T, size_t N>
+struct apply_fastmath_impl<ops::divide<T>, N, T, T, T> {
+    KERNEL_FLOAT_INLINE static void
+    call(ops::divide<T> fun, T* result, const T* lhs, const T* rhs) {
+        T rhs_rcp[N];
+
+        // Fast way to perform division is to multiply by the reciprocal. `rcp` is unary, so its
+        // argument list is `<F, N, Output, Arg>` with a single input type.
+        apply_fastmath_impl<ops::rcp<T>, N, T, T>::call({}, rhs_rcp, rhs);
+        apply_fastmath_impl<ops::multiply<T>, N, T, T, T>::call({}, result, lhs, rhs_rcp);
+    }
+};
+
+#if KERNEL_FLOAT_IS_DEVICE
+// Device-only specialization for `float`: divides element by element using `__fdividef`.
+template<size_t N>
+struct apply_fastmath_impl<ops::divide<float>, N, float, float, float> {
+    KERNEL_FLOAT_INLINE static void
+    call(ops::divide<float> fun, float* result, const float* lhs, const float* rhs) {
+#pragma unroll
+        for (size_t i = 0; i < N; i++) {
+            result[i] = __fdividef(lhs[i], rhs[i]);
+        }
+    }
+};
+#endif
+}  // namespace detail
+
+/**
+ * Element-wise division of `left` by `right` that always goes through
+ * `detail::apply_fastmath_impl`, independent of the `KERNEL_FLOAT_FAST_MATH` setting.
+ *
+ * Both operands are converted to the element type `T` (by default the promoted element type of
+ * `L` and `R`) at the common extent `E` of the two inputs. The return type is derived from `L`
+ * and `R`, in the same way as `zip_common`, so that it carries that common extent.
+ */
+template<typename L, typename R, typename T = promoted_vector_value_type<L, R>>
+KERNEL_FLOAT_INLINE zip_common_type<ops::divide<T>, L, R>
+fast_divide(const L& left, const R& right) {
+    using E = broadcast_vector_extent_type<L, R>;
+    vector_storage<T, E::value> result;
+
+    detail::apply_fastmath_impl<ops::divide<T>, E::value, T, T, T>::call(
+        ops::divide<T> {},
+        result.data(),
+        detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
+            into_vector_storage(left))
+            .data(),
+        detail::convert_impl<vector_value_type<R>, vector_extent_type<R>, T, E>::call(
+            into_vector_storage(right))
+            .data());
+
+    return result;
+}
+
 namespace detail {
 template<typename T>
 struct cross_impl {
diff --git a/include/kernel_float/constant.h b/include/kernel_float/constant.h
@@ -30,7 +30,7 @@ struct constant {
     constexpr constant(T value = {}) : value_(value) {}
 
     KERNEL_FLOAT_INLINE
-    constexpr constant(const constant<T>& that) : value_(that.value) {}
+    constexpr constant(const constant<T>& that) : value_(that.value_) {}
 
     /**
      * Create a new constant from another constant of type `R`.
@@ -129,7 +129,9 @@ struct cast<constant<T>, R, m> {
     KERNEL_FLOAT_INLINE constant<T> operator OP(                                               \
         const constant<L>& left,                                                               \
         const constant<R>& right) {                                                            \
-        return constant<T>(left.get()) OP constant<T>(right.get());                            \
+        auto fl = ops::cast<L, T>();                                                           \
+        auto fr = ops::cast<R, T>();                                                           \
+        return fl(left.get()) OP fr(right.get());                                              \
     }
 
 KERNEL_FLOAT_CONSTANT_DEFINE_OP(+)
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
@@ -68,11 +68,7 @@ KERNEL_FLOAT_FP16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt)
 KERNEL_FLOAT_FP16_UNARY_FUN(sin, ::hsin, ::h2sin)
 KERNEL_FLOAT_FP16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt)
 KERNEL_FLOAT_FP16_UNARY_FUN(trunc, ::htrunc, ::h2trunc)
-
-KERNEL_FLOAT_FP16_UNARY_FUN(fast_exp, ::hexp, ::h2exp)
-KERNEL_FLOAT_FP16_UNARY_FUN(fast_log, ::hlog, ::h2log)
-KERNEL_FLOAT_FP16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
-KERNEL_FLOAT_FP16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
+KERNEL_FLOAT_FP16_UNARY_FUN(rcp, ::hrcp, ::h2rcp)
 
 #if KERNEL_FLOAT_IS_DEVICE
 #define KERNEL_FLOAT_FP16_BINARY_FUN(NAME, FUN1, FUN2)                              \
@@ -104,7 +100,6 @@ KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2)
 KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div)
 KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2)
 KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2)
-KERNEL_FLOAT_FP16_BINARY_FUN(fast_div, __hdiv, __h2div)
 
 KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2)
 KERNEL_FLOAT_FP16_BINARY_FUN(not_equal_to, __hneu, __hneu2)
diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h
@@ -63,4 +63,8 @@
 
 #define KERNEL_FLOAT_MAX_ALIGNMENT (32)
 
+// Global switch for fast-math mode. Define `KERNEL_FLOAT_FAST_MATH` to a non-zero value before
+// this header is included to enable it; if it is not defined, fast math defaults to disabled.
+#ifndef KERNEL_FLOAT_FAST_MATH
+#define KERNEL_FLOAT_FAST_MATH (0)
+#endif
+
 #endif  //KERNEL_FLOAT_MACROS_H
diff --git a/include/kernel_float/meta.h b/include/kernel_float/meta.h
@@ -270,6 +270,9 @@ struct enable_if_impl<true, T> {
 template<bool C, typename T = void>
 using enable_if_t = typename detail::enable_if_impl<C, T>::type;
 
+// Alias that always resolves to `T`; any additional template arguments are ignored.
+template<typename T, typename...>
+using identity_t = T;
+
 KERNEL_FLOAT_INLINE
 constexpr size_t round_up_to_power_of_two(size_t n) {
     size_t result = 1;
diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
diff --git a/tests/cast.cu b/tests/cast.cu