diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 50c1f55..826fb4f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -29,6 +29,8 @@ jobs:
       run: cargo build --verbose --features simd
     - name: Run tests
       run: cargo test --verbose --features simd
+    - name: Run tests use_wide
+      run: cargo test --verbose --features use_wide
 
   no_std:
 
diff --git a/Cargo.toml b/Cargo.toml
index 8712c81..87b73c6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,11 +15,14 @@ rust-version = "1.87"
 default = ["std"]
 simd = ["std"]
 std = []
+use_wide = ["wide", "bytemuck"]
 
 # DO NOT USE THIS IN PRODUCTION. Expose several internal functions for benchmark purposes.
 benchmark = []
 
 [dependencies]
+wide = { version = "1.0", optional = true, default-features = false }
+bytemuck = { version = "1.24", optional = true, default-features = false }
 
 [dev-dependencies]
 jpeg-decoder = { version = "0.3", default-features = false }
diff --git a/src/fdct.rs b/src/fdct.rs
index 7d0273e..dd693bf 100644
--- a/src/fdct.rs
+++ b/src/fdct.rs
@@ -72,37 +72,39 @@
  */
 
 const CONST_BITS: i32 = 13;
-const PASS1_BITS: i32 = 2;
-
-const FIX_0_298631336: i32 = 2446;
-const FIX_0_390180644: i32 = 3196;
-const FIX_0_541196100: i32 = 4433;
-const FIX_0_765366865: i32 = 6270;
-const FIX_0_899976223: i32 = 7373;
-const FIX_1_175875602: i32 = 9633;
-const FIX_1_501321110: i32 = 12299;
-const FIX_1_847759065: i32 = 15137;
-const FIX_1_961570560: i32 = 16069;
-const FIX_2_053119869: i32 = 16819;
-const FIX_2_562915447: i32 = 20995;
-const FIX_3_072711026: i32 = 25172;
-
-const DCT_SIZE: usize = 8;
-
-#[inline(always)]
-fn descale(x: i32, n: i32) -> i32 {
-    // right shift with rounding
-    (x + (1 << (n - 1))) >> n
-}
-
-#[inline(always)]
-fn into_el(v: i32) -> i16 {
-    v as i16
-}
 
+#[cfg(not(feature = "use_wide"))]
 #[allow(clippy::erasing_op)]
 #[allow(clippy::identity_op)]
 pub fn fdct(data: &mut [i16; 64]) {
+    const PASS1_BITS: i32 = 2;
+
+    const FIX_0_298631336: i32 = 2446;
+    const FIX_0_390180644: i32 = 3196;
+    const FIX_0_541196100: i32 = 4433;
+    const FIX_0_765366865: i32 = 6270;
+    const FIX_0_899976223: i32 = 7373;
+    const FIX_1_175875602: i32 = 9633;
+    const FIX_1_501321110: i32 = 12299;
+    const FIX_1_847759065: i32 = 15137;
+    const FIX_1_961570560: i32 = 16069;
+    const FIX_2_053119869: i32 = 16819;
+    const FIX_2_562915447: i32 = 20995;
+    const FIX_3_072711026: i32 = 25172;
+
+    const DCT_SIZE: usize = 8;
+
+    #[inline(always)]
+    fn descale(x: i32, n: i32) -> i32 {
+        // right shift with rounding
+        (x + (1 << (n - 1))) >> n
+    }
+
+    #[inline(always)]
+    fn into_el(v: i32) -> i16 {
+        v as i16
+    }
+
     /* Pass 1: process rows. */
     /* Note results are scaled up by sqrt(8) compared to a true DCT; */
     /* furthermore, we scale the results by 2**PASS1_BITS. */
@@ -134,14 +136,8 @@ pub fn fdct(data: &mut [i16; 64]) {
         data2[offset + 4] = (tmp10 - tmp11) << PASS1_BITS;
 
         let z1 = (tmp12 + tmp13) * FIX_0_541196100;
-        data2[offset + 2] = descale(
-            z1 + (tmp13 * FIX_0_765366865),
-            CONST_BITS - PASS1_BITS,
-        );
-        data2[offset + 6] = descale(
-            z1 + (tmp12 * -FIX_1_847759065),
-            CONST_BITS - PASS1_BITS,
-        );
+        data2[offset + 2] = descale(z1 + (tmp13 * FIX_0_765366865), CONST_BITS - PASS1_BITS);
+        data2[offset + 6] = descale(z1 + (tmp12 * -FIX_1_847759065), CONST_BITS - PASS1_BITS);
 
         /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
          * cK represents cos(K*pi/16).
@@ -239,12 +235,144 @@ pub fn fdct(data: &mut [i16; 64]) {
     }
 }
 
+/// This version uses the wide crate for SIMD operations, resulting in a 80% speedup on older SSE2-capable CPUs.
+///
+/// It also means that these operations can be done on ARM and WASM targets that support SIMD.
+///
+/// The pure AVX2 version is still another 10-20% faster if compiling for AVX2.
+#[cfg(feature = "use_wide")]
+pub fn fdct(data: &mut [i16; 64]) {
+    use bytemuck::cast;
+    use wide::{i16x8, i32x8};
+
+    const FIX_0_298631336: i16 = 2446;
+    const FIX_0_390180644: i16 = 3196;
+    const FIX_0_541196100: i16 = 4433;
+    const FIX_0_765366865: i16 = 6270;
+    const FIX_0_899976223: i16 = 7373;
+    const FIX_1_175875602: i16 = 9633;
+    const FIX_1_501321110: i16 = 12299;
+    const FIX_1_847759065: i16 = 15137;
+    const FIX_1_961570560: i16 = 16069;
+    const FIX_2_053119869: i16 = 16819;
+    const FIX_2_562915447: i16 = 20995;
+    const FIX_3_072711026: i16 = 25172;
+
+    #[inline(always)]
+    fn descale_w(x: i32x8, n: i32) -> i16x8 {
+        // right shift with rounding
+        i16x8::from_i32x8_saturate((x + (1 << (n - 1))) >> n)
+    }
+
+    #[inline(always)]
+    fn kernel_32<const SHIFT_BITS: i32>(data_w: [i16x8; 8]) -> [i16x8; 8] {
+        let tmp0 = data_w[0] + data_w[7];
+        let tmp7 = data_w[0] - data_w[7];
+        let tmp1 = data_w[1] + data_w[6];
+        let tmp6 = data_w[1] - data_w[6];
+        let tmp2 = data_w[2] + data_w[5];
+        let tmp5 = data_w[2] - data_w[5];
+        let tmp3 = data_w[3] + data_w[4];
+        let tmp4 = data_w[3] - data_w[4];
+
+        /* Even part per LL&M figure 1 --- note that published figure is faulty;
+         * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+         */
+
+        let tmp10 = tmp0 + tmp3;
+        let tmp13 = tmp0 - tmp3;
+        let tmp11 = tmp1 + tmp2;
+        let tmp12 = tmp1 - tmp2;
+
+        let data_0;
+        let data_4;
+
+        if SHIFT_BITS < 0 {
+            data_0 = tmp10 + tmp11 << -SHIFT_BITS;
+            data_4 = tmp10 - tmp11 << -SHIFT_BITS;
+        } else {
+            data_0 = descale_w(i32x8::from(tmp10 + tmp11), SHIFT_BITS);
+            data_4 = descale_w(i32x8::from(tmp10 - tmp11), SHIFT_BITS);
+        }
+
+        let z1 = (tmp12 + tmp13).mul_widen(i16x8::from(FIX_0_541196100));
+        let data_2 = descale_w(
+            z1 + (tmp13.mul_widen(i16x8::from(FIX_0_765366865))),
+            CONST_BITS + SHIFT_BITS,
+        );
+        let data_6 = descale_w(
+            z1 + (tmp12.mul_widen(i16x8::from(-FIX_1_847759065))),
+            CONST_BITS + SHIFT_BITS,
+        );
+
+        /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+         * cK represents cos(K*pi/16).
+         * i0..i3 in the paper are tmp4..tmp7 here.
+         */
+
+        let z1 = tmp4 + tmp7;
+        let z2 = tmp5 + tmp6;
+        let z3 = tmp4 + tmp6;
+        let z4 = tmp5 + tmp7;
+        let z5 = (z3 + z4).mul_widen(i16x8::from(FIX_1_175875602));
+        /* sqrt(2) * c3 */
+
+        let tmp4 = tmp4.mul_widen(i16x8::from(FIX_0_298631336));
+        /* sqrt(2) * (-c1+c3+c5-c7) */
+        let tmp5 = tmp5.mul_widen(i16x8::from(FIX_2_053119869));
+        /* sqrt(2) * ( c1+c3-c5+c7) */
+        let tmp6 = tmp6.mul_widen(i16x8::from(FIX_3_072711026));
+        /* sqrt(2) * ( c1+c3+c5-c7) */
+        let tmp7 = tmp7.mul_widen(i16x8::from(FIX_1_501321110));
+        /* sqrt(2) * ( c1+c3-c5-c7) */
+        let z1 = z1.mul_widen(i16x8::from(-FIX_0_899976223));
+        /* sqrt(2) * ( c7-c3) */
+        let z2 = z2.mul_widen(i16x8::from(-FIX_2_562915447));
+        /* sqrt(2) * (-c1-c3) */
+        let z3 = z3.mul_widen(i16x8::from(-FIX_1_961570560));
+        /* sqrt(2) * (-c3-c5) */
+        let z4 = z4.mul_widen(i16x8::from(-FIX_0_390180644));
+        /* sqrt(2) * ( c5-c3) */
+
+        let z3 = z3 + z5;
+        let z4 = z4 + z5;
+
+        let data_7 = descale_w(tmp4 + z1 + z3, CONST_BITS + SHIFT_BITS);
+        let data_5 = descale_w(tmp5 + z2 + z4, CONST_BITS + SHIFT_BITS);
+        let data_3 = descale_w(tmp6 + z2 + z3, CONST_BITS + SHIFT_BITS);
+        let data_1 = descale_w(tmp7 + z1 + z4, CONST_BITS + SHIFT_BITS);
+
+        [
+            data_0, data_1, data_2, data_3, data_4, data_5, data_6, data_7,
+        ]
+    }
+
+    /* Pass 1: process rows. */
+    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+    /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+    let data_w = i16x8::transpose(cast(*data));
+
+    let pass1 = kernel_32::<-2>(data_w);
+
+    let data2 = i16x8::transpose(pass1);
+
+    /* Pass 2: process columns.
+     * We remove the PASS1_BITS scaling, but leave the results scaled up
+     * by an overall factor of 8.
+     */
+
+    let pass2 = kernel_32::<2>(data2);
+
+    *data = cast(pass2);
+}
+
 #[cfg(test)]
 mod tests {
 
     // Inputs and outputs are taken from libjpegs jpeg_fdct_islow for a typical image
 
-    use super::fdct;
+    use super::*;
 
     const INPUT1: [i16; 64] = [
         -70, -71, -70, -68, -67, -67, -67, -67, -72, -73, -72, -70, -69, -69, -68, -69, -75, -76,