Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/avx2.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
mod fdct;
mod ycbcr;

use crate::encoder::Operations;
use crate::encoder::{AlignedBlock, Operations};
pub use fdct::fdct_avx2;
pub use ycbcr::*;

/// Zero-sized marker type whose `Operations` implementation routes `fdct`
/// to `fdct_avx2`.
pub(crate) struct AVX2Operations;

impl Operations for AVX2Operations {
#[inline(always)]
fn fdct(data: &mut [i16; 64]) {
fn fdct(data: &mut AlignedBlock) {
fdct_avx2(data);
}
}
10 changes: 7 additions & 3 deletions src/avx2/fdct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ use core::arch::x86_64::{
_mm256_unpacklo_epi16, _mm256_unpacklo_epi32,
};

use crate::encoder::AlignedBlock;

const CONST_BITS: i32 = 13;
const PASS1_BITS: i32 = 2;

Expand Down Expand Up @@ -57,14 +59,14 @@ const DESCALE_P1: i32 = CONST_BITS - PASS1_BITS;
const DESCALE_P2: i32 = CONST_BITS + PASS1_BITS;

#[inline(always)]
pub fn fdct_avx2(data: &mut [i16; 64]) {
pub fn fdct_avx2(data: &mut AlignedBlock) {
unsafe {
fdct_avx2_internal(data);
}
}

#[target_feature(enable = "avx2")]
fn fdct_avx2_internal(data: &mut [i16; 64]) {
fn fdct_avx2_internal(data: &mut AlignedBlock) {
#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline]
Expand Down Expand Up @@ -420,6 +422,8 @@ fn fdct_avx2_internal(data: &mut [i16; 64]) {
(t1, t2, t3, t4)
}

let data = &mut data.data;

let ymm4 = avx_load(&data[0..16]);
let ymm5 = avx_load(&data[16..32]);
let ymm6 = avx_load(&data[32..48]);
Expand Down Expand Up @@ -481,4 +485,4 @@ fn avx_store(input: __m256i, output: &mut [i16]) {
assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
// SAFETY: we've checked sizes above. The store is unaligned, so no alignment requirements.
unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) }
}
}
10 changes: 6 additions & 4 deletions src/avx2/ycbcr.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::{
__m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
__m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32,
_mm256_srli_epi32, _mm256_sub_epi32,
};

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
__m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
__m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set_epi32, _mm256_set1_epi32,
_mm256_srli_epi32, _mm256_sub_epi32,
};

use alloc::vec::Vec;

use crate::{rgb_to_ycbcr, ImageBuffer, JpegColorType};
use crate::{ImageBuffer, JpegColorType, rgb_to_ycbcr};

macro_rules! ycbcr_image_avx2 {
($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => {
Expand Down Expand Up @@ -229,7 +229,9 @@ mod tests {
for (i, pixel) in scalar_result.iter().copied().enumerate() {
let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]];
if pixel != avx_pixel {
panic!("Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}");
panic!(
"Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}"
);
}
}
}
Expand Down
66 changes: 39 additions & 27 deletions src/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::image_buffer::*;
use crate::marker::Marker;
use crate::quantization::{QuantizationTable, QuantizationTableType};
use crate::writer::{JfifWrite, JfifWriter, ZIGZAG};
use crate::{PixelDensity, EncodingError};
use crate::{EncodingError, PixelDensity};

use alloc::vec;
use alloc::vec::Vec;
Expand Down Expand Up @@ -34,6 +34,24 @@ pub enum JpegColorType {
Ycck,
}

/// A block of 64 `i16` values stored with 32-byte alignment.
///
/// `repr(C, align(32))` fixes the field layout and raises the alignment of the
/// whole block to 32 bytes; the payload is 128 bytes, so no padding is added.
///
/// NOTE(review): 32 bytes equals the width of a 256-bit AVX2 register, so the
/// alignment is presumably meant for the SIMD code paths — confirm.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[repr(C, align(32))]
pub(crate) struct AlignedBlock {
    /// The 64 values of the block.
    pub data: [i16; 64],
}

impl AlignedBlock {
    /// Wraps `data` in an aligned block; the values are stored unchanged.
    pub const fn new(data: [i16; 64]) -> Self {
        Self { data }
    }
}

impl Default for AlignedBlock {
    /// Returns a block with all 64 values set to zero.
    ///
    /// Written by hand because `Default` cannot be derived for a 64-element
    /// array field.
    fn default() -> Self {
        Self::new([0; 64])
    }
}

impl JpegColorType {
pub(crate) fn get_num_components(self) -> usize {
use JpegColorType::*;
Expand Down Expand Up @@ -353,11 +371,7 @@ impl<W: JfifWrite> Encoder<W> {
/// # Errors
///
/// Returns an error if the segment number is invalid or data exceeds the allowed size
pub fn add_app_segment(
&mut self,
segment_nr: u8,
data: Vec<u8>,
) -> Result<(), EncodingError> {
pub fn add_app_segment(&mut self, segment_nr: u8, data: Vec<u8>) -> Result<(), EncodingError> {
if segment_nr == 0 || segment_nr > 15 {
Err(EncodingError::InvalidAppSegment(segment_nr))
} else if data.len() > 65533 {
Expand Down Expand Up @@ -749,16 +763,14 @@ impl<W: JfifWrite> Encoder<W> {
&row[i],
block_x * 8 * max_h_sampling + (h_offset * 8),
v_offset * 8,
max_h_sampling
/ component.horizontal_sampling_factor as usize,
max_v_sampling
/ component.vertical_sampling_factor as usize,
max_h_sampling / component.horizontal_sampling_factor as usize,
max_v_sampling / component.vertical_sampling_factor as usize,
buffer_width,
);

OP::fdct(&mut block);

let mut q_block = [0i16; 64];
let mut q_block = AlignedBlock::default();

OP::quantize_block(
&block,
Expand All @@ -773,7 +785,7 @@ impl<W: JfifWrite> Encoder<W> {
&self.huffman_tables[component.ac_huffman_table as usize].1,
)?;

prev_dc[i] = q_block[0];
prev_dc[i] = q_block.data[0];
}
}
}
Expand Down Expand Up @@ -833,7 +845,7 @@ impl<W: JfifWrite> Encoder<W> {
&self.huffman_tables[component.ac_huffman_table as usize].1,
)?;

prev_dc = block[0];
prev_dc = block.data[0];

if restart_interval > 0 {
if restarts_to_go == 0 {
Expand Down Expand Up @@ -889,12 +901,12 @@ impl<W: JfifWrite> Encoder<W> {
}

self.writer.write_dc(
block[0],
block.data[0],
prev_dc,
&self.huffman_tables[component.dc_huffman_table as usize].0,
)?;

prev_dc = block[0];
prev_dc = block.data[0];

if restart_interval > 0 {
if restarts_to_go == 0 {
Expand Down Expand Up @@ -966,7 +978,7 @@ impl<W: JfifWrite> Encoder<W> {
&mut self,
image: &I,
q_tables: &[QuantizationTable; 2],
) -> [Vec<[i16; 64]>; 4] {
) -> [Vec<AlignedBlock>; 4] {
let width = image.width();
let height = image.height();

Expand Down Expand Up @@ -1028,7 +1040,7 @@ impl<W: JfifWrite> Encoder<W> {

OP::fdct(&mut block);

let mut q_block = [0i16; 64];
let mut q_block = AlignedBlock::default();

OP::quantize_block(
&block,
Expand All @@ -1043,7 +1055,7 @@ impl<W: JfifWrite> Encoder<W> {
blocks
}

fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec<[i16; 64]>; 4] {
fn init_block_buffers(&mut self, buffer_size: usize) -> [Vec<AlignedBlock>; 4] {
// To simplify the code and to give the compiler more information to optimize with, we always initialize 4 components.
// Resource overhead should be minimal because an empty Vec doesn't allocate

Expand Down Expand Up @@ -1071,7 +1083,7 @@ impl<W: JfifWrite> Encoder<W> {
}

// Create new huffman tables optimized for this image
fn optimize_huffman_table(&mut self, blocks: &[Vec<[i16; 64]>; 4]) {
fn optimize_huffman_table(&mut self, blocks: &[Vec<AlignedBlock>; 4]) {
// TODO: Find out if it's possible to reuse some code from the writer

let max_tables = self.components.len().min(2) as u8;
Expand All @@ -1094,7 +1106,7 @@ impl<W: JfifWrite> Encoder<W> {
debug_assert!(!blocks[i].is_empty());

for block in &blocks[i] {
let value = block[0];
let value = block.data[0];
let diff = value - prev_dc;
let num_bits = get_num_bits(diff);

Expand Down Expand Up @@ -1126,7 +1138,7 @@ impl<W: JfifWrite> Encoder<W> {
for block in &blocks[i] {
let mut zero_run = 0;

for &value in &block[start..end] {
for &value in &block.data[start..end] {
if value == 0 {
zero_run += 1;
} else {
Expand All @@ -1152,7 +1164,7 @@ impl<W: JfifWrite> Encoder<W> {
for block in &blocks[i] {
let mut zero_run = 0;

for &value in &block[1..] {
for &value in &block.data[1..] {
if value == 0 {
zero_run += 1;
} else {
Expand Down Expand Up @@ -1214,7 +1226,7 @@ fn get_block(
col_stride: usize,
row_stride: usize,
width: usize,
) -> [i16; 64] {
) -> AlignedBlock {
let mut block = [0i16; 64];

for y in 0..8 {
Expand All @@ -1226,7 +1238,7 @@ fn get_block(
}
}

block
AlignedBlock::new(block)
}

fn ceil_div(value: usize, div: usize) -> usize {
Expand All @@ -1250,15 +1262,15 @@ fn get_num_bits(mut value: i16) -> u8 {

pub(crate) trait Operations {
#[inline(always)]
fn fdct(data: &mut [i16; 64]) {
fn fdct(data: &mut AlignedBlock) {
fdct(data);
}

#[inline(always)]
fn quantize_block(block: &[i16; 64], q_block: &mut [i16; 64], table: &QuantizationTable) {
fn quantize_block(block: &AlignedBlock, q_block: &mut AlignedBlock, table: &QuantizationTable) {
for i in 0..64 {
let z = ZIGZAG[i] as usize & 0x3f;
q_block[i] = table.quantize(block[z], z);
q_block.data[i] = table.quantize(block.data[z], z);
}
}
}
Expand Down
26 changes: 13 additions & 13 deletions src/fdct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@
* scaled fixed-point arithmetic, with a minimal number of shifts.
*/

use crate::encoder::AlignedBlock;

const CONST_BITS: i32 = 13;
const PASS1_BITS: i32 = 2;

Expand Down Expand Up @@ -102,7 +104,9 @@ fn into_el(v: i32) -> i16 {

#[allow(clippy::erasing_op)]
#[allow(clippy::identity_op)]
pub fn fdct(data: &mut [i16; 64]) {
pub fn fdct(data: &mut AlignedBlock) {
let data = &mut data.data;

/* Pass 1: process rows. */
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
/* furthermore, we scale the results by 2**PASS1_BITS. */
Expand Down Expand Up @@ -134,14 +138,8 @@ pub fn fdct(data: &mut [i16; 64]) {
data2[offset + 4] = (tmp10 - tmp11) << PASS1_BITS;

let z1 = (tmp12 + tmp13) * FIX_0_541196100;
data2[offset + 2] = descale(
z1 + (tmp13 * FIX_0_765366865),
CONST_BITS - PASS1_BITS,
);
data2[offset + 6] = descale(
z1 + (tmp12 * -FIX_1_847759065),
CONST_BITS - PASS1_BITS,
);
data2[offset + 2] = descale(z1 + (tmp13 * FIX_0_765366865), CONST_BITS - PASS1_BITS);
data2[offset + 6] = descale(z1 + (tmp12 * -FIX_1_847759065), CONST_BITS - PASS1_BITS);

/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
* cK represents cos(K*pi/16).
Expand Down Expand Up @@ -244,6 +242,8 @@ mod tests {

// Inputs and outputs are taken from libjpeg's jpeg_fdct_islow for a typical image

use crate::encoder::AlignedBlock;

use super::fdct;

const INPUT1: [i16; 64] = [
Expand Down Expand Up @@ -275,12 +275,12 @@ mod tests {

#[test]
pub fn test_fdct_libjpeg() {
let mut i1 = INPUT1.clone();
let mut i1 = AlignedBlock::new(INPUT1.clone());
fdct(&mut i1);
assert_eq!(i1, OUTPUT1);
assert_eq!(i1.data, OUTPUT1);

let mut i2 = INPUT2.clone();
let mut i2 = AlignedBlock::new(INPUT2.clone());
fdct(&mut i2);
assert_eq!(i2, OUTPUT2);
assert_eq!(i2.data, OUTPUT2);
}
}
Loading