diff --git a/src/ops/f32.rs b/src/ops/f32.rs index 0546114..2c3bdee 100644 --- a/src/ops/f32.rs +++ b/src/ops/f32.rs @@ -363,14 +363,10 @@ impl_op! { _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128) -> __m128 { - let nums_arr = core::mem::transmute::<__m128, [f32; 4]>(a); - let ceil = [ - nums_arr[0].m_floor(), - nums_arr[1].m_floor(), - nums_arr[2].m_floor(), - nums_arr[3].m_floor(), - ]; - core::mem::transmute::<[f32; 4], __m128>(ceil) + let rounded = Self::round(a); + let mask = _mm_cmpgt_ps(rounded, a); + let one = _mm_and_ps(mask, _mm_set1_ps(1.0)); + _mm_sub_ps(rounded, one) } for Scalar(a: f32) -> f32 { a.m_floor() @@ -393,14 +389,10 @@ impl_op! { _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128) -> __m128 { - let nums_arr = core::mem::transmute::<__m128, [f32; 4]>(a); - let ceil = [ - nums_arr[0].m_ceil(), - nums_arr[1].m_ceil(), - nums_arr[2].m_ceil(), - nums_arr[3].m_ceil(), - ]; - core::mem::transmute::<[f32; 4], __m128>(ceil) + let rounded = Self::round(a); + let mask = _mm_cmplt_ps(rounded, a); + let one = _mm_and_ps(mask, _mm_set1_ps(1.0)); + _mm_add_ps(rounded, one) } for Scalar(a: f32) -> f32 { a.m_ceil() diff --git a/src/ops/f64.rs b/src/ops/f64.rs index afcb969..47483fb 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -345,12 +345,10 @@ impl_op! { _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128d) -> __m128d { - let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_floor(), - nums_arr[1].m_floor(), - ]; - core::mem::transmute::<[f64; 2], __m128d>(ceil) + let rounded = Self::round(a); + let mask = _mm_cmpgt_pd(rounded, a); + let one = _mm_and_pd(mask, _mm_set1_pd(1.0)); + _mm_sub_pd(rounded, one) } for Scalar(a: f64) -> f64 { a.m_floor() @@ -373,12 +371,10 @@ impl_op! { _mm_round_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128d) -> __m128d { - let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_ceil(), - nums_arr[1].m_ceil(), - ]; - core::mem::transmute::<[f64; 2], __m128d>(ceil) + let rounded = Self::round(a); + let mask = _mm_cmplt_pd(rounded, a); + let one = _mm_and_pd(mask, _mm_set1_pd(1.0)); + _mm_add_pd(rounded, one) } for Scalar(a: f64) -> f64 { a.m_ceil()