diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml
index 8af7691..401c299 100644
--- a/.github/workflows/ci_test.yml
+++ b/.github/workflows/ci_test.yml
@@ -10,11 +10,11 @@ jobs:
       - name: Checkout sources
         uses: actions/checkout@v2
 
-      - name: Install stable toolchain
+      - name: Install nightly toolchain
         uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: stable
+          toolchain: nightly
           override: true
 
       - name: Run cargo check
@@ -35,11 +35,11 @@ jobs:
       - name: Checkout sources
         uses: actions/checkout@v2
 
-      - name: Install stable toolchain
+      - name: Install nightly toolchain
         uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: stable
+          toolchain: nightly
           override: true
 
       - name: Run cargo test
@@ -54,11 +54,11 @@ jobs:
       - name: Checkout sources
         uses: actions/checkout@v2
 
-      - name: Install stable toolchain
+      - name: Install nightly toolchain
         uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: stable
+          toolchain: nightly
           override: true
           components: rustfmt, clippy
 
@@ -96,13 +96,6 @@ jobs:
           use-cross: true
           args: --target aarch64-unknown-linux-gnu
 
-      - name: Run cargo check with neon
-        uses: actions-rs/cargo@v1
-        with:
-          command: check
-          use-cross: true
-          args: --target aarch64-unknown-linux-gnu --features neon
-
       - name: Run cargo test for arm
         uses: actions-rs/cargo@v1
         with:
@@ -110,13 +103,6 @@ jobs:
           use-cross: true
           args: --target aarch64-unknown-linux-gnu
 
-      - name: Run cargo test for arm with neon
-        uses: actions-rs/cargo@v1
-        with:
-          command: test
-          use-cross: true
-          args: --target aarch64-unknown-linux-gnu --features neon
-
   check_test_wasm32:
     name: Check and test wasm32
     runs-on: ubuntu-latest
@@ -124,11 +110,11 @@ jobs:
       - name: Checkout sources
         uses: actions/checkout@v2
 
-      - name: Install stable toolchain
+      - name: Install nightly toolchain
         uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: stable
+          toolchain: nightly
           target: wasm32-unknown-unknown
           override: true
 
diff --git a/Cargo.toml b/Cargo.toml
index b0466f6..6d9e88d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,7 @@
 [package]
 name = "rubato"
-version = "0.11.0"
+version = "0.12.0"
+rust-version = "1.61"
 authors = ["HEnquist <henrik.enquist@gmail.com>"]
 description = "Asynchronous resampling library intended for audio data"
 license = "MIT"
@@ -12,11 +13,6 @@ edition = "2018"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
-[features]
-default = ["avx"]
-avx = []
-neon = []
-
 [dependencies]
 log = { version = "0.4.14", optional = true }
 realfft = "3.0.0"
diff --git a/README.md b/README.md
index c548dde..bf062e1 100644
--- a/README.md
+++ b/README.md
@@ -42,15 +42,13 @@ This type of resampler is considerably faster but doesn't support changing the r
 
 ##### Asynchronous resampling
 
-The asynchronous resampler is designed to benefit from auto-vectorization, meaning that the Rust compiler
-can recognize calculations that can be done in parallel. It will then use SIMD instructions for those.
-This works quite well, but there is still room for improvement.
-To address this, it also has optimized SIMD support.
-This gets enabled at runtime by checking the SIMD support of the CPU.
+The asynchronous resampler supports SIMD on x86_64 and on aarch64.
+The SIMD capabilities of the CPU are determined at runtime.
+If no supported SIMD instruction set is available, it falls back to a scalar implementation.
 
-On x86_64 it will try to use SSE3. The speed benefit compared to auto-vectorization
-depends on the CPU, but tends to be in the range 20-30% for 64-bit data, and 50-100% for 32-bit data.
-There is also optional support for AVX on x86_64, and Neon on aarch64 via Cargo features.
+On x86_64 it will try to use AVX. If AVX isn't available, it will instead try SSE3.
+
+On aarch64 (64-bit Arm) it will use Neon if available.
 
 ##### Synchronous resampling
 
@@ -58,20 +56,6 @@ The synchronous resamplers benefit from the SIMD support of the RustFFT library.
 
 #### Cargo features
 
-###### `avx`: AVX on x86_64
-
-The `avx` feature is enabled by default, and enables the use of AVX when it's available.
-The speed increase compared to SSE depends on the CPU, and tends to range from zero to 50%.
-On other architectures than x86_64 the `avx` feature does nothing.
-
-###### `neon`: Experimental Neon support on aarch64
-
-Experimental support for Neon is available for aarch64 (64-bit Arm) by enabling the `neon` feature.
-This requires the use of a nightly compiler, as the Neon support in Rust is still experimental.
-On a Raspberry Pi 4, this gives a boost of about 10% for 64-bit floats and 50% for 32-bit floats when
-compared to the auto-vectorized implementation.
-Note that this only works on a full 64-bit operating system.
-
 ###### `log`: Enable logging
 
 This feature enables logging via the `log` crate. This is intended for debugging purposes.
@@ -106,10 +90,12 @@ let waves_out = resampler.process(&waves_in, None).unwrap();
 
 ### Compatibility
 
-The `rubato` crate requires rustc version 1.40 or newer.
+The `rubato` crate requires rustc version 1.61 or newer.
 
 ### Changelog
 
+- v0.12.0
+  - Always enable all SIMD acceleration (and remove the SIMD Cargo features).
 - v0.11.0
   - New api to allow use in realtime applications.
   - Configurable adjust range of asynchronous resamplers.
diff --git a/benches/resamplers.rs b/benches/resamplers.rs
index 27623a3..1098803 100644
--- a/benches/resamplers.rs
+++ b/benches/resamplers.rs
@@ -1,11 +1,11 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 extern crate rubato;
 
 use rubato::ScalarInterpolator;
 
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 use rubato::interpolator_avx::AvxInterpolator;
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 use rubato::interpolator_neon::NeonInterpolator;
 #[cfg(target_arch = "x86_64")]
 use rubato::interpolator_sse::SseInterpolator;
@@ -14,19 +14,19 @@ use rubato::{FftFixedIn, InterpolationType, Resampler, SincFixedIn, WindowFuncti
 
 fn bench_fftfixedin(c: &mut Criterion) {
     let chunksize = 1024;
-    let mut resampler = FftFixedIn::<f64>::new(44100, 192000, 1024, 2, 1);
+    let mut resampler = FftFixedIn::<f64>::new(44100, 192000, 1024, 2, 1).unwrap();
     let waveform = vec![vec![0.0 as f64; chunksize]; 1];
     c.bench_function("FftFixedIn f64", |b| {
-        b.iter(|| resampler.process(&waveform).unwrap())
+        b.iter(|| resampler.process(black_box(&waveform), None).unwrap())
     });
 }
 
 fn bench_fftfixedin_32(c: &mut Criterion) {
     let chunksize = 1024;
-    let mut resampler = FftFixedIn::<f32>::new(44100, 192000, 1024, 2, 1);
+    let mut resampler = FftFixedIn::<f32>::new(44100, 192000, 1024, 2, 1).unwrap();
     let waveform = vec![vec![0.0 as f32; chunksize]; 1];
     c.bench_function("FftFixedIn f32", |b| {
-        b.iter(|| resampler.process(&waveform).unwrap())
+        b.iter(|| resampler.process(black_box(&waveform), None).unwrap())
     });
 }
 
@@ -61,13 +61,14 @@ macro_rules! bench_async_resampler {
             let interpolator = Box::new(interpolator);
             let mut resampler = SincFixedIn::<$ft>::new_with_interpolator(
                 resample_ratio,
+                1.1,
                 interpolation_type,
                 interpolator,
                 chunksize,
                 1,
-            );
+            ).unwrap();
             let waveform = vec![vec![0.0 as $ft; chunksize]; 1];
-            c.bench_function($desc, |b| b.iter(|| resampler.process(&waveform).unwrap()));
+            c.bench_function($desc, |b| b.iter(|| resampler.process(black_box(&waveform), None).unwrap()));
         }
     };
 }
@@ -170,7 +171,7 @@ bench_async_resampler!(
     "sse async nearest 64"
 );
 
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 bench_async_resampler!(
     f32,
     AvxInterpolator,
@@ -178,7 +179,7 @@ bench_async_resampler!(
     bench_avx_async_cubic_32,
     "avx async cubic   32"
 );
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 bench_async_resampler!(
     f32,
     AvxInterpolator,
@@ -186,7 +187,7 @@ bench_async_resampler!(
     bench_avx_async_linear_32,
     "avx async linear  32"
 );
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 bench_async_resampler!(
     f32,
     AvxInterpolator,
@@ -194,7 +195,7 @@ bench_async_resampler!(
     bench_avx_async_nearest_32,
     "avx async nearest 32"
 );
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 bench_async_resampler!(
     f64,
     AvxInterpolator,
@@ -202,7 +203,7 @@ bench_async_resampler!(
     bench_avx_async_cubic_64,
     "avx async cubic   64"
 );
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 bench_async_resampler!(
     f64,
     AvxInterpolator,
@@ -210,7 +211,7 @@ bench_async_resampler!(
     bench_avx_async_linear_64,
     "avx async linear  64"
 );
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 bench_async_resampler!(
     f64,
     AvxInterpolator,
@@ -219,7 +220,7 @@ bench_async_resampler!(
     "avx async nearest 64"
 );
 
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 bench_async_resampler!(
     f32,
     NeonInterpolator,
@@ -227,7 +228,7 @@ bench_async_resampler!(
     bench_neon_async_cubic_32,
     "neon async cubic   32"
 );
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 bench_async_resampler!(
     f32,
     NeonInterpolator,
@@ -235,7 +236,7 @@ bench_async_resampler!(
     bench_neon_async_linear_32,
     "neon async linear  32"
 );
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 bench_async_resampler!(
     f32,
     NeonInterpolator,
@@ -243,7 +244,7 @@ bench_async_resampler!(
     bench_neon_async_nearest_32,
     "neon async nearest 32"
 );
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 bench_async_resampler!(
     f64,
     NeonInterpolator,
@@ -251,7 +252,7 @@ bench_async_resampler!(
     bench_neon_async_cubic_64,
     "neon async cubic   64"
 );
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 bench_async_resampler!(
     f64,
     NeonInterpolator,
@@ -259,7 +260,7 @@ bench_async_resampler!(
     bench_neon_async_linear_64,
     "neon async linear  64"
 );
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 bench_async_resampler!(
     f64,
     NeonInterpolator,
@@ -268,26 +269,7 @@ bench_async_resampler!(
     "neon async nearest 64"
 );
 
-#[cfg(all(target_arch = "x86_64", not(feature = "avx")))]
-criterion_group!(
-    benches,
-    bench_fftfixedin,
-    bench_fftfixedin_32,
-    bench_scalar_async_cubic_32,
-    bench_scalar_async_linear_32,
-    bench_scalar_async_nearest_32,
-    bench_scalar_async_cubic_64,
-    bench_scalar_async_linear_64,
-    bench_scalar_async_nearest_64,
-    bench_sse_async_cubic_32,
-    bench_sse_async_linear_32,
-    bench_sse_async_nearest_32,
-    bench_sse_async_cubic_64,
-    bench_sse_async_linear_64,
-    bench_sse_async_nearest_64,
-);
-
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 criterion_group!(
     benches,
     bench_fftfixedin,
@@ -312,20 +294,7 @@ criterion_group!(
     bench_avx_async_nearest_64,
 );
 
-#[cfg(all(target_arch = "aarch64", not(feature = "neon")))]
-criterion_group!(
-    benches,
-    bench_fftfixedin,
-    bench_fftfixedin_32,
-    bench_scalar_async_cubic_32,
-    bench_scalar_async_linear_32,
-    bench_scalar_async_nearest_32,
-    bench_scalar_async_cubic_64,
-    bench_scalar_async_linear_64,
-    bench_scalar_async_nearest_64,
-);
-
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 criterion_group!(
     benches,
     bench_fftfixedin,
diff --git a/src/asynchro.rs b/src/asynchro.rs
index 100ef97..323cdd4 100644
--- a/src/asynchro.rs
+++ b/src/asynchro.rs
@@ -1,8 +1,8 @@
 use crate::error::{ResampleError, ResampleResult, ResamplerConstructionError};
 use crate::interpolation::*;
-#[cfg(all(target_arch = "x86_64", feature = "avx"))]
+#[cfg(target_arch = "x86_64")]
 use crate::interpolator_avx::AvxInterpolator;
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
+#[cfg(target_arch = "aarch64")]
 use crate::interpolator_neon::NeonInterpolator;
 #[cfg(target_arch = "x86_64")]
 use crate::interpolator_sse::SseInterpolator;
@@ -187,7 +187,7 @@ where
         f_cutoff * resample_ratio as f32
     };
 
-    #[cfg(all(target_arch = "x86_64", feature = "avx"))]
+    #[cfg(target_arch = "x86_64")]
     if let Ok(interpolator) =
         AvxInterpolator::<T>::new(sinc_len, oversampling_factor, f_cutoff, window)
     {
@@ -201,7 +201,7 @@ where
         return Box::new(interpolator);
     }
 
-    #[cfg(all(target_arch = "aarch64", feature = "neon"))]
+    #[cfg(target_arch = "aarch64")]
     if let Ok(interpolator) =
         NeonInterpolator::<T>::new(sinc_len, oversampling_factor, f_cutoff, window)
     {
diff --git a/src/error.rs b/src/error.rs
index 8968bbd..0343003 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -14,7 +14,7 @@ pub enum CpuFeature {
     #[cfg(target_arch = "x86_64")]
     Fma,
-    /// aarc64 neon cpu feature.
+    /// aarch64 neon cpu feature.
-    #[cfg(all(feature = "neon", target_arch = "aarch64"))]
+    #[cfg(target_arch = "aarch64")]
     Neon,
 }
 
@@ -34,7 +34,7 @@ impl CpuFeature {
             CpuFeature::Fma => {
                 is_x86_feature_detected!("fma")
             }
-            #[cfg(all(feature = "neon", target_arch = "aarch64"))]
+            #[cfg(target_arch = "aarch64")]
             CpuFeature::Neon => {
                 std::arch::is_aarch64_feature_detected!("neon")
             }
@@ -58,7 +58,7 @@ impl fmt::Display for CpuFeature {
             CpuFeature::Fma => {
                 write!(f, "fma")
             }
-            #[cfg(all(feature = "neon", target_arch = "aarch64"))]
+            #[cfg(target_arch = "aarch64")]
             CpuFeature::Neon => {
                 write!(f, "neon")
             }
diff --git a/src/lib.rs b/src/lib.rs
index 719d3cc..5fb9896 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -40,15 +40,13 @@
 //!
 //! #### Asynchronous resampling
 //!
-//! The asynchronous resampler is designed to benefit from auto-vectorization, meaning that the Rust compiler
-//! can recognize calculations that can be done in parallel. It will then use SIMD instructions for those.
-//! This works quite well, but there is still room for improvement.
-//! To address this, it also has optimized SIMD support.
-//! This gets enabled at runtime by checking the SIMD support of the CPU.
+//! The asynchronous resampler supports SIMD on x86_64 and on aarch64.
+//! The SIMD capabilities of the CPU are determined at runtime.
+//! If no supported SIMD instruction set is available, it falls back to a scalar implementation.
 //!
-//! On x86_64 it will try to use SSE3. The speed benefit compared to auto-vectorization
-//! depends on the CPU, but tends to be in the range 20-30% for 64-bit data, and 50-100% for 32-bit data.
-//! There is also optional support for AVX on x86_64, and Neon on aarch64 via Cargo features.
+//! On x86_64 it will try to use AVX. If AVX isn't available, it will instead try SSE3.
+//!
+//! On aarch64 (64-bit Arm) it will use Neon if available.
 //!
 //! #### Synchronous resampling
 //!
@@ -56,20 +54,6 @@
 //!
 //! ### Cargo features
 //!
-//! ##### `avx`: AVX on x86_64
-//!
-//! The `avx` feature is enabled by default, and enables the use of AVX when it's available.
-//! The speed increase compared to SSE depends on the CPU, and tends to range from zero to 50%.
-//! On other architectures than x86_64 the `avx` feature does nothing.
-//!
-//! ##### `neon`: Experimental Neon support on aarch64
-//!
-//! Experimental support for Neon is available for aarch64 (64-bit Arm) by enabling the `neon` feature.
-//! This requires the use of a nightly compiler, as the Neon support in Rust is still experimental.
-//! On a Raspberry Pi 4, this gives a boost of about 10% for 64-bit floats and 50% for 32-bit floats when
-//! compared to the auto-vectorized implementation.
-//! Note that this only works on a full 64-bit operating system.
-//!
 //! ##### `log`: Enable logging
 //!
 //! This feature enables logging via the `log` crate. This is intended for debugging purposes.
@@ -104,10 +88,12 @@
 //!
 //! ## Compatibility
 //!
-//! The `rubato` crate requires rustc version 1.40 or newer.
+//! The `rubato` crate requires rustc version 1.61 or newer.
 //!
 //! ## Changelog
 //!
+//! - v0.12.0
+//!   - Always enable all SIMD acceleration (and remove the SIMD Cargo features).
 //! - v0.11.0
 //!   - New api to allow use in realtime applications.
 //!   - Configurable adjust range of asynchronous resamplers.
@@ -118,9 +104,6 @@
 //! - v0.9.0
 //!   - Accept any AsRef<[T]> as input.
 
-#![cfg_attr(feature = "neon", feature(aarch64_target_feature))]
-#![cfg_attr(feature = "neon", feature(stdsimd))]
-
 #[cfg(feature = "log")]
 extern crate log;
 
@@ -199,7 +182,7 @@ macro_rules! interpolator {
 }
 
 interpolator! {
-    #[cfg(all(target_arch = "x86_64", feature = "avx"))]
+    #[cfg(target_arch = "x86_64")]
     mod interpolator_avx;
     trait AvxSample;
 }
@@ -211,7 +194,7 @@ interpolator! {
 }
 
 interpolator! {
-    #[cfg(all(target_arch = "aarch64", feature = "neon"))]
+    #[cfg(target_arch = "aarch64")]
     mod interpolator_neon;
     trait NeonSample;
 }