diff --git a/esphome/components/audio/audio.cpp b/esphome/components/audio/audio.cpp
new file mode 100644
index 0000000000..3eaed7954a
--- /dev/null
+++ b/esphome/components/audio/audio.cpp
@@ -0,0 +1,48 @@
+#include "audio.h"
+
+namespace esphome {
+namespace audio {
+
+/* *************** AudioListener **************** */
+
+// Starts a stream session. Returns the new streamer, or nullptr when one is already attached or starting() fails.
+AudioStreamer *AudioListener::start(const AudioStreamInfo &audio_stream_info) {
+  if (this->current_streamer_ != nullptr || !this->starting(audio_stream_info)) {
+    return nullptr;
+  }
+  // The listener keeps the pointer so that stop() can delete the streamer later.
+  this->current_streamer_ = new AudioStreamer();
+  this->current_streamer_->set_parent(this);
+  return this->current_streamer_;
+}
+
+AudioStreamer *AudioListener::start() {
+  AudioStreamInfo info;  // struct defaults; the device may overwrite them in the call below
+  this->get_default_audio_stream_info(info);
+  return this->start(info);  // hand the streamer (or nullptr) back to the caller
+}
+
+bool AudioListener::can_stream(AudioStreamer *streamer) {  // true only for the attached streamer while running
+  return this->is_running() && (streamer == this->current_streamer_);
+}
+
+/* *************** AudioStreamer **************** */
+
+AudioStreamer::~AudioStreamer() {
+  if (this->parent_ == nullptr || this->parent_->current_streamer_ != this)
+    return;  // not attached (any more): nothing to release
+  this->parent_->current_streamer_ = nullptr;  // detach before asking the device to stop
+  this->parent_->stopping();
+  this->parent_ = nullptr;
+}
+
+bool AudioStreamer::is_running() { return this->parent_ != nullptr && this->parent_->can_stream(this); }
+
+size_t AudioStreamer::stream(const uint8_t *data, const size_t size, TickType_t ticks_to_wait) {
+  if (!this->is_running())  // detached, or the listener is not in the running state
+    return 0;               // nothing was accepted
+  return this->parent_->streaming(data, size, ticks_to_wait);
+}
+
+}  // namespace audio
+}  // namespace esphome
diff --git a/esphome/components/audio/audio.h b/esphome/components/audio/audio.h
index b0968dc8da..eaa1eeb4af 100644
--- a/esphome/components/audio/audio.h
+++ b/esphome/components/audio/audio.h
@@ -2,10 +2,22 @@
 
 #include <cstdint>
 #include <stddef.h>
+#include "esphome/core/helpers.h"
 
 namespace esphome {
 namespace audio {
 
+#ifndef USE_ESP32
+using TickType_t = size_t;  // NOTE(review): presumably a stand-in for the FreeRTOS tick type on non-ESP32 builds — confirm
+#endif
+
+enum State : uint8_t {  // lifecycle state kept in AudioListener::state_
+  STATE_STOPPED = 0,  // initial value of state_; AudioListener::is_stopped() tests for it
+  STATE_STARTING,
+  STATE_RUNNING,  // AudioListener::is_running() tests for it
+  STATE_STOPPING,
+};
+
 struct AudioStreamInfo {
   bool operator==(const AudioStreamInfo &rhs) const {
     return (channels == rhs.channels) && (bits_per_sample == rhs.bits_per_sample) && (sample_rate == rhs.sample_rate);
@@ -17,5 +29,55 @@ struct AudioStreamInfo {
   uint32_t sample_rate = 16000;
 };
 
+class AudioListener;
+
+class AudioStreamer : public Parented<AudioListener> {
+ public:
+  virtual ~AudioStreamer();
+
+  /// @brief Hands ``size`` bytes starting at ``data`` to the parent listener's ``streaming()`` implementation.
+  /// @param size The length of the audio data in bytes.
+  /// @param ticks_to_wait Passed through unchanged to ``streaming()``.
+  /// @return The result of ``streaming()``, or 0 when ``is_running()`` is false.
+  size_t stream(const uint8_t *data, const size_t size, TickType_t ticks_to_wait = 0);
+  bool is_running();  ///< True while a parent is set and its ``can_stream()`` accepts this streamer.
+};
+
+class AudioListener {
+ public:
+  /// @brief Start the audio device and attach a new AudioStreamer to it.
+  /// The overload without arguments uses the stream info filled in by ``get_default_audio_stream_info``.
+  /// @param audio_stream_info Stream format that is passed on to ``starting()``.
+  /// @return The AudioStreamer object to be used to stream to or from the device, or nullptr when a
+  ///         streamer is already attached or ``starting()`` fails.
+  AudioStreamer *start(const AudioStreamInfo &audio_stream_info);
+  AudioStreamer *start();
+
+  /// @brief Deletes the attached streamer, if any; its destructor detaches it and calls ``stopping()``.
+  void stop() {
+    delete this->current_streamer_;  // deleting nullptr is a no-op
+    this->current_streamer_ = nullptr;
+  }
+
+  virtual bool can_stream(AudioStreamer *streamer);
+
+  bool is_running() const { return this->state_ == audio::STATE_RUNNING; }
+  bool is_stopped() const { return this->state_ == audio::STATE_STOPPED; }
+
+  void set_audio_stream_info(const AudioStreamInfo &audio_stream_info) { this->audio_stream_info_ = audio_stream_info; }
+  virtual void get_default_audio_stream_info(AudioStreamInfo &audio_stream_info) {}
+
+ protected:
+  friend class AudioStreamer;  // it calls streaming()/stopping() and clears current_streamer_
+  virtual bool starting(const AudioStreamInfo &audio_stream_info) = 0;
+  virtual size_t streaming(const uint8_t *data, size_t size, TickType_t ticks_to_wait) = 0;
+  virtual void stopping() {}
+
+  AudioStreamer *current_streamer_{nullptr};
+  audio::AudioStreamInfo audio_stream_info_;
+  State state_{STATE_STOPPED};
+  bool finish_before_stop_{false};
+};
+
 }  // namespace audio
 }  // namespace esphome
diff --git a/esphome/components/i2s_audio/i2s_audio.cpp b/esphome/components/i2s_audio/i2s_audio.cpp
index 507e43cf52..ad73b383fe 100644
--- a/esphome/components/i2s_audio/i2s_audio.cpp
+++ b/esphome/components/i2s_audio/i2s_audio.cpp
@@ -28,23 +28,6 @@ void I2SAudioComponent::setup() {
   ESP_LOGCONFIG(TAG, "Setting up I2S Audio...");
 }
 
-bool I2SAudioComponent::lock_component(I2SAudioBase *audio) {
-  if (!this->is_compoment_locked(audio)) {
-    this->audio_base_ = audio;
-    return true;
-  }
-  return false;
-}
-void I2SAudioComponent::unlock_component(I2SAudioBase *audio) {
-  if (!this->is_compoment_locked(audio)) {
-    this->audio_base_ = nullptr;
-  }
-}
-
-bool I2SAudioComponent::is_compoment_locked(I2SAudioBase *audio) {
-  return !(this->audio_base_ == nullptr || this->audio_base_ == audio);
-}
-
 }  // namespace i2s_audio
 }  // namespace esphome
 
diff --git a/esphome/components/i2s_audio/i2s_audio.h b/esphome/components/i2s_audio/i2s_audio.h
index e39cade356..a6e379b2cf 100644
--- a/esphome/components/i2s_audio/i2s_audio.h
+++ b/esphome/components/i2s_audio/i2s_audio.h
@@ -55,17 +55,11 @@ class I2SAudioComponent : public Component {
   bool try_lock() { return this->lock_.try_lock(); }
   void unlock() { this->lock_.unlock(); }
 
-  virtual bool lock_component(I2SAudioBase *audio);
-  virtual void unlock_component(I2SAudioBase *audio);
-  virtual bool is_compoment_locked(I2SAudioBase *audio);
-
   i2s_port_t get_port() const { return this->port_; }
 
  protected:
   Mutex lock_;
 
-  I2SAudioBase *audio_base_{nullptr};
-
   int mclk_pin_{I2S_PIN_NO_CHANGE};
   int bclk_pin_{I2S_PIN_NO_CHANGE};
   int lrclk_pin_;
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
index f1ec97150c..a61bd382d2 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -26,7 +26,6 @@ static const char *const TAG = "i2s_audio.speaker";
 enum SpeakerEventGroupBits : uint32_t {
   COMMAND_START = (1 << 0),                           // Starts the main task purpose
   COMMAND_STOP = (1 << 1),                            // stops the main task
-  COMMAND_STOP_GRACEFULLY = (1 << 2),                 // Stops the task once all data has been written
   MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE = (1 << 5),  // Locks the ring buffer when not set
   STATE_STARTING = (1 << 10),
   STATE_RUNNING = (1 << 11),
@@ -107,29 +106,26 @@ void I2SAudioSpeaker::loop() {
 
   if (event_group_bits & SpeakerEventGroupBits::STATE_STARTING) {
     ESP_LOGD(TAG, "Starting Speaker");
-    this->state_ = speaker::STATE_STARTING;
+    this->state_ = audio::STATE_STARTING;
     xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STARTING);
   }
   if (event_group_bits & SpeakerEventGroupBits::STATE_RUNNING) {
     ESP_LOGD(TAG, "Started Speaker");
-    this->state_ = speaker::STATE_RUNNING;
+    this->state_ = audio::STATE_RUNNING;
     xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
     this->status_clear_warning();
     this->status_clear_error();
   }
   if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPING) {
     ESP_LOGD(TAG, "Stopping Speaker");
-    this->state_ = speaker::STATE_STOPPING;
+    this->state_ = audio::STATE_STOPPING;
     xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
   }
   if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPED) {
-    if (!this->task_created_) {
-      ESP_LOGD(TAG, "Stopped Speaker");
-      this->state_ = speaker::STATE_STOPPED;
-      xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
-      this->speaker_task_handle_ = nullptr;
-    }
-    this->parent_->unlock_component(this);
+    this->stop();
+    ESP_LOGD(TAG, "Speaker Stopped.");
+    this->state_ = audio::STATE_STOPPED;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
   }
 
   if (event_group_bits & SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START) {
@@ -191,20 +187,115 @@ void I2SAudioSpeaker::set_mute_state(bool mute_state) {
   }
 }
 
-size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
+bool I2SAudioSpeaker::has_buffered_data() const {
+  if (this->audio_ring_buffer_ == nullptr) {
+    return false;  // no ring buffer allocated, so nothing can be pending
+  }
+  return this->audio_ring_buffer_->available() > 0;
+}
+
+// FreeRTOS task body: waits for a start/stop command, then copies audio from the ring buffer to the I2S
+// port until COMMAND_STOP is seen or no data has been read for longer than timeout_ milliseconds.
+void I2SAudioSpeaker::speaker_task(void *params) {
+  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
+  uint32_t event_group_bits = xEventGroupWaitBits(
+      this_speaker->event_group_,
+      SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP,  // Bit message to read
+      pdTRUE,                                                                      // Clear the bits on exit
+      pdFALSE,                                                                     // Don't wait for all the bits,
+      portMAX_DELAY);  // Block indefinitely until a bit is set
+
+  if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
+    // Received a stop signal before the task was requested to start
+    this_speaker->delete_task_();
+  }
+  // Report the running state through the event group; loop() copies it into state_.
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
+  uint32_t last_data_received_time = millis();
+  size_t dma_buffers_size = this_speaker->get_dma_buffers_size();
+  // The timeout is measured from the last successful ring buffer read, not from the task start.
+  while ((millis() - last_data_received_time) <= this_speaker->timeout_) {
+    event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
+    if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
+      break;
+    }
+
+    size_t bytes_to_read = dma_buffers_size;
+    size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, bytes_to_read,
+                                                               pdMS_TO_TICKS(TASK_DELAY_MS));
+
+    if (bytes_read > 0) {
+      last_data_received_time = millis();
+      size_t bytes_written = 0;
+
+      if ((this_speaker->audio_stream_info_.bits_per_sample == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
+        // Scale samples by the volume factor in place
+        q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
+                           bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
+      }
+
+      if (this_speaker->audio_stream_info_.bits_per_sample == (uint8_t) this_speaker->bits_per_sample_) {
+        i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read, &bytes_written,
+                  portMAX_DELAY);
+      } else if (this_speaker->audio_stream_info_.bits_per_sample < (uint8_t) this_speaker->bits_per_sample_) {
+        i2s_write_expand(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read,
+                         this_speaker->audio_stream_info_.bits_per_sample, this_speaker->bits_per_sample_,
+                         &bytes_written, portMAX_DELAY);
+      }
+
+      if (bytes_written != bytes_read) {  // also true when neither branch above wrote anything
+        xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
+      }
+    }
+  }
+  this_speaker->delete_task_();
+}
+
+// Prepares the I2S port for `audio_stream_info`, allocates the buffers and creates speaker_task.
+// Returns true only if every step succeeded.
+bool I2SAudioSpeaker::starting(const audio::AudioStreamInfo &audio_stream_info) {
+  if (this->is_failed() || this->status_has_error())
+    return false;
+  if ((this->state_ != audio::STATE_STOPPED) || (this->speaker_task_handle_ != nullptr))
+    return false;  // not fully stopped yet, or a previous task handle is still set
+
+  this->audio_stream_info_ = audio_stream_info;
+  if (this->send_esp_err_to_event_group_(this->reconfigure_i2s_stream_info_(this->audio_stream_info_))) {
+    return false;
+  }
+
+  if (this->allocate_buffers_()) {
+    // Failed to allocate buffers
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
+    this->delete_task_();
+    return false;
+  }
+
+  if (this->send_esp_err_to_event_group_(this->start_i2s_driver_())) {
+    // Failed to start I2S driver
+    this->delete_task_();
+    return false;
+  }
+  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
+
+  xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
+              &this->speaker_task_handle_);
+
+  if (this->speaker_task_handle_ == nullptr) {
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
+    this->delete_task_();
+    return false;  // without a task nothing would drain the ring buffer, so do not report success
+  }
+
+  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE);
+  return true;
+}
+
+size_t I2SAudioSpeaker::streaming(const uint8_t *data, size_t size, TickType_t ticks_to_wait) {
   if (this->is_failed()) {
     ESP_LOGE(TAG, "Cannot play audio, speaker failed to setup");
     return 0;
   }
-  // prevent adding new data until the speaker has stopped.
-  if (!this->parent_->lock_component(this)) {
-    ESP_LOGE(TAG, "Cannot play new audio, it being used by an other audio component.");
-    return 0;
-  }
-
-  if (this->state_ != speaker::STATE_RUNNING && this->state_ != speaker::STATE_STARTING) {
-    this->start();
-  }
 
   // Wait for the ring buffer to be available
   uint32_t event_bits =
@@ -216,7 +307,7 @@ size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t tick
 
     // Lock the ring buffer, write to it, then unlock it
     xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE);
-    size_t bytes_written = this->audio_ring_buffer_->write_without_replacement((void *) data, length, ticks_to_wait);
+    size_t bytes_written = this->audio_ring_buffer_->write_without_replacement((void *) data, size, ticks_to_wait);
     xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE);
 
     return bytes_written;
@@ -225,163 +316,12 @@ size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t tick
   return 0;
 }
 
-bool I2SAudioSpeaker::has_buffered_data() const {
-  if (this->audio_ring_buffer_ != nullptr) {
-    return this->audio_ring_buffer_->available() > 0;
-  }
-  return false;
-}
-
-void I2SAudioSpeaker::speaker_task(void *params) {
-  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
-  uint32_t event_group_bits =
-      xEventGroupWaitBits(this_speaker->event_group_,
-                          SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP |
-                              SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY,  // Bit message to read
-                          pdTRUE,                                              // Clear the bits on exit
-                          pdFALSE,                                             // Don't wait for all the bits,
-                          portMAX_DELAY);                                      // Block indefinitely until a bit is set
-
-  if (event_group_bits & (SpeakerEventGroupBits::COMMAND_STOP | SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY)) {
-    // Received a stop signal before the task was requested to start
-    this_speaker->delete_task_(0);
-  }
-
-  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STARTING);
-
-  audio::AudioStreamInfo audio_stream_info = this_speaker->audio_stream_info_;
-  const ssize_t bytes_per_sample = audio_stream_info.get_bytes_per_sample();
-  const uint8_t number_of_channels = audio_stream_info.channels;
-
-  const size_t dma_buffers_size = FRAMES_IN_ALL_DMA_BUFFERS * bytes_per_sample * number_of_channels;
-
-  if (this_speaker->send_esp_err_to_event_group_(
-          this_speaker->allocate_buffers_(dma_buffers_size, RING_BUFFER_SAMPLES * bytes_per_sample))) {
-    // Failed to allocate buffers
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
-    this_speaker->delete_task_(dma_buffers_size);
-  }
-
-  if (this_speaker->send_esp_err_to_event_group_(this_speaker->start_i2s_driver_())) {
-    // Failed to start I2S driver
-    this_speaker->delete_task_(dma_buffers_size);
-  } else {
-    // Ring buffer is allocated, so indicate its can be written to
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE);
-  }
-
-  if (!this_speaker->send_esp_err_to_event_group_(this_speaker->reconfigure_i2s_stream_info_(audio_stream_info))) {
-    // Successfully set the I2S stream info, ready to write audio data to the I2S port
-
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
-
-    bool stop_gracefully = false;
-    uint32_t last_data_received_time = millis();
-
-    while ((millis() - last_data_received_time) <= this_speaker->timeout_) {
-      event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
-
-      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
-        break;
-      }
-      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
-        stop_gracefully = true;
-      }
-
-      size_t bytes_to_read = dma_buffers_size;
-      size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, bytes_to_read,
-                                                                 pdMS_TO_TICKS(TASK_DELAY_MS));
-
-      if (bytes_read > 0) {
-        last_data_received_time = millis();
-        size_t bytes_written = 0;
-
-        if ((audio_stream_info.bits_per_sample == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
-          // Scale samples by the volume factor in place
-          q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
-                             bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
-        }
-
-        if (audio_stream_info.bits_per_sample == (uint8_t) this_speaker->bits_per_sample_) {
-          i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read, &bytes_written,
-                    portMAX_DELAY);
-        } else if (audio_stream_info.bits_per_sample < (uint8_t) this_speaker->bits_per_sample_) {
-          i2s_write_expand(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read,
-                           audio_stream_info.bits_per_sample, this_speaker->bits_per_sample_, &bytes_written,
-                           portMAX_DELAY);
-        }
-
-        if (bytes_written != bytes_read) {
-          xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
-        }
-
-      } else {
-        // No data received
-
-        if (stop_gracefully) {
-          break;
-        }
-
-        i2s_zero_dma_buffer(this_speaker->parent_->get_port());
-      }
-    }
-  } else {
-    // Couldn't configure the I2S port to be compatible with the incoming audio
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_INVALID_FORMAT);
-  }
-  i2s_zero_dma_buffer(this_speaker->parent_->get_port());
-
-  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
-
-  i2s_stop(this_speaker->parent_->get_port());
-  i2s_driver_uninstall(this_speaker->parent_->get_port());
-
-  this_speaker->parent_->unlock();
-  this_speaker->delete_task_(dma_buffers_size);
-}
-
-void I2SAudioSpeaker::start() {
-  if (this->is_failed() || this->status_has_error())
-    return;
-  if ((this->state_ == speaker::STATE_STARTING) || (this->state_ == speaker::STATE_RUNNING))
-    return;
-
-  // prevent adding new data until the speaker has stopped.
-  if (!this->parent_->lock_component(this)) {
-    ESP_LOGE(TAG, "Cannot play audio, it being used by an other audio component.");
-    return;
-  }
-
-  if (this->speaker_task_handle_ == nullptr) {
-    xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
-                &this->speaker_task_handle_);
-  }
-
-  if (this->speaker_task_handle_ != nullptr) {
-    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
-    this->task_created_ = true;
-  } else {
-    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
-  }
-}
-
-void I2SAudioSpeaker::stop() { this->stop_(false); }
-
-void I2SAudioSpeaker::finish() { this->stop_(true); }
-
-void I2SAudioSpeaker::stop_(bool wait_on_empty) {
+void I2SAudioSpeaker::stopping() {  // requests a stop; does nothing when failed or already stopped
   if (this->is_failed())
     return;
-  if (this->state_ == speaker::STATE_STOPPED)
+  if (this->state_ == audio::STATE_STOPPED)
     return;
-  if (this->parent_->is_compoment_locked(this))
-    return;
-
-  if (wait_on_empty) {
-    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY);
-  } else {
-    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
-  }
+  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);  // polled by speaker_task
 }
 
 bool I2SAudioSpeaker::send_esp_err_to_event_group_(esp_err_t err) {
@@ -406,27 +346,38 @@ bool I2SAudioSpeaker::send_esp_err_to_event_group_(esp_err_t err) {
   }
 }
 
-esp_err_t I2SAudioSpeaker::allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size) {
-  if (this->data_buffer_ == nullptr) {
-    // Allocate data buffer for temporarily storing audio from the ring buffer before writing to the I2S bus
-    ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
-    this->data_buffer_ = allocator.allocate(data_buffer_size);
+size_t I2SAudioSpeaker::get_dma_buffers_size() {
+  // Bytes for FRAMES_IN_ALL_DMA_BUFFERS frames, counting one sample per channel in each frame.
+  const ssize_t sample_bytes = this->audio_stream_info_.get_bytes_per_sample();
+  const uint8_t channel_count = this->audio_stream_info_.channels;
+  return FRAMES_IN_ALL_DMA_BUFFERS * sample_bytes * channel_count;
+}
+
+size_t I2SAudioSpeaker::get_ring_buffer_size() {
+  const ssize_t sample_bytes = this->audio_stream_info_.get_bytes_per_sample();
+  return RING_BUFFER_SAMPLES * sample_bytes;  // sized per sample; the channel count is not factored in
+}
+
+// Allocates data_buffer_ and audio_ring_buffer_. Returns true on FAILURE (error reported), false on success.
+bool I2SAudioSpeaker::allocate_buffers_() {
+  size_t data_buffer_size = this->get_dma_buffers_size();
+  size_t ring_buffer_size = this->get_ring_buffer_size();
+  if ((this->data_buffer_ != nullptr) || (this->audio_ring_buffer_ != nullptr)) {
+    return this->send_esp_err_to_event_group_(ESP_ERR_INVALID_STATE);
   }
 
-  if (this->data_buffer_ == nullptr) {
-    return ESP_ERR_NO_MEM;
+  // Allocate data buffer for temporarily storing audio from the ring buffer before writing to the I2S bus
+  ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
+  this->data_buffer_ = allocator.allocate(data_buffer_size);
+
+  // Allocate ring buffer
+  this->audio_ring_buffer_ = RingBuffer::create(ring_buffer_size);
+
+  if ((this->data_buffer_ == nullptr) || (this->audio_ring_buffer_ == nullptr)) {
+    return this->send_esp_err_to_event_group_(ESP_ERR_NO_MEM);
   }
 
-  if (this->audio_ring_buffer_ == nullptr) {
-    // Allocate ring buffer
-    this->audio_ring_buffer_ = RingBuffer::create(ring_buffer_size);
-  }
-
-  if (this->audio_ring_buffer_ == nullptr) {
-    return ESP_ERR_NO_MEM;
-  }
-
-  return ESP_OK;
+  return false;  // success: starting() treats a true result as "allocation failed"
 }
 
 esp_err_t I2SAudioSpeaker::start_i2s_driver_() {
@@ -487,6 +438,8 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_() {
     // Failed to set the data out pin, so uninstall the driver and unlock the I2S port
     i2s_driver_uninstall(this->parent_->get_port());
     this->parent_->unlock();
+  } else {
+    this->stream_created_ = true;
   }
 
   return err;
@@ -516,7 +469,15 @@ esp_err_t I2SAudioSpeaker::reconfigure_i2s_stream_info_(audio::AudioStreamInfo &
   return ESP_ERR_INVALID_ARG;
 }
 
-void I2SAudioSpeaker::delete_task_(size_t buffer_size) {
+void I2SAudioSpeaker::delete_task_() {
+  if (this->stream_created_) {
+    i2s_zero_dma_buffer(this->parent_->get_port());
+
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
+
+    i2s_stop(this->parent_->get_port());
+    i2s_driver_uninstall(this->parent_->get_port());
+  }
   if (this->audio_ring_buffer_ != nullptr) {
     xEventGroupWaitBits(this->event_group_,
                         MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE,  // Bit message to read
@@ -530,14 +491,16 @@ void I2SAudioSpeaker::delete_task_(size_t buffer_size) {
 
   if (this->data_buffer_ != nullptr) {
     ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
-    allocator.deallocate(this->data_buffer_, buffer_size);
+    allocator.deallocate(this->data_buffer_, this->get_dma_buffers_size());
     this->data_buffer_ = nullptr;
   }
 
-  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPED);
-
-  this->task_created_ = false;
-  vTaskDelete(nullptr);
+  this->parent_->unlock();
+  if (this->speaker_task_handle_ != nullptr) {
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPED);
+    vTaskDelete(this->speaker_task_handle_);
+    this->speaker_task_handle_ = nullptr;
+  }
 }
 
 }  // namespace i2s_audio
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
index 3c512d4d4d..46a845d812 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
@@ -34,19 +34,6 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
 #endif
   void set_i2s_comm_fmt(i2s_comm_format_t mode) { this->i2s_comm_fmt_ = mode; }
 
-  void start() override;
-  void stop() override;
-  void finish() override;
-
-  /// @brief Plays the provided audio data.
-  /// Starts the speaker task, if necessary. Writes the audio data to the ring buffer.
-  /// @param data Audio data in the format set by the parent speaker classes ``set_audio_stream_info`` method.
-  /// @param length The length of the audio data in bytes.
-  /// @param ticks_to_wait The FreeRTOS ticks to wait before writing as much data as possible to the ring buffer.
-  /// @return The number of bytes that were actually written to the ring buffer.
-  size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override;
-  size_t play(const uint8_t *data, size_t length) override { return play(data, length, 0); }
-
   bool has_buffered_data() const override;
 
   /// @brief Sets the volume of the speaker. Uses the speaker's configured audio dac component. If unavailble, it is
@@ -61,7 +48,26 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
   /// @param mute_state true for muting, false for unmuting
   void set_mute_state(bool mute_state) override;
 
+  void get_default_audio_stream_info(audio::AudioStreamInfo &audio_stream_info) override;
+
+  size_t get_dma_buffers_size();
+  size_t get_ring_buffer_size();
+
  protected:
+  /// @brief Tries to start the speaker for the given stream format.
+  /// @return True when everything was set up correctly.
+  bool starting(const audio::AudioStreamInfo &audio_stream_info) override;
+  /// @brief Sends a stop command to the speaker task via event_group_.
+  void stopping() override;
+
+  /// @brief Plays the provided audio data.
+  /// Writes the audio data to the ring buffer. Does not start the speaker task; that is done by ``starting()``.
+  /// @param data Audio data in the format set by the parent speaker class's ``set_audio_stream_info`` method.
+  /// @param size The length of the audio data in bytes.
+  /// @param ticks_to_wait The FreeRTOS ticks to wait before writing as much data as possible to the ring buffer.
+  /// @return The number of bytes that were actually written to the ring buffer.
+  size_t streaming(const uint8_t *data, size_t size, TickType_t ticks_to_wait) override;
+
   /// @brief Function for the FreeRTOS task handling audio output.
   /// After receiving the COMMAND_START signal, allocates space for the buffers, starts the I2S driver, and reads
   /// audio from the ring buffer and writes audio to the I2S port. Stops immmiately after receiving the COMMAND_STOP
@@ -72,10 +78,6 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
   /// @param params I2SAudioSpeaker component
   static void speaker_task(void *params);
 
-  /// @brief Sends a stop command to the speaker task via event_group_.
-  /// @param wait_on_empty If false, sends the COMMAND_STOP signal. If true, sends the COMMAND_STOP_GRACEFULLY signal.
-  void stop_(bool wait_on_empty);
-
   /// @brief Sets the corresponding ERR_ESP event group bits.
   /// @param err esp_err_t error code.
   /// @return True if an ERR_ESP bit is set and false if err == ESP_OK
@@ -86,7 +88,7 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
   /// @param ring_buffer_size Number of bytes to allocate for the ring buffer.
   /// @return ESP_ERR_NO_MEM if either buffer fails to allocate
   ///         ESP_OK if successful
-  esp_err_t allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size);
+  bool allocate_buffers_();
 
   /// @brief Starts the ESP32 I2S driver.
   /// Attempts to lock the I2S port, starts the I2S driver, and sets the data out pin. If it fails, it will unlock
@@ -112,7 +114,7 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
   /// Deallocates the data_buffer_ and audio_ring_buffer_, if necessary, and deletes the task. Should only be called by
   /// the speaker_task itself.
   /// @param buffer_size The allocated size of the data_buffer_.
-  void delete_task_(size_t buffer_size);
+  void delete_task_();
 
   TaskHandle_t speaker_task_handle_{nullptr};
   EventGroupHandle_t event_group_{nullptr};
@@ -123,7 +125,7 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
   uint32_t timeout_;
   uint8_t dout_pin_;
 
-  bool task_created_{false};
+  bool stream_created_{false};
 
   int16_t q15_volume_factor_{INT16_MAX};
 
diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp
index b58c7ec434..9a3ccc3a09 100644
--- a/esphome/components/micro_wake_word/micro_wake_word.cpp
+++ b/esphome/components/micro_wake_word/micro_wake_word.cpp
@@ -191,6 +191,8 @@ void MicroWakeWord::stop() {
 }
 
 void MicroWakeWord::set_state_(State state) {
+  if (this->state_ == state)
+    return;
   ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(micro_wake_word_state_to_string(this->state_)),
            LOG_STR_ARG(micro_wake_word_state_to_string(state)));
   this->state_ = state;
diff --git a/esphome/components/rtttl/rtttl.cpp b/esphome/components/rtttl/rtttl.cpp
index db4cc731e4..7612d656ae 100644
--- a/esphome/components/rtttl/rtttl.cpp
+++ b/esphome/components/rtttl/rtttl.cpp
@@ -16,8 +16,6 @@ static const uint16_t NOTES[] = {0,    262,  277,  294,  311,  330,  349,  370,
                                  1109, 1175, 1245, 1319, 1397, 1480, 1568, 1661, 1760, 1865, 1976, 2093, 2217,
                                  2349, 2489, 2637, 2794, 2960, 3136, 3322, 3520, 3729, 3951};
 
-static const uint16_t I2S_SPEED = 1000;
-
 #undef HALF_PI
 static const double HALF_PI = 1.5707963267948966192313216916398;
 
@@ -145,23 +143,18 @@ void Rtttl::loop() {
 
 #ifdef USE_SPEAKER
   if (this->speaker_ != nullptr) {
-    if (this->state_ == State::STATE_STOPPING) {
+    if (this->state_ == State::STATE_INIT) {
       if (this->speaker_->is_stopped()) {
-        this->set_state_(State::STATE_STOPPED);
+        this->streamer_ = this->speaker_->start();
+        if (this->streamer_ != nullptr)
+          this->set_state_(State::STATE_STARTING);
       }
-    } else if (this->state_ == State::STATE_INIT) {
-      if (this->speaker_->is_stopped()) {
-        this->speaker_->start();
-        this->set_state_(State::STATE_STARTING);
-      }
-    } else if (this->state_ == State::STATE_STARTING) {
-      if (this->speaker_->is_running()) {
-        this->set_state_(State::STATE_RUNNING);
-      }
-    }
-    if (!this->speaker_->is_running()) {
       return;
     }
+    if (!this->streamer_->is_running()) {
+      return;
+    }
+    this->set_state_(State::STATE_RUNNING);
     if (this->samples_sent_ != this->samples_count_) {
       SpeakerSample sample[SAMPLE_BUFFER_SIZE + 2];
       int x = 0;
@@ -190,7 +183,7 @@ void Rtttl::loop() {
         x++;
       }
       if (x > 0) {
-        int send = this->speaker_->play((uint8_t *) (&sample), x * 2);
+        int send = this->streamer_->stream((uint8_t *) (&sample), x * 2);
         if (send != x * 4) {
           this->samples_sent_ -= (x - (send / 2));
         }
@@ -318,9 +311,9 @@ void Rtttl::loop() {
     this->samples_sent_ = 0;
     this->samples_gap_ = 0;
     this->samples_per_wave_ = 0;
-    this->samples_count_ = (this->sample_rate_ * this->note_duration_) / 1600;  //(ms);
+    this->samples_count_ = (this->sample_rate_ * this->note_duration_) / 1000;  //(ms);
     if (need_note_gap) {
-      this->samples_gap_ = (this->sample_rate_ * DOUBLE_NOTE_GAP_MS) / 1600;  //(ms);
+      this->samples_gap_ = (this->sample_rate_ * DOUBLE_NOTE_GAP_MS) / 1000;  //(ms);
     }
     if (this->output_freq_ != 0) {
       // make sure there is enough samples to add a full last sinus.
@@ -346,22 +339,15 @@ void Rtttl::finish_() {
 #ifdef USE_OUTPUT
   if (this->output_ != nullptr) {
     this->output_->set_level(0.0);
-    this->set_state_(State::STATE_STOPPED);
   }
 #endif
 #ifdef USE_SPEAKER
-  if (this->speaker_ != nullptr) {
-    SpeakerSample sample[2];
-    sample[0].left = 0;
-    sample[0].right = 0;
-    sample[1].left = 0;
-    sample[1].right = 0;
-    this->speaker_->play((uint8_t *) (&sample), 8);
-
-    this->speaker_->finish();
-    this->set_state_(State::STATE_STOPPING);
+  if (this->speaker_ != nullptr && this->streamer_ != nullptr) {
+    delete this->streamer_;
+    this->streamer_ = nullptr;
   }
 #endif
+  this->set_state_(State::STATE_STOPPED);
   this->note_duration_ = 0;
   this->on_finished_playback_callback_.call();
   ESP_LOGD(TAG, "Playback finished");
diff --git a/esphome/components/rtttl/rtttl.h b/esphome/components/rtttl/rtttl.h
index 10c290c5fb..0a779f8077 100644
--- a/esphome/components/rtttl/rtttl.h
+++ b/esphome/components/rtttl/rtttl.h
@@ -8,6 +8,7 @@
 #endif
 
 #ifdef USE_SPEAKER
+#include "esphome/components/audio/audio.h"
 #include "esphome/components/speaker/speaker.h"
 #endif
 
@@ -87,6 +88,7 @@ class Rtttl : public Component {
 
 #ifdef USE_SPEAKER
   speaker::Speaker *speaker_{nullptr};
+  audio::AudioStreamer *streamer_{nullptr};
   int sample_rate_{16000};
   int samples_per_wave_{0};
   int samples_sent_{0};
diff --git a/esphome/components/speaker/__init__.py b/esphome/components/speaker/__init__.py
index 7a668dc2f3..7e28f96fe8 100644
--- a/esphome/components/speaker/__init__.py
+++ b/esphome/components/speaker/__init__.py
@@ -95,9 +95,6 @@ async def speaker_play_action(config, action_id, template_arg, args):
 automation.register_action("speaker.stop", StopAction, SPEAKER_AUTOMATION_SCHEMA)(
     speaker_action
 )
-automation.register_action("speaker.finish", FinishAction, SPEAKER_AUTOMATION_SCHEMA)(
-    speaker_action
-)
 
 automation.register_condition(
     "speaker.is_playing", IsPlayingCondition, SPEAKER_AUTOMATION_SCHEMA
diff --git a/esphome/components/speaker/automation.h b/esphome/components/speaker/automation.h
index c083796eea..c62b61b892 100644
--- a/esphome/components/speaker/automation.h
+++ b/esphome/components/speaker/automation.h
@@ -20,12 +20,14 @@ template<typename... Ts> class PlayAction : public Action<Ts...>, public Parente
   }
 
   void play(Ts... x) override {
-    if (this->static_) {
-      this->parent_->play(this->data_static_);
-    } else {
-      auto val = this->data_func_(x...);
-      this->parent_->play(val);
+    // Payload to stream once: data_static_ when static_, otherwise the result of evaluating data_func_.
+    auto val = this->static_ ? this->data_static_ : this->data_func_(x...);
+    auto *streamer = this->parent_->start();
+    if (streamer == nullptr) {
+      return;  // no stream could be opened, e.g. another streamer is still attached
     }
+    streamer->stream(val);
+    delete streamer;
   }
 
  protected:
@@ -64,11 +66,6 @@ template<typename... Ts> class StopAction : public Action<Ts...>, public Parente
   void play(Ts... x) override { this->parent_->stop(); }
 };
 
-template<typename... Ts> class FinishAction : public Action<Ts...>, public Parented<Speaker> {
- public:
-  void play(Ts... x) override { this->parent_->finish(); }
-};
-
 template<typename... Ts> class IsPlayingCondition : public Condition<Ts...>, public Parented<Speaker> {
  public:
   bool check(Ts... x) override { return this->parent_->is_running(); }
diff --git a/esphome/components/speaker/speaker.h b/esphome/components/speaker/speaker.h
index 96843e2d5a..2083a4314b 100644
--- a/esphome/components/speaker/speaker.h
+++ b/esphome/components/speaker/speaker.h
@@ -18,49 +18,13 @@
 namespace esphome {
 namespace speaker {
 
-enum State : uint8_t {
-  STATE_STOPPED = 0,
-  STATE_STARTING,
-  STATE_RUNNING,
-  STATE_STOPPING,
-};
+using AudioStreamer = audio::AudioStreamer;
+using State = audio::State;
 
-class Speaker {
+class Speaker : public audio::AudioListener {
  public:
-#ifdef USE_ESP32
-  /// @brief Plays the provided audio data.
-  /// If the speaker component doesn't implement this method, it falls back to the play method without this parameter.
-  /// @param data Audio data in the format specified by ``set_audio_stream_info`` method.
-  /// @param length The length of the audio data in bytes.
-  /// @param ticks_to_wait The FreeRTOS ticks to wait before writing as much data as possible to the ring buffer.
-  /// @return The number of bytes that were actually written to the speaker's internal buffer.
-  virtual size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
-    return this->play(data, length);
-  };
-#endif
-
-  /// @brief Plays the provided audio data.
-  /// If the audio stream is not the default defined in "esphome/core/audio.h" and the speaker component implements it,
-  /// then this should be called after calling ``set_audio_stream_info``.
-  /// @param data Audio data in the format specified by ``set_audio_stream_info`` method.
-  /// @param length The length of the audio data in bytes.
-  /// @return The number of bytes that were actually written to the speaker's internal buffer.
-  virtual size_t play(const uint8_t *data, size_t length) = 0;
-
-  size_t play(const std::vector<uint8_t> &data) { return this->play(data.data(), data.size()); }
-
-  virtual void start() = 0;
-  virtual void stop() = 0;
-  // In compare between *STOP()* and *FINISH()*; *FINISH()* will stop after emptying the play buffer,
-  // while *STOP()* will break directly.
-  // When finish() is not implemented on the platform component it should just do a normal stop.
-  virtual void finish() { this->stop(); }
-
   virtual bool has_buffered_data() const = 0;
 
-  bool is_running() const { return this->state_ == STATE_RUNNING; }
-  bool is_stopped() const { return this->state_ == STATE_STOPPED; }
-
   // Volume control is handled by a configured audio dac component. Individual speaker components can
   // override and implement in software if an audio dac isn't available.
   virtual void set_volume(float volume) {
@@ -91,13 +55,7 @@ class Speaker {
   void set_audio_dac(audio_dac::AudioDac *audio_dac) { this->audio_dac_ = audio_dac; }
 #endif
 
-  void set_audio_stream_info(const audio::AudioStreamInfo &audio_stream_info) {
-    this->audio_stream_info_ = audio_stream_info;
-  }
-
  protected:
-  State state_{STATE_STOPPED};
-  audio::AudioStreamInfo audio_stream_info_;
   float volume_{1.0f};
   bool mute_state_{false};