diff --git a/CODEOWNERS b/CODEOWNERS
index d6104c9345..7ac6aa2f76 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -48,6 +48,7 @@ esphome/components/at581x/* @X-Ryl669
 esphome/components/atc_mithermometer/* @ahpohl
 esphome/components/atm90e26/* @danieltwagner
 esphome/components/atm90e32/* @circuitsetup @descipher
+esphome/components/audio/* @kahrendt
 esphome/components/audio_dac/* @kbx81
 esphome/components/b_parasite/* @rbaron
 esphome/components/ballu/* @bazuchan
diff --git a/esphome/components/audio/__init__.py b/esphome/components/audio/__init__.py
new file mode 100644
index 0000000000..4ffdc401dc
--- /dev/null
+++ b/esphome/components/audio/__init__.py
@@ -0,0 +1,9 @@
+import esphome.codegen as cg
+import esphome.config_validation as cv
+
+CODEOWNERS = ["@kahrendt"]
+audio_ns = cg.esphome_ns.namespace("audio")
+
+CONFIG_SCHEMA = cv.All(
+    cv.Schema({}),
+)
diff --git a/esphome/components/audio/audio.h b/esphome/components/audio/audio.h
new file mode 100644
index 0000000000..b0968dc8da
--- /dev/null
+++ b/esphome/components/audio/audio.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cstdint>
+#include <stddef.h>
+
+namespace esphome {
+namespace audio {
+
+struct AudioStreamInfo {  // Describes the format of an audio stream
+  bool operator==(const AudioStreamInfo &rhs) const {
+    return (channels == rhs.channels) && (bits_per_sample == rhs.bits_per_sample) && (sample_rate == rhs.sample_rate);
+  }
+  bool operator!=(const AudioStreamInfo &rhs) const { return !operator==(rhs); }
+  size_t get_bytes_per_sample() const { return bits_per_sample / 8; }  // Integer division, so a remainder is dropped
+  uint8_t channels = 1;          // Number of channels in the stream
+  uint8_t bits_per_sample = 16;  // Bits in one sample of a single channel
+  uint32_t sample_rate = 16000;  // presumably in Hz - TODO confirm
+};
+
+}  // namespace audio
+}  // namespace esphome
diff --git a/esphome/components/i2s_audio/speaker/__init__.py b/esphome/components/i2s_audio/speaker/__init__.py
index bba886b39b..9fdaced64c 100644
--- a/esphome/components/i2s_audio/speaker/__init__.py
+++ b/esphome/components/i2s_audio/speaker/__init__.py
@@ -16,6 +16,7 @@ from .. import (
     register_i2s_audio_component,
 )
 
+AUTO_LOAD = ["audio"]
 CODEOWNERS = ["@jesserockz"]
 DEPENDENCIES = ["i2s_audio"]
 
@@ -72,7 +73,7 @@ BASE_SCHEMA = (
     .extend(
         {
             cv.Optional(
-                CONF_TIMEOUT, default="100ms"
+                CONF_TIMEOUT, default="500ms"
             ): cv.positive_time_period_milliseconds,
         }
     )
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
index 97c1d86c36..4fc489d0a3 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -4,6 +4,8 @@
 
 #include <driver/i2s.h>
 
+#include "esphome/components/audio/audio.h"
+
 #include "esphome/core/application.h"
 #include "esphome/core/hal.h"
 #include "esphome/core/log.h"
@@ -11,186 +13,296 @@
 namespace esphome {
 namespace i2s_audio {
 
-static const size_t BUFFER_COUNT = 20;
+static const size_t DMA_BUFFER_SIZE = 512;
+static const size_t DMA_BUFFERS_COUNT = 4;
+static const size_t FRAMES_IN_ALL_DMA_BUFFERS = DMA_BUFFER_SIZE * DMA_BUFFERS_COUNT;
+static const size_t RING_BUFFER_SAMPLES = 8192;
+static const size_t TASK_DELAY_MS = 10;
+static const size_t TASK_STACK_SIZE = 4096;
+static const ssize_t TASK_PRIORITY = 23;
 
 static const char *const TAG = "i2s_audio.speaker";
 
+enum SpeakerEventGroupBits : uint32_t {
+  COMMAND_START = (1 << 0),                           // Tells the waiting task to start its work
+  COMMAND_STOP = (1 << 1),                            // Stops the main task immediately
+  COMMAND_STOP_GRACEFULLY = (1 << 2),                 // Stops the task once all data has been written
+  MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE = (1 << 5),  // Locks the ring buffer when not set
+  STATE_STARTING = (1 << 10),
+  STATE_RUNNING = (1 << 11),
+  STATE_STOPPING = (1 << 12),
+  STATE_STOPPED = (1 << 13),
+  ERR_TASK_FAILED_TO_START = (1 << 15),
+  ERR_ESP_INVALID_STATE = (1 << 16),
+  ERR_ESP_INVALID_ARG = (1 << 17),
+  ERR_ESP_INVALID_SIZE = (1 << 18),
+  ERR_ESP_NO_MEM = (1 << 19),
+  ERR_ESP_FAIL = (1 << 20),
+  ALL_ERR_ESP_BITS = ERR_ESP_INVALID_STATE | ERR_ESP_INVALID_ARG | ERR_ESP_INVALID_SIZE | ERR_ESP_NO_MEM | ERR_ESP_FAIL,
+  ALL_BITS = 0x00FFFFFF,  // All valid FreeRTOS event group bits
+};
+
+// Translates a single SpeakerEventGroupBits ERR_ESP bit to the corresponding esp_err_t (any other value: ESP_FAIL)
+static esp_err_t err_bit_to_esp_err(uint32_t bit) {
+  switch (bit) {
+    case SpeakerEventGroupBits::ERR_ESP_INVALID_STATE:
+      return ESP_ERR_INVALID_STATE;
+    case SpeakerEventGroupBits::ERR_ESP_INVALID_ARG:
+      return ESP_ERR_INVALID_ARG;
+    case SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE:
+      return ESP_ERR_INVALID_SIZE;
+    case SpeakerEventGroupBits::ERR_ESP_NO_MEM:
+      return ESP_ERR_NO_MEM;
+    default:  // ERR_ESP_FAIL, or a value with several error bits set at once
+      return ESP_FAIL;
+  }
+}
+
+/// @brief Multiplies the input array of Q15 numbers by a Q15 constant factor
+///
+/// Based on `dsps_mulc_s16_ansi` from the esp-dsp library:
+/// https://github.com/espressif/esp-dsp/blob/master/modules/math/mulc/fixed/dsps_mulc_s16_ansi.c
+/// (accessed on 2024-09-30).
+/// @param input Array of Q15 numbers
+/// @param output Array receiving the scaled Q15 numbers; may be the same array as input
+/// @param len Number of elements to process
+/// @param c Q15 constant factor
+static void q15_multiplication(const int16_t *input, int16_t *output, size_t len, int16_t c) {
+  for (size_t i = 0; i < len; i++) {  // size_t index matches len, avoiding a signed/unsigned comparison
+    int32_t acc = (int32_t) input[i] * (int32_t) c;
+    output[i] = (int16_t) (acc >> 15);
+  }
+}
+
+// Lists the Q15 fixed point scaling factor for volume reduction.
+// Has 100 values representing silence and a reduction [49, 48.5, ... 0.5, 0] dB.
+// dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014)
+// float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15)
+static const std::vector<int16_t> Q15_VOLUME_SCALING_FACTORS = {
+    0,     116,   122,   130,   137,   146,   154,   163,   173,   183,   194,   206,   218,   231,   244,
+    259,   274,   291,   308,   326,   345,   366,   388,   411,   435,   461,   488,   517,   548,   580,
+    615,   651,   690,   731,   774,   820,   868,   920,   974,   1032,  1094,  1158,  1227,  1300,  1377,
+    1459,  1545,  1637,  1734,  1837,  1946,  2061,  2184,  2313,  2450,  2596,  2750,  2913,  3085,  3269,
+    3462,  3668,  3885,  4116,  4360,  4619,  4893,  5183,  5490,  5816,  6161,  6527,  6914,  7324,  7758,
+    8218,  8706,  9222,  9770,  10349, 10963, 11613, 12302, 13032, 13805, 14624, 15491, 16410, 17384, 18415,
+    19508, 20665, 21891, 23189, 24565, 26022, 27566, 29201, 30933, 32767};
+
 void I2SAudioSpeaker::setup() {
   ESP_LOGCONFIG(TAG, "Setting up I2S Audio Speaker...");
 
-  this->buffer_queue_ = xQueueCreate(BUFFER_COUNT, sizeof(DataEvent));
-  if (this->buffer_queue_ == nullptr) {
-    ESP_LOGE(TAG, "Failed to create buffer queue");
-    this->mark_failed();
-    return;
+  if (this->event_group_ == nullptr) {
+    this->event_group_ = xEventGroupCreate();
   }
 
-  this->event_queue_ = xQueueCreate(BUFFER_COUNT, sizeof(TaskEvent));
-  if (this->event_queue_ == nullptr) {
-    ESP_LOGE(TAG, "Failed to create event queue");
+  if (this->event_group_ == nullptr) {
+    ESP_LOGE(TAG, "Failed to create event group");
     this->mark_failed();
     return;
   }
 }
 
+void I2SAudioSpeaker::loop() {
+  // Mirrors the event group bits into the component's state, status flags, and log output
+  uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
+  if (event_group_bits & SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START) {
+    this->status_set_error("Failed to start speaker task");
+  }
+  uint32_t error_bits = event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS;
+  if (error_bits != 0) {
+    ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(err_bit_to_esp_err(error_bits)));
+    this->status_set_warning();
+    xEventGroupClearBits(this->event_group_, error_bits);  // Acknowledge, so each error is logged only once
+  }
+
+  if (event_group_bits & SpeakerEventGroupBits::STATE_STARTING) {
+    ESP_LOGD(TAG, "Starting Speaker");
+    this->state_ = speaker::STATE_STARTING;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STARTING);
+  }
+  if (event_group_bits & SpeakerEventGroupBits::STATE_RUNNING) {
+    ESP_LOGD(TAG, "Started Speaker");
+    this->state_ = speaker::STATE_RUNNING;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
+    this->status_clear_warning();
+    this->status_clear_error();
+  }
+  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPING) {
+    ESP_LOGD(TAG, "Stopping Speaker");
+    this->state_ = speaker::STATE_STOPPING;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
+  }
+  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPED) {
+    if (!this->task_created_) {  // Only once delete_task_ has cleared task_created_
+      ESP_LOGD(TAG, "Stopped Speaker");
+      this->state_ = speaker::STATE_STOPPED;
+      xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
+      this->speaker_task_handle_ = nullptr;
+    }
+  }
+}
+
+void I2SAudioSpeaker::set_volume(float volume) {
+  this->volume_ = volume;  // Stored as given; the table index below is clamped so it can never be out of range
+  this->q15_volume_factor_ = Q15_VOLUME_SCALING_FACTORS[remap<ssize_t, float>(
+      clamp(volume, 0.0f, 1.0f), 0.0f, 1.0f, 0, Q15_VOLUME_SCALING_FACTORS.size() - 1)];
+}
+
+size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
+  if (this->is_failed()) {
+    ESP_LOGE(TAG, "Cannot play audio, speaker failed to setup");
+    return 0;
+  }
+  // Kick off the speaker task unless it is already starting or running
+  const bool active = (this->state_ == speaker::STATE_RUNNING) || (this->state_ == speaker::STATE_STARTING);
+  if (!active) {
+    this->start();
+  }
+
+  // Give the task up to TASK_DELAY_MS to publish that the ring buffer may be written
+  const uint32_t writable = SpeakerEventGroupBits::MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE;
+  const uint32_t observed_bits =
+      xEventGroupWaitBits(this->event_group_, writable, pdFALSE, pdFALSE, pdMS_TO_TICKS(TASK_DELAY_MS));
+  if ((observed_bits & writable) == 0) {
+    // Ring buffer is not available, so nothing was queued
+    return 0;
+  }
+
+  // The cleared bit acts as the lock: hold it for the duration of the write, then hand it back
+  xEventGroupClearBits(this->event_group_, writable);
+  const size_t queued = this->audio_ring_buffer_->write_without_replacement((void *) data, length, ticks_to_wait);
+  xEventGroupSetBits(this->event_group_, writable);
+
+  return queued;
+}
+
+/// Reports whether the ring buffer currently has data available.
+/// @return true if the ring buffer exists and its available() count is above zero, false otherwise
+bool I2SAudioSpeaker::has_buffered_data() const {
+  // Short-circuit evaluation keeps the dereference safe while the ring buffer is not allocated
+  return (this->audio_ring_buffer_ != nullptr) && (this->audio_ring_buffer_->available() > 0);
+}
+
+void I2SAudioSpeaker::speaker_task(void *params) {
+  // Waits for a command, allocates the buffers, starts the I2S driver, streams audio until stopped, then cleans up
+  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
+  uint32_t event_group_bits =
+      xEventGroupWaitBits(this_speaker->event_group_,
+                          SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP |
+                              SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY,  // Bit message to read
+                          pdTRUE,                                              // Clear the bits on exit
+                          pdFALSE,                                             // Don't wait for all the bits,
+                          portMAX_DELAY);                                      // Block indefinitely until a bit is set
+
+  if (event_group_bits & (SpeakerEventGroupBits::COMMAND_STOP | SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY)) {
+    // Received a stop signal before the task was requested to start
+    this_speaker->delete_task_(0);
+  }
+
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STARTING);
+
+  audio::AudioStreamInfo audio_stream_info = this_speaker->audio_stream_info_;
+  const ssize_t bytes_per_sample = audio_stream_info.get_bytes_per_sample();
+  const uint8_t number_of_channels = audio_stream_info.channels;
+
+  const size_t dma_buffers_size = FRAMES_IN_ALL_DMA_BUFFERS * bytes_per_sample * number_of_channels;
+
+  if (this_speaker->send_esp_err_to_event_group_(
+          this_speaker->allocate_buffers_(dma_buffers_size, RING_BUFFER_SAMPLES * bytes_per_sample))) {
+    // Failed to allocate buffers; send_esp_err_to_event_group_ has already reported ERR_ESP_NO_MEM
+    this_speaker->delete_task_(dma_buffers_size);
+  }
+
+  // Ring buffer is allocated, so indicate it can be written to. This has to happen before the I2S driver is started:
+  // delete_task_ blocks until this bit is set, so a driver failure would otherwise hang the task forever.
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE);
+
+  if (this_speaker->send_esp_err_to_event_group_(this_speaker->start_i2s_driver_())) {
+    // Failed to start I2S driver
+    this_speaker->delete_task_(dma_buffers_size);
+  }
+
+  if (!this_speaker->send_esp_err_to_event_group_(this_speaker->reconfigure_i2s_stream_info_(audio_stream_info))) {
+    // Successfully set the I2S stream info, ready to write audio data to the I2S port
+
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
+
+    bool stop_gracefully = false;
+    uint32_t last_data_received_time = millis();
+
+    while ((millis() - last_data_received_time) <= this_speaker->timeout_) {
+      event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
+
+      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
+        break;
+      }
+      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
+        stop_gracefully = true;
+      }
+
+      size_t bytes_to_read = dma_buffers_size;
+      size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, bytes_to_read,
+                                                                 pdMS_TO_TICKS(TASK_DELAY_MS));
+
+      if (bytes_read > 0) {
+        last_data_received_time = millis();
+        size_t bytes_written = 0;
+
+        if ((audio_stream_info.bits_per_sample == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
+          // Scale samples by the volume factor in place
+          q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
+                             bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
+        }
+
+        if (audio_stream_info.bits_per_sample == (uint8_t) this_speaker->bits_per_sample_) {
+          i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read, &bytes_written,
+                    portMAX_DELAY);
+        } else if (audio_stream_info.bits_per_sample < (uint8_t) this_speaker->bits_per_sample_) {
+          i2s_write_expand(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read,
+                           audio_stream_info.bits_per_sample, this_speaker->bits_per_sample_, &bytes_written,
+                           portMAX_DELAY);
+        }
+
+        if (bytes_written != bytes_read) {
+          xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
+        }
+      } else {
+        // No data received: finish now for a graceful stop, otherwise zero the DMA buffers
+
+        if (stop_gracefully) {
+          break;
+        }
+
+        i2s_zero_dma_buffer(this_speaker->parent_->get_port());
+      }
+    }
+  }
+  i2s_zero_dma_buffer(this_speaker->parent_->get_port());
+
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
+
+  i2s_stop(this_speaker->parent_->get_port());
+  i2s_driver_uninstall(this_speaker->parent_->get_port());
+
+  this_speaker->parent_->unlock();
+  this_speaker->delete_task_(dma_buffers_size);
+}
+
 void I2SAudioSpeaker::start() {
-  if (this->is_failed()) {
-    ESP_LOGE(TAG, "Cannot start audio, speaker failed to setup");
+  if (this->is_failed())
     return;
-  }
-  if (this->task_created_) {
-    ESP_LOGW(TAG, "Called start while task has been already created.");
+  if ((this->state_ == speaker::STATE_STARTING) || (this->state_ == speaker::STATE_RUNNING))
     return;
-  }
-  this->state_ = speaker::STATE_STARTING;
-}
-void I2SAudioSpeaker::start_() {
-  if (this->task_created_) {
-    return;
-  }
-  if (!this->parent_->try_lock()) {
-    return;  // Waiting for another i2s component to return lock
+
+  if (this->speaker_task_handle_ == nullptr) {
+    xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
+                &this->speaker_task_handle_);
   }
 
-  xTaskCreate(I2SAudioSpeaker::player_task, "speaker_task", 8192, (void *) this, 1, &this->player_task_handle_);
-  this->task_created_ = true;
-}
-
-template<typename a, typename b> const uint8_t *convert_data_format(const a *from, b *to, size_t &bytes, bool repeat) {
-  if (sizeof(a) == sizeof(b) && !repeat) {
-    return reinterpret_cast<const uint8_t *>(from);
-  }
-  const b *result = to;
-  for (size_t i = 0; i < bytes; i += sizeof(a)) {
-    b value = static_cast<b>(*from++) << (sizeof(b) - sizeof(a)) * 8;
-    *to++ = value;
-    if (repeat)
-      *to++ = value;
-  }
-  bytes *= (sizeof(b) / sizeof(a)) * (repeat ? 2 : 1);  // NOLINT
-  return reinterpret_cast<const uint8_t *>(result);
-}
-
-void I2SAudioSpeaker::player_task(void *params) {
-  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
-
-  TaskEvent event;
-  event.type = TaskEventType::STARTING;
-  xQueueSend(this_speaker->event_queue_, &event, portMAX_DELAY);
-
-  i2s_driver_config_t config = {
-      .mode = (i2s_mode_t) (this_speaker->i2s_mode_ | I2S_MODE_TX),
-      .sample_rate = this_speaker->sample_rate_,
-      .bits_per_sample = this_speaker->bits_per_sample_,
-      .channel_format = this_speaker->channel_,
-      .communication_format = this_speaker->i2s_comm_fmt_,
-      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
-      .dma_buf_count = 8,
-      .dma_buf_len = 256,
-      .use_apll = this_speaker->use_apll_,
-      .tx_desc_auto_clear = true,
-      .fixed_mclk = 0,
-      .mclk_multiple = I2S_MCLK_MULTIPLE_256,
-      .bits_per_chan = this_speaker->bits_per_channel_,
-  };
-#if SOC_I2S_SUPPORTS_DAC
-  if (this_speaker->internal_dac_mode_ != I2S_DAC_CHANNEL_DISABLE) {
-    config.mode = (i2s_mode_t) (config.mode | I2S_MODE_DAC_BUILT_IN);
-  }
-#endif
-
-  esp_err_t err = i2s_driver_install(this_speaker->parent_->get_port(), &config, 0, nullptr);
-  if (err != ESP_OK) {
-    event.type = TaskEventType::WARNING;
-    event.err = err;
-    xQueueSend(this_speaker->event_queue_, &event, 0);
-    event.type = TaskEventType::STOPPED;
-    xQueueSend(this_speaker->event_queue_, &event, 0);
-    while (true) {
-      delay(10);
-    }
-  }
-
-#if SOC_I2S_SUPPORTS_DAC
-  if (this_speaker->internal_dac_mode_ == I2S_DAC_CHANNEL_DISABLE) {
-#endif
-    i2s_pin_config_t pin_config = this_speaker->parent_->get_pin_config();
-    pin_config.data_out_num = this_speaker->dout_pin_;
-
-    i2s_set_pin(this_speaker->parent_->get_port(), &pin_config);
-#if SOC_I2S_SUPPORTS_DAC
+  if (this->speaker_task_handle_ != nullptr) {
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
+    this->task_created_ = true;
   } else {
-    i2s_set_dac_mode(this_speaker->internal_dac_mode_);
-  }
-#endif
-
-  DataEvent data_event;
-
-  event.type = TaskEventType::STARTED;
-  xQueueSend(this_speaker->event_queue_, &event, portMAX_DELAY);
-
-  int32_t buffer[BUFFER_SIZE];
-
-  while (true) {
-    if (xQueueReceive(this_speaker->buffer_queue_, &data_event, this_speaker->timeout_ / portTICK_PERIOD_MS) !=
-        pdTRUE) {
-      break;  // End of audio from main thread
-    }
-    if (data_event.stop) {
-      // Stop signal from main thread
-      xQueueReset(this_speaker->buffer_queue_);  // Flush queue
-      break;
-    }
-
-    const uint8_t *data = data_event.data;
-    size_t remaining = data_event.len;
-    switch (this_speaker->bits_per_sample_) {
-      case I2S_BITS_PER_SAMPLE_8BIT:
-      case I2S_BITS_PER_SAMPLE_16BIT: {
-        data = convert_data_format(reinterpret_cast<const int16_t *>(data), reinterpret_cast<int16_t *>(buffer),
-                                   remaining, this_speaker->channel_ == I2S_CHANNEL_FMT_ALL_LEFT);
-        break;
-      }
-      case I2S_BITS_PER_SAMPLE_24BIT:
-      case I2S_BITS_PER_SAMPLE_32BIT: {
-        data = convert_data_format(reinterpret_cast<const int16_t *>(data), reinterpret_cast<int32_t *>(buffer),
-                                   remaining, this_speaker->channel_ == I2S_CHANNEL_FMT_ALL_LEFT);
-        break;
-      }
-    }
-
-    while (remaining != 0) {
-      size_t bytes_written;
-      esp_err_t err =
-          i2s_write(this_speaker->parent_->get_port(), data, remaining, &bytes_written, (32 / portTICK_PERIOD_MS));
-      if (err != ESP_OK) {
-        event = {.type = TaskEventType::WARNING, .err = err};
-        if (xQueueSend(this_speaker->event_queue_, &event, 10 / portTICK_PERIOD_MS) != pdTRUE) {
-          ESP_LOGW(TAG, "Failed to send WARNING event");
-        }
-        continue;
-      }
-      data += bytes_written;
-      remaining -= bytes_written;
-    }
-  }
-
-  event.type = TaskEventType::STOPPING;
-  if (xQueueSend(this_speaker->event_queue_, &event, 10 / portTICK_PERIOD_MS) != pdTRUE) {
-    ESP_LOGW(TAG, "Failed to send STOPPING event");
-  }
-
-  i2s_zero_dma_buffer(this_speaker->parent_->get_port());
-
-  i2s_driver_uninstall(this_speaker->parent_->get_port());
-
-  event.type = TaskEventType::STOPPED;
-  if (xQueueSend(this_speaker->event_queue_, &event, 10 / portTICK_PERIOD_MS) != pdTRUE) {
-    ESP_LOGW(TAG, "Failed to send STOPPED event");
-  }
-
-  while (true) {
-    delay(10);
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
   }
 }
 
@@ -203,92 +315,169 @@ void I2SAudioSpeaker::stop_(bool wait_on_empty) {
     return;
   if (this->state_ == speaker::STATE_STOPPED)
     return;
-  if (this->state_ == speaker::STATE_STARTING) {
-    this->state_ = speaker::STATE_STOPPED;
-    return;
-  }
-  this->state_ = speaker::STATE_STOPPING;
-  DataEvent data;
-  data.stop = true;
+
   if (wait_on_empty) {
-    xQueueSend(this->buffer_queue_, &data, portMAX_DELAY);
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY);
   } else {
-    xQueueSendToFront(this->buffer_queue_, &data, portMAX_DELAY);
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
   }
 }
 
-void I2SAudioSpeaker::watch_() {
-  TaskEvent event;
-  if (xQueueReceive(this->event_queue_, &event, 0) == pdTRUE) {
-    switch (event.type) {
-      case TaskEventType::STARTING:
-        ESP_LOGD(TAG, "Starting I2S Audio Speaker");
-        break;
-      case TaskEventType::STARTED:
-        ESP_LOGD(TAG, "Started I2S Audio Speaker");
-        this->state_ = speaker::STATE_RUNNING;
-        this->status_clear_warning();
-        break;
-      case TaskEventType::STOPPING:
-        ESP_LOGD(TAG, "Stopping I2S Audio Speaker");
-        break;
-      case TaskEventType::STOPPED:
-        this->state_ = speaker::STATE_STOPPED;
-        vTaskDelete(this->player_task_handle_);
-        this->task_created_ = false;
-        this->player_task_handle_ = nullptr;
-        this->parent_->unlock();
-        xQueueReset(this->buffer_queue_);
-        ESP_LOGD(TAG, "Stopped I2S Audio Speaker");
-        break;
-      case TaskEventType::WARNING:
-        ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(event.err));
-        this->status_set_warning();
-        break;
-    }
+bool I2SAudioSpeaker::send_esp_err_to_event_group_(esp_err_t err) {
+  switch (err) {
+    case ESP_OK:
+      return false;
+    case ESP_ERR_INVALID_STATE:
+      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_STATE);
+      return true;
+    case ESP_ERR_INVALID_ARG:
+      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_ARG);
+      return true;
+    case ESP_ERR_INVALID_SIZE:
+      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
+      return true;
+    case ESP_ERR_NO_MEM:
+      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
+      return true;
+    default:
+      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_FAIL);
+      return true;
   }
 }
 
-void I2SAudioSpeaker::loop() {
-  switch (this->state_) {
-    case speaker::STATE_STARTING:
-      this->start_();
-      [[fallthrough]];
-    case speaker::STATE_RUNNING:
-    case speaker::STATE_STOPPING:
-      this->watch_();
-      break;
-    case speaker::STATE_STOPPED:
-      break;
+esp_err_t I2SAudioSpeaker::allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size) {
+  if (this->data_buffer_ == nullptr) {
+    // Allocate data buffer for temporarily storing audio from the ring buffer before writing to the I2S bus
+    ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
+    this->data_buffer_ = allocator.allocate(data_buffer_size);
   }
+
+  if (this->data_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;
+  }
+
+  if (this->audio_ring_buffer_ == nullptr) {
+    // Allocate ring buffer
+    this->audio_ring_buffer_ = RingBuffer::create(ring_buffer_size);
+  }
+
+  if (this->audio_ring_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;
+  }
+
+  return ESP_OK;
 }
 
-size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length) {
-  if (this->is_failed()) {
-    ESP_LOGE(TAG, "Cannot play audio, speaker failed to setup");
-    return 0;
+esp_err_t I2SAudioSpeaker::start_i2s_driver_() {
+  if (!this->parent_->try_lock()) {
+    return ESP_ERR_INVALID_STATE;
   }
-  if (this->state_ != speaker::STATE_RUNNING && this->state_ != speaker::STATE_STARTING) {
-    this->start();
+
+  i2s_driver_config_t config = {
+    .mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_TX),
+    .sample_rate = this->sample_rate_,
+    .bits_per_sample = this->bits_per_sample_,
+    .channel_format = this->channel_,
+    .communication_format = this->i2s_comm_fmt_,
+    .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
+    .dma_buf_count = DMA_BUFFERS_COUNT,
+    .dma_buf_len = DMA_BUFFER_SIZE,
+    .use_apll = this->use_apll_,
+    .tx_desc_auto_clear = true,
+    .fixed_mclk = I2S_PIN_NO_CHANGE,
+    .mclk_multiple = I2S_MCLK_MULTIPLE_256,
+    .bits_per_chan = this->bits_per_channel_,
+#if SOC_I2S_SUPPORTS_TDM
+    .chan_mask = (i2s_channel_t) (I2S_TDM_ACTIVE_CH0 | I2S_TDM_ACTIVE_CH1),
+    .total_chan = 2,
+    .left_align = false,
+    .big_edin = false,
+    .bit_order_msb = false,
+    .skip_msk = false,
+#endif
+  };
+#if SOC_I2S_SUPPORTS_DAC
+  if (this->internal_dac_mode_ != I2S_DAC_CHANNEL_DISABLE) {
+    config.mode = (i2s_mode_t) (config.mode | I2S_MODE_DAC_BUILT_IN);
   }
-  size_t remaining = length;
-  size_t index = 0;
-  while (remaining > 0) {
-    DataEvent event;
-    event.stop = false;
-    size_t to_send_length = std::min(remaining, BUFFER_SIZE);
-    event.len = to_send_length;
-    memcpy(event.data, data + index, to_send_length);
-    if (xQueueSend(this->buffer_queue_, &event, 0) != pdTRUE) {
-      return index;
-    }
-    remaining -= to_send_length;
-    index += to_send_length;
+#endif
+
+  esp_err_t err = i2s_driver_install(this->parent_->get_port(), &config, 0, nullptr);
+  if (err != ESP_OK) {
+    // Failed to install the driver, so unlock the I2S port
+    this->parent_->unlock();
+    return err;
   }
-  return index;
+
+#if SOC_I2S_SUPPORTS_DAC
+  if (this->internal_dac_mode_ == I2S_DAC_CHANNEL_DISABLE) {
+#endif
+    i2s_pin_config_t pin_config = this->parent_->get_pin_config();
+    pin_config.data_out_num = this->dout_pin_;
+
+    err = i2s_set_pin(this->parent_->get_port(), &pin_config);
+#if SOC_I2S_SUPPORTS_DAC
+  } else {
+    i2s_set_dac_mode(this->internal_dac_mode_);
+  }
+#endif
+
+  if (err != ESP_OK) {
+    // Failed to set the data out pin, so uninstall the driver and unlock the I2S port
+    i2s_driver_uninstall(this->parent_->get_port());
+    this->parent_->unlock();
+  }
+
+  return err;
 }
 
-bool I2SAudioSpeaker::has_buffered_data() const { return uxQueueMessagesWaiting(this->buffer_queue_) > 0; }
+esp_err_t I2SAudioSpeaker::reconfigure_i2s_stream_info_(audio::AudioStreamInfo &audio_stream_info) {
+  if (this->i2s_mode_ & I2S_MODE_MASTER) {
+    // ESP controls the I2S bus, so adjust the sample rate and bits per sample to match the incoming audio
+    this->sample_rate_ = audio_stream_info.sample_rate;
+    this->bits_per_sample_ = (i2s_bits_per_sample_t) audio_stream_info.bits_per_sample;
+  } else if (this->sample_rate_ != audio_stream_info.sample_rate) {
+    // Can't reconfigure I2S bus, so the sample rate must match the configured value
+    return ESP_ERR_INVALID_ARG;
+  }
+
+  if ((i2s_bits_per_sample_t) audio_stream_info.bits_per_sample > this->bits_per_sample_) {
+    // Currently can't handle the case when the incoming audio has more bits per sample than the configured value
+    return ESP_ERR_INVALID_ARG;
+  }
+
+  if (audio_stream_info.channels == 1) {
+    return i2s_set_clk(this->parent_->get_port(), this->sample_rate_, this->bits_per_sample_, I2S_CHANNEL_MONO);
+  } else if (audio_stream_info.channels == 2) {
+    return i2s_set_clk(this->parent_->get_port(), this->sample_rate_, this->bits_per_sample_, I2S_CHANNEL_STEREO);
+  }
+
+  return ESP_ERR_INVALID_ARG;  // Only one or two channels are handled
+}
+
+void I2SAudioSpeaker::delete_task_(size_t buffer_size) {
+  if (this->audio_ring_buffer_ != nullptr) {
+    xEventGroupWaitBits(this->event_group_,
+                        MESSAGE_RING_BUFFER_AVAILABLE_TO_WRITE,  // Bit message to read
+                        pdFALSE,                                 // Don't clear the bits on exit
+                        pdTRUE,                                  // Wait for all requested bits (only one here)
+                        portMAX_DELAY);                          // Block indefinitely until the bit is set
+
+    this->audio_ring_buffer_.reset();  // Deallocates the ring buffer stored in the unique_ptr
+    this->audio_ring_buffer_ = nullptr;
+  }
+
+  if (this->data_buffer_ != nullptr) {
+    ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
+    allocator.deallocate(this->data_buffer_, buffer_size);
+    this->data_buffer_ = nullptr;
+  }
+
+  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPED);
+
+  this->task_created_ = false;
+  vTaskDelete(nullptr);  // A null handle deletes the calling task
+}
 
 }  // namespace i2s_audio
 }  // namespace esphome
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
index 9d1817c86f..245f97d1e7 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
@@ -5,38 +5,21 @@
 #include "../i2s_audio.h"
 
 #include <driver/i2s.h>
-#include <freertos/FreeRTOS.h>
-#include <freertos/queue.h>
 
+#include <freertos/event_groups.h>
+#include <freertos/FreeRTOS.h>
+
+#include "esphome/components/audio/audio.h"
 #include "esphome/components/speaker/speaker.h"
+
 #include "esphome/core/component.h"
 #include "esphome/core/gpio.h"
 #include "esphome/core/helpers.h"
+#include "esphome/core/ring_buffer.h"
 
 namespace esphome {
 namespace i2s_audio {
 
-static const size_t BUFFER_SIZE = 1024;
-
-enum class TaskEventType : uint8_t {
-  STARTING = 0,
-  STARTED,
-  STOPPING,
-  STOPPED,
-  WARNING = 255,
-};
-
-struct TaskEvent {
-  TaskEventType type;
-  esp_err_t err;
-};
-
-struct DataEvent {
-  bool stop;
-  size_t len;
-  uint8_t data[BUFFER_SIZE];
-};
-
 class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Component {
  public:
   float get_setup_priority() const override { return esphome::setup_priority::LATE; }
@@ -55,25 +38,89 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
   void stop() override;
   void finish() override;
 
-  size_t play(const uint8_t *data, size_t length) override;
+  /// @brief Plays the provided audio data.
+  /// Starts the speaker task, if necessary. Writes the audio data to the ring buffer.
+  /// @param data Audio data in the format set by the parent speaker classes ``set_audio_stream_info`` method.
+  /// @param length The length of the audio data in bytes.
+  /// @param ticks_to_wait The FreeRTOS ticks to wait before writing as much data as possible to the ring buffer.
+  /// @return The number of bytes that were actually written to the ring buffer.
+  size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override;
+  size_t play(const uint8_t *data, size_t length) override { return play(data, length, 0); }
 
   bool has_buffered_data() const override;
 
+  /// @brief Sets the volume of the speaker. It is implemented as a software volume control.
+  /// Overrides the default setter to convert the floating point volume to a Q15 fixed-point factor.
+  /// @param volume Floating point volume to apply.
+  void set_volume(float volume) override;
+  float get_volume() override { return this->volume_; }
+
  protected:
-  void start_();
+  /// @brief Function for the FreeRTOS task handling audio output.
+  /// After receiving the COMMAND_START signal, allocates space for the buffers, starts the I2S driver, and reads
+  /// audio from the ring buffer and writes audio to the I2S port. Stops immediately after receiving the COMMAND_STOP
+  /// signal and stops only after the ring buffer is empty after receiving the COMMAND_STOP_GRACEFULLY signal. Stops if
+  /// the ring buffer hasn't read data for more than timeout_ milliseconds. When stopping, it deallocates the buffers,
+  /// stops the I2S driver, unlocks the I2S port, and deletes the task. It communicates the state and any errors via
+  /// event_group_.
+  /// @param params I2SAudioSpeaker component
+  static void speaker_task(void *params);
+
+  /// @brief Sends a stop command to the speaker task via event_group_.
+  /// @param wait_on_empty If false, sends the COMMAND_STOP signal. If true, sends the COMMAND_STOP_GRACEFULLY signal.
   void stop_(bool wait_on_empty);
-  void watch_();
 
-  static void player_task(void *params);
+  /// @brief Sets the corresponding ERR_ESP event group bits.
+  /// @param err esp_err_t error code.
+  /// @return True if an ERR_ESP bit is set and false if err == ESP_OK
+  bool send_esp_err_to_event_group_(esp_err_t err);
 
-  TaskHandle_t player_task_handle_{nullptr};
-  QueueHandle_t buffer_queue_;
-  QueueHandle_t event_queue_;
+  /// @brief Allocates the data buffer and ring buffer
+  /// @param data_buffer_size Number of bytes to allocate for the data buffer.
+  /// @param ring_buffer_size Number of bytes to allocate for the ring buffer.
+  /// @return ESP_ERR_NO_MEM if either buffer fails to allocate
+  ///         ESP_OK if successful
+  esp_err_t allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size);
+
+  /// @brief Starts the ESP32 I2S driver.
+  /// Attempts to lock the I2S port, starts the I2S driver, and sets the data out pin. If it fails, it will unlock
+  /// the I2S port and uninstall the driver, if necessary.
+  /// @return ESP_ERR_INVALID_STATE if the I2S port is already locked.
+  ///         ESP_ERR_INVALID_ARG if installing the driver or setting the data out pin fails due to a parameter error.
+  ///         ESP_ERR_NO_MEM if the driver fails to install due to a memory allocation error.
+  ///         ESP_FAIL if setting the data out pin fails due to an IO error
+  ///         ESP_OK if successful
+  esp_err_t start_i2s_driver_();
+
+  /// @brief Adjusts the I2S driver configuration to match the incoming audio stream.
+  /// Modifies I2S driver's sample rate, bits per sample, and number of channel settings. If the I2S is in secondary
+  /// mode, it only modifies the number of channels.
+  /// @param audio_stream_info  Describes the incoming audio stream
+  /// @return ESP_ERR_INVALID_ARG if there is a parameter error, if there is more than 2 channels in the stream, or if
+  ///           the audio settings are incompatible with the configuration.
+  ///         ESP_ERR_NO_MEM if the driver fails to reconfigure due to a memory allocation error.
+  ///         ESP_OK if successful.
+  esp_err_t reconfigure_i2s_stream_info_(audio::AudioStreamInfo &audio_stream_info);
+
+  /// @brief Deletes the speaker's task.
+  /// Deallocates the data_buffer_ and audio_ring_buffer_, if necessary, and deletes the task. Should only be called by
+  /// the speaker_task itself.
+  /// @param buffer_size The allocated size of the data_buffer_.
+  void delete_task_(size_t buffer_size);
+
+  TaskHandle_t speaker_task_handle_{nullptr};
+  EventGroupHandle_t event_group_{nullptr};
+
+  uint8_t *data_buffer_{nullptr};  // nullptr while unallocated; delete_task_ checks this before freeing
+  std::unique_ptr<RingBuffer> audio_ring_buffer_;
+
+  uint32_t timeout_{0};
+  uint8_t dout_pin_{0};
 
-  uint32_t timeout_{0};
-  uint8_t dout_pin_{0};
   bool task_created_{false};
 
+  int16_t q15_volume_factor_{INT16_MAX};
+
 #if SOC_I2S_SUPPORTS_DAC
   i2s_dac_mode_t internal_dac_mode_{I2S_DAC_CHANNEL_DISABLE};
 #endif
diff --git a/esphome/components/speaker/__init__.py b/esphome/components/speaker/__init__.py
index d28b726d1f..1bbc0b02ef 100644
--- a/esphome/components/speaker/__init__.py
+++ b/esphome/components/speaker/__init__.py
@@ -2,7 +2,7 @@ from esphome import automation
 from esphome.automation import maybe_simple_id
 import esphome.codegen as cg
 import esphome.config_validation as cv
-from esphome.const import CONF_DATA, CONF_ID
+from esphome.const import CONF_DATA, CONF_ID, CONF_VOLUME
 from esphome.core import CORE
 from esphome.coroutine import coroutine_with_priority
 
@@ -23,6 +23,10 @@ StopAction = speaker_ns.class_(
 FinishAction = speaker_ns.class_(
     "FinishAction", automation.Action, cg.Parented.template(Speaker)
 )
+VolumeSetAction = speaker_ns.class_(
+    "VolumeSetAction", automation.Action, cg.Parented.template(Speaker)
+)
+
 
 IsPlayingCondition = speaker_ns.class_("IsPlayingCondition", automation.Condition)
 IsStoppedCondition = speaker_ns.class_("IsStoppedCondition", automation.Condition)
@@ -90,6 +94,25 @@ automation.register_condition(
 )(speaker_action)
 
 
+@automation.register_action(
+    "speaker.volume_set",
+    VolumeSetAction,
+    cv.maybe_simple_value(
+        {
+            cv.GenerateID(): cv.use_id(Speaker),
+            cv.Required(CONF_VOLUME): cv.templatable(cv.percentage),
+        },
+        key=CONF_VOLUME,
+    ),
+)
+async def speaker_volume_set_action(config, action_id, template_arg, args):
+    """Build the ``VolumeSetAction`` for the referenced speaker."""
+    action = cg.new_Pvariable(action_id, template_arg)
+    await cg.register_parented(action, config[CONF_ID])
+    cg.add(action.set_volume(await cg.templatable(config[CONF_VOLUME], args, float)))
+    return action
+
+
 @coroutine_with_priority(100.0)
 async def to_code(config):
     cg.add_global(speaker_ns.using)
diff --git a/esphome/components/speaker/automation.h b/esphome/components/speaker/automation.h
index 2716fe6100..9efda011f2 100644
--- a/esphome/components/speaker/automation.h
+++ b/esphome/components/speaker/automation.h
@@ -34,6 +34,11 @@ template<typename... Ts> class PlayAction : public Action<Ts...>, public Parente
   std::vector<uint8_t> data_static_{};
 };
 
+template<typename... Ts> class VolumeSetAction : public Action<Ts...>, public Parented<Speaker> {
+  TEMPLATABLE_VALUE(float, volume)  // Templatable volume; play() evaluates it with the trigger arguments
+  void play(Ts... x) override { this->parent_->set_volume(this->volume_.value(x...)); }
+};
+
 template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<Speaker> {
  public:
   void play(Ts... x) override { this->parent_->stop(); }
diff --git a/esphome/components/speaker/speaker.h b/esphome/components/speaker/speaker.h
index 375ccc4e8c..9390e4edb7 100644
--- a/esphome/components/speaker/speaker.h
+++ b/esphome/components/speaker/speaker.h
@@ -4,6 +4,12 @@
 #include <cstdint>
 #include <vector>
 
+#ifdef USE_ESP32
+#include <freertos/FreeRTOS.h>
+#endif
+
+#include "esphome/components/audio/audio.h"
+
 namespace esphome {
 namespace speaker {
 
@@ -16,14 +22,33 @@ enum State : uint8_t {
 
 class Speaker {
  public:
+#ifdef USE_ESP32
+  /// @brief Plays the provided audio data.
+  /// If the speaker component doesn't implement this method, it falls back to the play method without this parameter.
+  /// @param data Audio data in the format specified by ``set_audio_stream_info`` method.
+  /// @param length The length of the audio data in bytes.
+  /// @param ticks_to_wait FreeRTOS ticks to wait for buffer space; ignored by this default implementation.
+  /// @return The number of bytes that were actually written to the speaker's internal buffer.
+  virtual size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
+    return this->play(data, length);
+  }
+#endif
+
+  /// @brief Plays the provided audio data.
+  /// If the audio stream is not the default defined in "esphome/components/audio/audio.h" and the speaker component
+  /// implements it, then this should be called after calling ``set_audio_stream_info``.
+  /// @param data Audio data in the format specified by ``set_audio_stream_info`` method.
+  /// @param length The length of the audio data in bytes.
+  /// @return The number of bytes that were actually written to the speaker's internal buffer.
   virtual size_t play(const uint8_t *data, size_t length) = 0;
+
   size_t play(const std::vector<uint8_t> &data) { return this->play(data.data(), data.size()); }
 
   virtual void start() = 0;
   virtual void stop() = 0;
   // In compare between *STOP()* and *FINISH()*; *FINISH()* will stop after emptying the play buffer,
   // while *STOP()* will break directly.
-  // When finish() is not implemented on the plateform component it should just do a normal stop.
+  // When finish() is not implemented on the platform component it should just do a normal stop.
   virtual void finish() { this->stop(); }
 
   virtual bool has_buffered_data() const = 0;
@@ -31,8 +56,18 @@ class Speaker {
   bool is_running() const { return this->state_ == STATE_RUNNING; }
   bool is_stopped() const { return this->state_ == STATE_STOPPED; }
 
+  // Volume control must be implemented by each speaker component; this base class only stores the requested value.
+  virtual void set_volume(float volume) { this->volume_ = volume; }
+  virtual float get_volume() { return this->volume_; }
+
+  void set_audio_stream_info(const audio::AudioStreamInfo &audio_stream_info) {
+    this->audio_stream_info_ = audio_stream_info;
+  }
+
  protected:
   State state_{STATE_STOPPED};
+  audio::AudioStreamInfo audio_stream_info_;
+  float volume_{1.0f};
 };
 
 }  // namespace speaker
diff --git a/tests/components/speaker/test.esp32-ard.yaml b/tests/components/speaker/test.esp32-ard.yaml
index ab20f36eb6..9a24d00f68 100644
--- a/tests/components/speaker/test.esp32-ard.yaml
+++ b/tests/components/speaker/test.esp32-ard.yaml
@@ -5,6 +5,7 @@ esphome:
           condition: speaker.is_stopped
           then:
             - speaker.play: [0, 1, 2, 3]
+      - speaker.volume_set: 0.9
       - if:
           condition: speaker.is_playing
           then:
diff --git a/tests/components/speaker/test.esp32-c3-ard.yaml b/tests/components/speaker/test.esp32-c3-ard.yaml
index c966f9daa7..f28014337c 100644
--- a/tests/components/speaker/test.esp32-c3-ard.yaml
+++ b/tests/components/speaker/test.esp32-c3-ard.yaml
@@ -5,6 +5,7 @@ esphome:
           condition: speaker.is_stopped
           then:
             - speaker.play: [0, 1, 2, 3]
+      - speaker.volume_set: 0.9
       - if:
           condition: speaker.is_playing
           then:
diff --git a/tests/components/speaker/test.esp32-c3-idf.yaml b/tests/components/speaker/test.esp32-c3-idf.yaml
index c966f9daa7..f28014337c 100644
--- a/tests/components/speaker/test.esp32-c3-idf.yaml
+++ b/tests/components/speaker/test.esp32-c3-idf.yaml
@@ -5,6 +5,7 @@ esphome:
           condition: speaker.is_stopped
           then:
             - speaker.play: [0, 1, 2, 3]
+      - speaker.volume_set: 0.9
       - if:
           condition: speaker.is_playing
           then:
diff --git a/tests/components/speaker/test.esp32-idf.yaml b/tests/components/speaker/test.esp32-idf.yaml
index ab20f36eb6..9a24d00f68 100644
--- a/tests/components/speaker/test.esp32-idf.yaml
+++ b/tests/components/speaker/test.esp32-idf.yaml
@@ -5,6 +5,7 @@ esphome:
           condition: speaker.is_stopped
           then:
             - speaker.play: [0, 1, 2, 3]
+      - speaker.volume_set: 0.9
       - if:
           condition: speaker.is_playing
           then: