diff --git a/esphome/components/nabu/__init__.py b/esphome/components/nabu/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/esphome/components/nabu/audio_decoder.cpp b/esphome/components/nabu/audio_decoder.cpp new file mode 100644 index 0000000000..20c53b2d6e --- /dev/null +++ b/esphome/components/nabu/audio_decoder.cpp @@ -0,0 +1,384 @@ +#ifdef USE_ESP_IDF + +#include "audio_decoder.h" + +#include "mp3_decoder.h" + +#include "esphome/core/ring_buffer.h" + +namespace esphome { +namespace nabu { + +static const size_t READ_WRITE_TIMEOUT_MS = 20; + +AudioDecoder::AudioDecoder(RingBuffer *input_ring_buffer, RingBuffer *output_ring_buffer, size_t internal_buffer_size) { + this->input_ring_buffer_ = input_ring_buffer; + this->output_ring_buffer_ = output_ring_buffer; + this->internal_buffer_size_ = internal_buffer_size; +} + +AudioDecoder::~AudioDecoder() { + ExternalRAMAllocator allocator(ExternalRAMAllocator::ALLOW_FAILURE); + if (this->input_buffer_ != nullptr) { + allocator.deallocate(this->input_buffer_, this->internal_buffer_size_); + } + if (this->output_buffer_ != nullptr) { + allocator.deallocate(this->output_buffer_, this->internal_buffer_size_); + } + + if (this->flac_decoder_ != nullptr) { + this->flac_decoder_->free_buffers(); + this->flac_decoder_.reset(); // Free the unique_ptr + this->flac_decoder_ = nullptr; + } + + if (this->media_file_type_ == MediaFileType::MP3) { + MP3FreeDecoder(this->mp3_decoder_); + } + + if (this->wav_decoder_ != nullptr) { + this->wav_decoder_.reset(); // Free the unique_ptr + this->wav_decoder_ = nullptr; + } +} + +esp_err_t AudioDecoder::start(MediaFileType media_file_type) { + esp_err_t err = this->allocate_buffers_(); + + if (err != ESP_OK) { + return err; + } + + this->media_file_type_ = media_file_type; + + this->input_buffer_current_ = this->input_buffer_; + this->input_buffer_length_ = 0; + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ = 0; + + this->potentially_failed_count_ = 0; + this->end_of_file_ = false; + + switch (this->media_file_type_) { + case MediaFileType::FLAC: + this->flac_decoder_ = make_unique(this->input_buffer_); + break; + case MediaFileType::MP3: + this->mp3_decoder_ = MP3InitDecoder(); + break; + case MediaFileType::WAV: + this->wav_decoder_ = make_unique(&this->input_buffer_current_); + this->wav_decoder_->reset(); + break; + case MediaFileType::NONE: + return ESP_ERR_NOT_SUPPORTED; + break; + } + + return ESP_OK; +} + +AudioDecoderState AudioDecoder::decode(bool stop_gracefully) { + if (stop_gracefully) { + if (this->output_buffer_length_ == 0) { + // If the file decoder believes it the end of file + if (this->end_of_file_) { + return AudioDecoderState::FINISHED; + } + // If all the internal buffers are empty, the decoding is done + if ((this->input_ring_buffer_->available() == 0) && (this->input_buffer_length_ == 0)) { + return AudioDecoderState::FINISHED; + } + + // If the ring buffer has no new data and the decoding failed last time, mark done + if ((this->input_ring_buffer_->available() == 0) && (this->potentially_failed_count_ > 0)) { + return AudioDecoderState::FINISHED; + } + } + } + + if (this->potentially_failed_count_ > 10) { + return AudioDecoderState::FAILED; + } + + FileDecoderState state = FileDecoderState::MORE_TO_PROCESS; + + while (state == FileDecoderState::MORE_TO_PROCESS) { + if (this->output_buffer_length_ > 0) { + // Have decoded data, write it to the output ring buffer + + size_t bytes_to_write = this->output_buffer_length_; + + if 
(bytes_to_write > 0) { + size_t bytes_written = this->output_ring_buffer_->write_without_replacement( + (void *) this->output_buffer_current_, bytes_to_write, pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + this->output_buffer_length_ -= bytes_written; + this->output_buffer_current_ += bytes_written; + } + + if (this->output_buffer_length_ > 0) { + // Output buffer still has decoded audio to write + return AudioDecoderState::DECODING; + } + } else { + // Decode more data + + // Shift unread data in input buffer to start + if (this->input_buffer_length_ > 0) { + memmove(this->input_buffer_, this->input_buffer_current_, this->input_buffer_length_); + } + this->input_buffer_current_ = this->input_buffer_; + + // read in new ring buffer data to fill the remaining input buffer + size_t bytes_read = 0; + + size_t bytes_to_read = this->internal_buffer_size_ - this->input_buffer_length_; + + if (bytes_to_read > 0) { + uint8_t *new_audio_data = this->input_buffer_ + this->input_buffer_length_; + bytes_read = this->input_ring_buffer_->read((void *) new_audio_data, bytes_to_read, + pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + this->input_buffer_length_ += bytes_read; + } + + if ((this->potentially_failed_count_ > 0) && (bytes_read == 0)) { + // Failed to decode in last attempt and there is no new data + + if (bytes_to_read == 0) { + // The input buffer is full. Since it previously failed on the exact same data, we can never recover + state = FileDecoderState::FAILED; + } else { + // Attempt to get more data next time + state = FileDecoderState::IDLE; + } + } else if (this->input_buffer_length_ == 0) { + // No data to decode, attempt to get more data next time + state = FileDecoderState::IDLE; + } else { + switch (this->media_file_type_) { + case MediaFileType::FLAC: + state = this->decode_flac_(); + break; + case MediaFileType::MP3: + state = this->decode_mp3_(); + break; + case MediaFileType::WAV: + state = this->decode_wav_(); + break; + case MediaFileType::NONE: + state = FileDecoderState::IDLE; + break; + } + } + } + if (state == FileDecoderState::POTENTIALLY_FAILED) { + ++this->potentially_failed_count_; + } else if (state == FileDecoderState::END_OF_FILE) { + this->end_of_file_ = true; + } else if (state == FileDecoderState::FAILED) { + return AudioDecoderState::FAILED; + } else if ((state == FileDecoderState::MORE_TO_PROCESS)) { + this->potentially_failed_count_ = 0; + } + } + return AudioDecoderState::DECODING; +} + +esp_err_t AudioDecoder::allocate_buffers_() { + ExternalRAMAllocator allocator(ExternalRAMAllocator::ALLOW_FAILURE); + + if (this->input_buffer_ == nullptr) + this->input_buffer_ = allocator.allocate(this->internal_buffer_size_); + + if (this->output_buffer_ == nullptr) + this->output_buffer_ = allocator.allocate(this->internal_buffer_size_); + + if ((this->input_buffer_ == nullptr) || (this->output_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + return ESP_OK; +} + +FileDecoderState AudioDecoder::decode_flac_() { + if (!this->audio_stream_info_.has_value()) { + // Header hasn't been read + auto result = this->flac_decoder_->read_header(this->input_buffer_length_); + + if (result == flac::FLAC_DECODER_HEADER_OUT_OF_DATA) { + return FileDecoderState::POTENTIALLY_FAILED; + } + + if (result != flac::FLAC_DECODER_SUCCESS) { + // Couldn't read FLAC header + return FileDecoderState::FAILED; + } + + size_t bytes_consumed = this->flac_decoder_->get_bytes_index(); + this->input_buffer_current_ += bytes_consumed; + this->input_buffer_length_ = this->flac_decoder_->get_bytes_left(); + + 
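As an aside (this block is not part of the diff): the input-buffer handling in AudioDecoder::decode() above follows a common shift-and-refill pattern. A minimal standalone sketch, using hypothetical names and relying only on calls already used in this file (memmove, RingBuffer::read, pdMS_TO_TICKS), could look like this:

// Hedged sketch of the shift-and-refill pattern used by AudioDecoder::decode(); the function and its
// parameter names are hypothetical stand-ins for the class members involved
static void refill_input_buffer(esphome::RingBuffer *ring, uint8_t *buffer, uint8_t *&current, size_t &length,
                                size_t capacity) {
  // Move any unread bytes back to the start of the internal buffer
  if (length > 0) {
    memmove(buffer, current, length);
  }
  current = buffer;

  // Top the buffer up from the ring buffer, waiting briefly for new data to arrive
  size_t free_space = capacity - length;
  if (free_space > 0) {
    length += ring->read((void *) (buffer + length), free_space, pdMS_TO_TICKS(20));
  }
}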
size_t flac_decoder_output_buffer_min_size = flac_decoder_->get_output_buffer_size(); + if (this->internal_buffer_size_ < flac_decoder_output_buffer_min_size * sizeof(int16_t)) { + // Output buffer is not big enough + return FileDecoderState::FAILED; + } + + audio::AudioStreamInfo audio_stream_info; + audio_stream_info.channels = this->flac_decoder_->get_num_channels(); + audio_stream_info.sample_rate = this->flac_decoder_->get_sample_rate(); + audio_stream_info.bits_per_sample = this->flac_decoder_->get_sample_depth(); + + this->audio_stream_info_ = audio_stream_info; + + return FileDecoderState::MORE_TO_PROCESS; + } + + uint32_t output_samples = 0; + auto result = + this->flac_decoder_->decode_frame(this->input_buffer_length_, (int16_t *) this->output_buffer_, &output_samples); + + if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { + // Not an issue, just needs more data that we'll get next time. + return FileDecoderState::POTENTIALLY_FAILED; + } else if (result > flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { + // Corrupted frame, don't retry with current buffer content, wait for new sync + size_t bytes_consumed = this->flac_decoder_->get_bytes_index(); + this->input_buffer_current_ += bytes_consumed; + this->input_buffer_length_ = this->flac_decoder_->get_bytes_left(); + + return FileDecoderState::POTENTIALLY_FAILED; + } + + // We have successfully decoded some input data and have new output data + size_t bytes_consumed = this->flac_decoder_->get_bytes_index(); + this->input_buffer_current_ += bytes_consumed; + this->input_buffer_length_ = this->flac_decoder_->get_bytes_left(); + + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ = output_samples * sizeof(int16_t); + + if (result == flac::FLAC_DECODER_NO_MORE_FRAMES) { + return FileDecoderState::END_OF_FILE; + } + + return FileDecoderState::IDLE; +} + +FileDecoderState AudioDecoder::decode_mp3_() { + // Look for the next sync word + int32_t offset = MP3FindSyncWord(this->input_buffer_current_, this->input_buffer_length_); + if (offset < 0) { + // We may recover if we have more data + return FileDecoderState::POTENTIALLY_FAILED; + } + + // Advance read pointer + this->input_buffer_current_ += offset; + this->input_buffer_length_ -= offset; + + int err = MP3Decode(this->mp3_decoder_, &this->input_buffer_current_, (int *) &this->input_buffer_length_, + (int16_t *) this->output_buffer_, 0); + if (err) { + switch (err) { + case ERR_MP3_MAINDATA_UNDERFLOW: + // Not a problem. Next call to decode will provide more data. 
+ return FileDecoderState::POTENTIALLY_FAILED; + break; + default: + return FileDecoderState::FAILED; + break; + } + } else { + MP3FrameInfo mp3_frame_info; + MP3GetLastFrameInfo(this->mp3_decoder_, &mp3_frame_info); + if (mp3_frame_info.outputSamps > 0) { + int bytes_per_sample = (mp3_frame_info.bitsPerSample / 8); + this->output_buffer_length_ = mp3_frame_info.outputSamps * bytes_per_sample; + this->output_buffer_current_ = this->output_buffer_; + + audio::AudioStreamInfo stream_info; + stream_info.channels = mp3_frame_info.nChans; + stream_info.sample_rate = mp3_frame_info.samprate; + stream_info.bits_per_sample = mp3_frame_info.bitsPerSample; + this->audio_stream_info_ = stream_info; + } + } + + return FileDecoderState::MORE_TO_PROCESS; +} + +FileDecoderState AudioDecoder::decode_wav_() { + if (!this->audio_stream_info_.has_value() && (this->input_buffer_length_ > 44)) { + // Header hasn't been processed + + size_t original_buffer_length = this->input_buffer_length_; + + size_t wav_bytes_to_skip = this->wav_decoder_->bytes_to_skip(); + size_t wav_bytes_to_read = this->wav_decoder_->bytes_needed(); + + bool header_finished = false; + while (!header_finished) { + if (wav_bytes_to_skip > 0) { + // Adjust pointer to skip the appropriate bytes + this->input_buffer_current_ += wav_bytes_to_skip; + this->input_buffer_length_ -= wav_bytes_to_skip; + wav_bytes_to_skip = 0; + } else if (wav_bytes_to_read > 0) { + wav_decoder::WAVDecoderResult result = this->wav_decoder_->next(); + this->input_buffer_current_ += wav_bytes_to_read; + this->input_buffer_length_ -= wav_bytes_to_read; + + if (result == wav_decoder::WAV_DECODER_SUCCESS_IN_DATA) { + // Header parsing is complete + + // Assume PCM + audio::AudioStreamInfo audio_stream_info; + audio_stream_info.channels = this->wav_decoder_->num_channels(); + audio_stream_info.sample_rate = this->wav_decoder_->sample_rate(); + audio_stream_info.bits_per_sample = this->wav_decoder_->bits_per_sample(); + this->audio_stream_info_ = audio_stream_info; + this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left(); + header_finished = true; + } else if (result == wav_decoder::WAV_DECODER_SUCCESS_NEXT) { + // Continue parsing header + wav_bytes_to_skip = this->wav_decoder_->bytes_to_skip(); + wav_bytes_to_read = this->wav_decoder_->bytes_needed(); + } else { + // Unexpected error parsing the wav header + return FileDecoderState::FAILED; + } + } else { + // Something unexpected has happened + // Reset state and hope we have enough info next time + this->input_buffer_length_ = original_buffer_length; + this->input_buffer_current_ = this->input_buffer_; + return FileDecoderState::POTENTIALLY_FAILED; + } + } + } + + if (this->wav_bytes_left_ > 0) { + size_t bytes_to_write = std::min(this->wav_bytes_left_, this->input_buffer_length_); + bytes_to_write = std::min(bytes_to_write, this->internal_buffer_size_); + if (bytes_to_write > 0) { + std::memcpy(this->output_buffer_, this->input_buffer_current_, bytes_to_write); + this->input_buffer_current_ += bytes_to_write; + this->input_buffer_length_ -= bytes_to_write; + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ = bytes_to_write; + this->wav_bytes_left_ -= bytes_to_write; + } + + return FileDecoderState::IDLE; + } + + return FileDecoderState::END_OF_FILE; +} + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_decoder.h b/esphome/components/nabu/audio_decoder.h new file mode 100644 index 0000000000..8267d8c01d --- /dev/null +++ 
b/esphome/components/nabu/audio_decoder.h @@ -0,0 +1,81 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include +#include +#include + +#include "nabu_media_helpers.h" +#include "esphome/components/audio/audio.h" + +#include "esphome/core/helpers.h" +#include "esphome/core/ring_buffer.h" + +namespace esphome { +namespace nabu { + +enum class AudioDecoderState : uint8_t { + INITIALIZED = 0, + DECODING, + FINISHED, + FAILED, +}; + +// Only used within the AudioDecoder class; conveys the state of the particular file type decoder +enum class FileDecoderState : uint8_t { + MORE_TO_PROCESS, + IDLE, + POTENTIALLY_FAILED, + FAILED, + END_OF_FILE, +}; + +class AudioDecoder { + public: + AudioDecoder(esphome::RingBuffer *input_ring_buffer, esphome::RingBuffer *output_ring_buffer, + size_t internal_buffer_size); + ~AudioDecoder(); + + esp_err_t start(MediaFileType media_file_type); + + AudioDecoderState decode(bool stop_gracefully); + + const optional &get_audio_stream_info() const { return this->audio_stream_info_; } + + protected: + esp_err_t allocate_buffers_(); + + FileDecoderState decode_flac_(); + FileDecoderState decode_mp3_(); + FileDecoderState decode_wav_(); + + esphome::RingBuffer *input_ring_buffer_; + esphome::RingBuffer *output_ring_buffer_; + size_t internal_buffer_size_; + + uint8_t *input_buffer_{nullptr}; + uint8_t *input_buffer_current_{nullptr}; + size_t input_buffer_length_; + + uint8_t *output_buffer_{nullptr}; + uint8_t *output_buffer_current_{nullptr}; + size_t output_buffer_length_; + + std::unique_ptr flac_decoder_; + + HMP3Decoder mp3_decoder_; + + std::unique_ptr wav_decoder_; + size_t wav_bytes_left_; + + MediaFileType media_file_type_{MediaFileType::NONE}; + optional audio_stream_info_{}; + + size_t potentially_failed_count_{0}; + bool end_of_file_{false}; +}; +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_mixer.cpp b/esphome/components/nabu/audio_mixer.cpp new file mode 100644 index 0000000000..61dcd4207d --- /dev/null +++ b/esphome/components/nabu/audio_mixer.cpp @@ -0,0 +1,382 @@ +#ifdef USE_ESP_IDF + +#include "audio_mixer.h" + +#include + +#include "esphome/core/hal.h" +#include "esphome/core/helpers.h" + +namespace esphome { +namespace nabu { + +static const size_t INPUT_RING_BUFFER_SAMPLES = 24000; +static const size_t OUTPUT_BUFFER_SAMPLES = 8192; +static const size_t QUEUE_COUNT = 20; + +static const uint32_t TASK_STACK_SIZE = 3072; +static const size_t TASK_DELAY_MS = 25; + +static const int16_t MAX_AUDIO_SAMPLE_VALUE = INT16_MAX; +static const int16_t MIN_AUDIO_SAMPLE_VALUE = INT16_MIN; + +esp_err_t AudioMixer::start(speaker::Speaker *speaker, const std::string &task_name, UBaseType_t priority) { + esp_err_t err = this->allocate_buffers_(); + + if (err != ESP_OK) { + return err; + } + + if (this->task_handle_ == nullptr) { + this->task_handle_ = xTaskCreateStatic(AudioMixer::audio_mixer_task_, task_name.c_str(), TASK_STACK_SIZE, + (void *) this, priority, this->stack_buffer_, &this->task_stack_); + } + + if (this->task_handle_ == nullptr) { + return ESP_FAIL; + } + + this->speaker_ = speaker; + + return ESP_OK; +} + +void AudioMixer::stop() { + vTaskDelete(this->task_handle_); + this->task_handle_ = nullptr; + + xQueueReset(this->event_queue_); + xQueueReset(this->command_queue_); +} + +void AudioMixer::suspend_task() { + if (this->task_handle_ != nullptr) { + vTaskSuspend(this->task_handle_); + } +} + +void AudioMixer::resume_task() { + if (this->task_handle_ != nullptr) { + vTaskResume(task_handle_); + } +} + 
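For illustration only (this block is not part of the diff): a minimal sketch of how a caller might drive the mixer through the API declared in audio_mixer.h later in this diff — start the task, request a 10 dB duck of the media stream, and poll for task events. The my_speaker pointer and the numeric values are placeholders.

// Hypothetical usage sketch of AudioMixer; my_speaker and all values are placeholders
AudioMixer mixer;
if (mixer.start(my_speaker, "mixer_task") == ESP_OK) {
  CommandEvent duck_event;
  duck_event.command = CommandEventType::DUCK;
  duck_event.decibel_reduction = 10;     // reduce the media stream by 10 dB
  duck_event.transition_samples = 8000;  // spread the transition over 8000 samples (~0.5 s at 16 kHz)
  mixer.send_command(&duck_event);

  TaskEvent event;
  while (mixer.read_event(&event)) {
    // Inspect event.type (STARTED, STOPPED, WARNING, ...) and event.err as needed
  }
}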
+void AudioMixer::audio_mixer_task_(void *params) { + AudioMixer *this_mixer = (AudioMixer *) params; + + TaskEvent event; + CommandEvent command_event; + + ExternalRAMAllocator allocator(ExternalRAMAllocator::ALLOW_FAILURE); + int16_t *media_buffer = allocator.allocate(OUTPUT_BUFFER_SAMPLES); + int16_t *announcement_buffer = allocator.allocate(OUTPUT_BUFFER_SAMPLES); + int16_t *combination_buffer = allocator.allocate(OUTPUT_BUFFER_SAMPLES); + + int16_t *combination_buffer_current = combination_buffer; + size_t combination_buffer_length = 0; + + if ((media_buffer == nullptr) || (announcement_buffer == nullptr)) { + event.type = EventType::WARNING; + event.err = ESP_ERR_NO_MEM; + xQueueSend(this_mixer->event_queue_, &event, portMAX_DELAY); + + event.type = EventType::STOPPED; + event.err = ESP_OK; + xQueueSend(this_mixer->event_queue_, &event, portMAX_DELAY); + + while (true) { + delay(TASK_DELAY_MS); + } + + return; + } + + // Handles media stream pausing + bool transfer_media = true; + + // Parameters to control the ducking dB reduction and its transitions + // There is a built in negative sign; e.g., reducing by 5 dB is changing the gain by -5 dB + int8_t target_ducking_db_reduction = 0; + int8_t current_ducking_db_reduction = 0; + + // Each step represents a change in 1 dB. Positive 1 means the dB reduction is increasing. Negative 1 means the dB + // reduction is decreasing. + int8_t db_change_per_ducking_step = 1; + + size_t ducking_transition_samples_remaining = 0; + size_t samples_per_ducking_step = 0; + + event.type = EventType::STARTED; + xQueueSend(this_mixer->event_queue_, &event, portMAX_DELAY); + + while (true) { + if (xQueueReceive(this_mixer->command_queue_, &command_event, 0) == pdTRUE) { + if (command_event.command == CommandEventType::STOP) { + break; + } else if (command_event.command == CommandEventType::DUCK) { + if (target_ducking_db_reduction != command_event.decibel_reduction) { + current_ducking_db_reduction = target_ducking_db_reduction; + + target_ducking_db_reduction = command_event.decibel_reduction; + + uint8_t total_ducking_steps = 0; + if (target_ducking_db_reduction > current_ducking_db_reduction) { + // The dB reduction level is increasing (which results in quieter audio) + total_ducking_steps = target_ducking_db_reduction - current_ducking_db_reduction - 1; + db_change_per_ducking_step = 1; + } else { + // The dB reduction level is decreasing (which results in louder audio) + total_ducking_steps = current_ducking_db_reduction - target_ducking_db_reduction - 1; + db_change_per_ducking_step = -1; + } + if (total_ducking_steps > 0) { + ducking_transition_samples_remaining = command_event.transition_samples; + + samples_per_ducking_step = ducking_transition_samples_remaining / total_ducking_steps; + } else { + ducking_transition_samples_remaining = 0; + } + } + } else if (command_event.command == CommandEventType::PAUSE_MEDIA) { + transfer_media = false; + } else if (command_event.command == CommandEventType::RESUME_MEDIA) { + transfer_media = true; + } else if (command_event.command == CommandEventType::CLEAR_MEDIA) { + this_mixer->media_ring_buffer_->reset(); + } else if (command_event.command == CommandEventType::CLEAR_ANNOUNCEMENT) { + this_mixer->announcement_ring_buffer_->reset(); + } + } + + if (combination_buffer_length > 0) { + size_t output_bytes_written = this_mixer->speaker_->play((uint8_t *) combination_buffer, + combination_buffer_length, pdMS_TO_TICKS(TASK_DELAY_MS)); + combination_buffer_length -= output_bytes_written; + if 
((combination_buffer_length > 0) && (output_bytes_written > 0)) { + memmove(combination_buffer, combination_buffer + output_bytes_written / sizeof(int16_t), + combination_buffer_length); + } + } else { + size_t media_available = this_mixer->media_ring_buffer_->available(); + size_t announcement_available = this_mixer->announcement_ring_buffer_->available(); + + if (media_available * transfer_media + announcement_available > 0) { + size_t bytes_to_read = OUTPUT_BUFFER_SAMPLES * sizeof(int16_t); + + if (media_available * transfer_media > 0) { + bytes_to_read = std::min(bytes_to_read, media_available); + } + + if (announcement_available > 0) { + bytes_to_read = std::min(bytes_to_read, announcement_available); + } + + if (bytes_to_read > 0) { + size_t media_bytes_read = 0; + if (media_available * transfer_media > 0) { + media_bytes_read = this_mixer->media_ring_buffer_->read((void *) media_buffer, bytes_to_read, 0); + if (media_bytes_read > 0) { + size_t samples_read = media_bytes_read / sizeof(int16_t); + if (ducking_transition_samples_remaining > 0) { + // Ducking level is still transitioning + + size_t samples_left = ducking_transition_samples_remaining; + + // There may be more than one step worth of samples to duck in the buffers, so manage positions + int16_t *current_media_buffer = media_buffer; + + size_t samples_left_in_step = samples_left % samples_per_ducking_step; + if (samples_left_in_step == 0) { + // Start of a new ducking step + + current_ducking_db_reduction += db_change_per_ducking_step; + samples_left_in_step = samples_per_ducking_step; + } + size_t samples_left_to_duck = std::min(samples_left_in_step, samples_read); + + size_t total_samples_ducked = 0; + + while (samples_left_to_duck > 0) { + // Ensure we only point to valid index in the Q15 scaling factor table + uint8_t safe_db_reduction_index = + clamp(current_ducking_db_reduction, 0, decibel_reduction_table.size() - 1); + + int16_t q15_scale_factor = decibel_reduction_table[safe_db_reduction_index]; + this_mixer->scale_audio_samples_(current_media_buffer, current_media_buffer, q15_scale_factor, + samples_left_to_duck); + + current_media_buffer += samples_left_to_duck; + + samples_read -= samples_left_to_duck; + samples_left -= samples_left_to_duck; + + total_samples_ducked += samples_left_to_duck; + + samples_left_in_step = samples_left % samples_per_ducking_step; + if (samples_left_in_step == 0) { + // Start of a new step + + current_ducking_db_reduction += db_change_per_ducking_step; + samples_left_in_step = samples_per_ducking_step; + } + samples_left_to_duck = std::min(samples_left_in_step, samples_read); + } + } else if (target_ducking_db_reduction > 0) { + // We still need to apply a ducking scaling, but we are done transitioning + + uint8_t safe_db_reduction_index = + clamp(target_ducking_db_reduction, 0, decibel_reduction_table.size() - 1); + + int16_t q15_scale_factor = decibel_reduction_table[safe_db_reduction_index]; + this_mixer->scale_audio_samples_(media_buffer, media_buffer, q15_scale_factor, samples_read); + } + } + } + + size_t announcement_bytes_read = 0; + if (announcement_available > 0) { + announcement_bytes_read = + this_mixer->announcement_ring_buffer_->read((void *) announcement_buffer, bytes_to_read, 0); + } + + if ((media_bytes_read > 0) && (announcement_bytes_read > 0)) { + // We have both a media and an announcement stream, so mix them together + + size_t samples_read = bytes_to_read / sizeof(int16_t); + + this_mixer->mix_audio_samples_without_clipping_(media_buffer, announcement_buffer, 
combination_buffer, + samples_read); + + combination_buffer_length = samples_read * sizeof(int16_t); + } else if (media_bytes_read > 0) { + memcpy(combination_buffer, media_buffer, media_bytes_read); + combination_buffer_length = media_bytes_read; + } else if (announcement_bytes_read > 0) { + memcpy(combination_buffer, announcement_buffer, announcement_bytes_read); + combination_buffer_length = announcement_bytes_read; + } + + size_t samples_written = combination_buffer_length / sizeof(int16_t); + if (ducking_transition_samples_remaining > 0) { + ducking_transition_samples_remaining -= std::min(samples_written, ducking_transition_samples_remaining); + } + } + } else { + // No audio data available in either buffer + + delay(TASK_DELAY_MS); + } + } + } + + event.type = EventType::STOPPING; + xQueueSend(this_mixer->event_queue_, &event, portMAX_DELAY); + + this_mixer->reset_ring_buffers_(); + allocator.deallocate(media_buffer, OUTPUT_BUFFER_SAMPLES); + allocator.deallocate(announcement_buffer, OUTPUT_BUFFER_SAMPLES); + allocator.deallocate(combination_buffer, OUTPUT_BUFFER_SAMPLES); + + event.type = EventType::STOPPED; + xQueueSend(this_mixer->event_queue_, &event, portMAX_DELAY); + + while (true) { + delay(TASK_DELAY_MS); + } +} + +esp_err_t AudioMixer::allocate_buffers_() { + if (this->media_ring_buffer_ == nullptr) + this->media_ring_buffer_ = RingBuffer::create(INPUT_RING_BUFFER_SAMPLES * sizeof(int16_t)); + + if (this->announcement_ring_buffer_ == nullptr) + this->announcement_ring_buffer_ = RingBuffer::create(INPUT_RING_BUFFER_SAMPLES * sizeof(int16_t)); + + if ((this->announcement_ring_buffer_ == nullptr) || (this->media_ring_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + if (this->stack_buffer_ == nullptr) + this->stack_buffer_ = (StackType_t *) malloc(TASK_STACK_SIZE); + + if (this->stack_buffer_ == nullptr) { + return ESP_ERR_NO_MEM; + } + + if (this->event_queue_ == nullptr) + this->event_queue_ = xQueueCreate(QUEUE_COUNT, sizeof(TaskEvent)); + + if (this->command_queue_ == nullptr) + this->command_queue_ = xQueueCreate(QUEUE_COUNT, sizeof(CommandEvent)); + + if ((this->event_queue_ == nullptr) || (this->command_queue_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + return ESP_OK; +} + +void AudioMixer::reset_ring_buffers_() { + this->media_ring_buffer_->reset(); + this->announcement_ring_buffer_->reset(); +} + +void AudioMixer::mix_audio_samples_without_clipping_(int16_t *media_buffer, int16_t *announcement_buffer, + int16_t *combination_buffer, size_t samples_to_mix) { + // We first test adding the two clips samples together and check for any clipping + // We want the announcement volume to be consistent, regardless if media is playing or not + // If there is clipping, we determine what factor we need to multiply that media sample by to avoid it + // We take the smallest factor necessary for all the samples so the media volume is consistent on this batch + // of samples + // Note: This may not be the best approach. Adding 2 audio samples together makes both sound louder, even if + // we are not clipping. As a result, the mixed announcement will sound louder (by around 3dB if the audio + // streams are independent?) than if it were by itself. 
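To make the comment above concrete, here is a worked single-sample example with made-up values (not taken from the diff); the loop below performs the same Q15 computation for every sample pair that clips:

// Illustrative only, hypothetical values: one clipping sample pair worked through in Q15
int32_t media = 20000, announcement = 18000;                                // sum = 38000 > INT16_MAX, so it clips
int32_t q30_safe_max = (32768 - std::abs(announcement)) << 15;              // largest safe media magnitude, in Q30
int16_t q15_factor = static_cast<int16_t>(q30_safe_max / std::abs(media));  // 24195, roughly 0.74 in Q15
int16_t scaled_media = static_cast<int16_t>((media * q15_factor) >> 15);    // 14766
int16_t mixed = static_cast<int16_t>(scaled_media + announcement);          // 32766, back within int16_t range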
+ + int16_t q15_scaling_factor = MAX_AUDIO_SAMPLE_VALUE; + + for (int i = 0; i < samples_to_mix; ++i) { + int32_t added_sample = static_cast(media_buffer[i]) + static_cast(announcement_buffer[i]); + + if ((added_sample > MAX_AUDIO_SAMPLE_VALUE) || (added_sample < MIN_AUDIO_SAMPLE_VALUE)) { + // The largest magnitude the media sample can be to avoid clipping (converted to Q30 fixed point) + int32_t q30_media_sample_safe_max = + static_cast(std::abs(MIN_AUDIO_SAMPLE_VALUE) - std::abs(announcement_buffer[i])) << 15; + + // Actual media sample value (Q15 number stored in an int32 for future division) + int32_t media_sample_value = abs(media_buffer[i]); + + // Calculation to perform the Q15 division for media_sample_safe_max/media_sample_value + // Reference: https://sestevenson.wordpress.com/2010/09/20/fixed-point-division-2/ (accessed August 15, + // 2024) + int16_t necessary_q15_factor = static_cast(q30_media_sample_safe_max / media_sample_value); + // Take the minimum scaling factor (the smaller the factor, the more it needs to be scaled down) + q15_scaling_factor = std::min(necessary_q15_factor, q15_scaling_factor); + } else { + // Store the combined samples in the combination buffer. If we do not need to scale, then the samples are already + // mixed. + combination_buffer[i] = added_sample; + } + } + + if (q15_scaling_factor < MAX_AUDIO_SAMPLE_VALUE) { + // Need to scale to avoid clipping + + this->scale_audio_samples_(media_buffer, media_buffer, q15_scaling_factor, samples_to_mix); + + // Mix both stream by adding them together with no bitshift + // The dsps_add functions have the following inputs: + // (buffer 1, buffer 2, output buffer, number of samples, buffer 1 step, buffer 2 step, output, buffer step, + // bitshift) + dsps_add_s16(media_buffer, announcement_buffer, combination_buffer, samples_to_mix, 1, 1, 1, 0); + } +} + +void AudioMixer::scale_audio_samples_(int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor, + size_t samples_to_scale) { + // Scale the audio samples and store them in the output buffer + dsps_mulc_s16(audio_samples, output_buffer, samples_to_scale, scale_factor, 1, 1); +} + +} // namespace nabu +} // namespace esphome +#endif diff --git a/esphome/components/nabu/audio_mixer.h b/esphome/components/nabu/audio_mixer.h new file mode 100644 index 0000000000..7e4cef75f5 --- /dev/null +++ b/esphome/components/nabu/audio_mixer.h @@ -0,0 +1,159 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "esphome/components/media_player/media_player.h" +#include "esphome/components/speaker/speaker.h" + +#include "esphome/core/hal.h" +#include "esphome/core/helpers.h" +#include "esphome/core/ring_buffer.h" + +#include +#include + +namespace esphome { +namespace nabu { + +// Mixes two incoming audio streams together +// - The media stream intended for music playback +// - Able to duck (made quieter) +// - Able to pause +// - The announcement stream is intended for TTS reponses or various beeps/sound effects +// - Unable to duck +// - Unable to pause +// - Each stream has a corresponding input ring buffer. Retrieved via the `get_media_ring_buffer` and +// `get_announcement_ring_buffer` functions +// - The mixed audio is sent to the configured speaker component. +// - The mixer runs as a FreeRTOS task +// - The task reports its state using the TaskEvent queue. Regularly call the `read_event` function to obtain the +// current state +// - Commands are sent to the task using a the CommandEvent queue. Use the `send_command` function to do so. 
+// - Use the `start` function to initiate. The `stop` function deletes the task, but be sure to send a STOP command +// first to avoid memory leaks. + +enum class EventType : uint8_t { + STARTING = 0, + STARTED, + RUNNING, + IDLE, + STOPPING, + STOPPED, + WARNING = 255, +}; + +// Used for reporting the state of the mixer task +struct TaskEvent { + EventType type; + esp_err_t err; +}; + +enum class CommandEventType : uint8_t { + STOP, // Stop mixing to prepare for stopping the mixing task + DUCK, // Duck the media audio + PAUSE_MEDIA, // Pauses the media stream + RESUME_MEDIA, // Resumes the media stream + CLEAR_MEDIA, // Resets the media ring buffer + CLEAR_ANNOUNCEMENT, // Resets the announcement ring buffer +}; + +// Used to send commands to the mixer task +struct CommandEvent { + CommandEventType command; + uint8_t decibel_reduction; + size_t transition_samples = 0; +}; + +// Gives the Q15 fixed point scaling factor to reduce by 0 dB, 1dB, ..., 50 dB +// dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014) +// float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15) +static const std::vector decibel_reduction_table = { + 32767, 29201, 26022, 23189, 20665, 18415, 16410, 14624, 13032, 11613, 10349, 9222, 8218, 7324, 6527, 5816, 5183, + 4619, 4116, 3668, 3269, 2913, 2596, 2313, 2061, 1837, 1637, 1459, 1300, 1158, 1032, 920, 820, 731, + 651, 580, 517, 461, 411, 366, 326, 291, 259, 231, 206, 183, 163, 146, 130, 116, 103}; + +class AudioMixer { + public: + /// @brief Sends a CommandEvent to the command queue + /// @param command Pointer to CommandEvent object to be sent + /// @param ticks_to_wait The number of FreeRTOS ticks to wait for an event to appear on the queue. Defaults to 0. + /// @return pdTRUE if successful, pdFALSE otherwises + BaseType_t send_command(CommandEvent *command, TickType_t ticks_to_wait = portMAX_DELAY) { + return xQueueSend(this->command_queue_, command, ticks_to_wait); + } + + /// @brief Reads a TaskEvent from the event queue indicating its current status + /// @param event Pointer to TaskEvent object to store the event in + /// @param ticks_to_wait The number of FreeRTOS ticks to wait for an event to appear on the queue. Defaults to 0. + /// @return pdTRUE if successful, pdFALSE otherwise + BaseType_t read_event(TaskEvent *event, TickType_t ticks_to_wait = 0) { + return xQueueReceive(this->event_queue_, event, ticks_to_wait); + } + + /// @brief Starts the mixer task + /// @param speaker Pointer to Speaker component + /// @param task_name FreeRTOS task name + /// @param priority FreeRTOS task priority. 
Defaults to 1 + /// @return ESP_OK if successful, and error otherwise + esp_err_t start(speaker::Speaker *speaker, const std::string &task_name, UBaseType_t priority = 1); + + /// @brief Stops the mixer task and clears the queues + void stop(); + + /// @brief Retrieves the media stream's ring buffer pointer + /// @return pointer to media ring buffer + RingBuffer *get_media_ring_buffer() { return this->media_ring_buffer_.get(); } + + /// @brief Retrieves the announcement stream's ring buffer pointer + /// @return pointer to announcement ring buffer + RingBuffer *get_announcement_ring_buffer() { return this->announcement_ring_buffer_.get(); } + + /// @brief Suspends the mixer task + void suspend_task(); + /// @brief Resumes the mixer task + void resume_task(); + + protected: + /// @brief Allocates the ring buffers, task stack, and queues + /// @return ESP_OK if successful or an error otherwise + esp_err_t allocate_buffers_(); + + /// @brief Resets the media and anouncement ring buffers + void reset_ring_buffers_(); + + /// @brief Mixes the media and announcement samples. If the resulting audio clips, the media samples are first scaled. + /// @param media_buffer buffer for media samples + /// @param announcement_buffer buffer for announcement samples + /// @param combination_buffer buffer for the mixed samples + /// @param samples_to_mix number of samples in the media and annoucnement buffers to mix together + void mix_audio_samples_without_clipping_(int16_t *media_buffer, int16_t *announcement_buffer, + int16_t *combination_buffer, size_t samples_to_mix); + + /// @brief Scales audio samples. Scales in place when audio_samples == output_buffer. + /// @param audio_samples PCM int16 audio samples + /// @param output_buffer Buffer to store the scaled samples + /// @param scale_factor Q15 fixed point scaling factor + /// @param samples_to_scale Number of samples to scale + void scale_audio_samples_(int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor, + size_t samples_to_scale); + + static void audio_mixer_task_(void *params); + TaskHandle_t task_handle_{nullptr}; + StaticTask_t task_stack_; + StackType_t *stack_buffer_{nullptr}; + + // Reports events from the mixer task + QueueHandle_t event_queue_; + + // Stores commands to send the mixer task + QueueHandle_t command_queue_; + + speaker::Speaker *speaker_{nullptr}; + + std::unique_ptr media_ring_buffer_; + std::unique_ptr announcement_ring_buffer_; +}; +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_pipeline.cpp b/esphome/components/nabu/audio_pipeline.cpp new file mode 100644 index 0000000000..f405808d1b --- /dev/null +++ b/esphome/components/nabu/audio_pipeline.cpp @@ -0,0 +1,540 @@ +#ifdef USE_ESP_IDF + +#include "audio_pipeline.h" + +#include "esphome/core/helpers.h" +#include "esphome/core/log.h" + +namespace esphome { +namespace nabu { + +static const size_t FILE_BUFFER_SIZE = 32 * 1024; +static const size_t FILE_RING_BUFFER_SIZE = 64 * 1024; +static const size_t BUFFER_SIZE_SAMPLES = 32768; +static const size_t BUFFER_SIZE_BYTES = BUFFER_SIZE_SAMPLES * sizeof(int16_t); + +static const uint32_t READER_TASK_STACK_SIZE = 5 * 1024; +static const uint32_t DECODER_TASK_STACK_SIZE = 3 * 1024; +static const uint32_t RESAMPLER_TASK_STACK_SIZE = 3 * 1024; + +static const size_t INFO_ERROR_QUEUE_COUNT = 5; + +static const char *const TAG = "nabu_media_player.pipeline"; + +enum EventGroupBits : uint32_t { + // The stop() function clears all unfinished bits + // MESSAGE_* bits are only 
set by their respective tasks + + // Stops all activity in the pipeline elements and set by stop() or by each task + PIPELINE_COMMAND_STOP = (1 << 0), + + // Read audio from an HTTP source; cleared by reader task and set by start(uri,...) + READER_COMMAND_INIT_HTTP = (1 << 4), + // Read audio from an audio file from the flash; cleared by reader task and set by start(media_file,...) + READER_COMMAND_INIT_FILE = (1 << 5), + + // Audio file type is read after checking it is supported; cleared by decoder task + READER_MESSAGE_LOADED_MEDIA_TYPE = (1 << 6), + // Reader is done (either through a failure or just end of the stream); cleared by reader task + READER_MESSAGE_FINISHED = (1 << 7), + // Error reading the file; cleared by get_state() + READER_MESSAGE_ERROR = (1 << 8), + + // Decoder has determined the stream information; cleared by resampler + DECODER_MESSAGE_LOADED_STREAM_INFO = (1 << 11), + // Decoder is done (either through a faiilure or the end of the stream); cleared by decoder task + DECODER_MESSAGE_FINISHED = (1 << 12), + // Error decoding the file; cleared by get_state() by decoder task + DECODER_MESSAGE_ERROR = (1 << 13), + + // Resampler is done (either through a failure or the end of the stream); cleared by resampler task + RESAMPLER_MESSAGE_FINISHED = (1 << 17), + // Error resampling the file; cleared by get_state() + RESAMPLER_MESSAGE_ERROR = (1 << 18), + + // Cleared by respective tasks + FINISHED_BITS = READER_MESSAGE_FINISHED | DECODER_MESSAGE_FINISHED | RESAMPLER_MESSAGE_FINISHED, + UNFINISHED_BITS = ~(FINISHED_BITS | 0xff000000), // Only 24 bits are valid for the event group, so make sure first 8 + // bits of uint32 are not set; cleared by stop() +}; + +AudioPipeline::AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type) { + this->mixer_ = mixer; + this->pipeline_type_ = pipeline_type; +} + +esp_err_t AudioPipeline::start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name, + UBaseType_t priority) { + esp_err_t err = this->common_start_(target_sample_rate, task_name, priority); + + if (err == ESP_OK) { + this->current_uri_ = uri; + xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_HTTP); + } + + return err; +} + +esp_err_t AudioPipeline::start(MediaFile *media_file, uint32_t target_sample_rate, const std::string &task_name, + UBaseType_t priority) { + esp_err_t err = this->common_start_(target_sample_rate, task_name, priority); + + if (err == ESP_OK) { + this->current_media_file_ = media_file; + xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_FILE); + } + + return err; +} + +esp_err_t AudioPipeline::allocate_buffers_() { + if (this->raw_file_ring_buffer_ == nullptr) + this->raw_file_ring_buffer_ = RingBuffer::create(FILE_RING_BUFFER_SIZE); + + if (this->decoded_ring_buffer_ == nullptr) + this->decoded_ring_buffer_ = RingBuffer::create(BUFFER_SIZE_BYTES); + + if ((this->raw_file_ring_buffer_ == nullptr) || (this->decoded_ring_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + if (this->read_task_stack_buffer_ == nullptr) + this->read_task_stack_buffer_ = (StackType_t *) malloc(READER_TASK_STACK_SIZE); + + if (this->decode_task_stack_buffer_ == nullptr) + this->decode_task_stack_buffer_ = (StackType_t *) malloc(DECODER_TASK_STACK_SIZE); + + if (this->resample_task_stack_buffer_ == nullptr) + this->resample_task_stack_buffer_ = (StackType_t *) malloc(RESAMPLER_TASK_STACK_SIZE); + + if ((this->read_task_stack_buffer_ == nullptr) || (this->decode_task_stack_buffer_ == nullptr) || + 
(this->resample_task_stack_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + if (this->event_group_ == nullptr) + this->event_group_ = xEventGroupCreate(); + + if (this->event_group_ == nullptr) { + return ESP_ERR_NO_MEM; + } + + if (this->info_error_queue_ == nullptr) + this->info_error_queue_ = xQueueCreate(INFO_ERROR_QUEUE_COUNT, sizeof(InfoErrorEvent)); + + if (this->info_error_queue_ == nullptr) + return ESP_ERR_NO_MEM; + + return ESP_OK; +} + +esp_err_t AudioPipeline::common_start_(uint32_t target_sample_rate, const std::string &task_name, + UBaseType_t priority) { + esp_err_t err = this->allocate_buffers_(); + if (err != ESP_OK) { + return err; + } + + if (this->read_task_handle_ == nullptr) { + this->read_task_handle_ = + xTaskCreateStatic(AudioPipeline::read_task_, (task_name + "_read").c_str(), READER_TASK_STACK_SIZE, + (void *) this, priority, this->read_task_stack_buffer_, &this->read_task_stack_); + } + if (this->decode_task_handle_ == nullptr) { + this->decode_task_handle_ = + xTaskCreateStatic(AudioPipeline::decode_task_, (task_name + "_decode").c_str(), DECODER_TASK_STACK_SIZE, + (void *) this, priority, this->decode_task_stack_buffer_, &this->decode_task_stack_); + } + if (this->resample_task_handle_ == nullptr) { + this->resample_task_handle_ = + xTaskCreateStatic(AudioPipeline::resample_task_, (task_name + "_resample").c_str(), RESAMPLER_TASK_STACK_SIZE, + (void *) this, priority, this->resample_task_stack_buffer_, &this->resample_task_stack_); + } + + if ((this->read_task_handle_ == nullptr) || (this->decode_task_handle_ == nullptr) || + (this->resample_task_handle_ == nullptr)) { + return ESP_FAIL; + } + + this->target_sample_rate_ = target_sample_rate; + + return this->stop(); +} + +AudioPipelineState AudioPipeline::get_state() { + InfoErrorEvent event; + if (this->info_error_queue_ != nullptr) { + while (xQueueReceive(this->info_error_queue_, &event, 0)) { + switch (event.source) { + case InfoErrorSource::READER: + if (event.err.has_value()) { + ESP_LOGE(TAG, "Media reader encountered an error: %s", esp_err_to_name(event.err.value())); + } else if (event.file_type.has_value()) { + ESP_LOGD(TAG, "Reading %s file type", media_player_file_type_to_string(event.file_type.value())); + } + + break; + case InfoErrorSource::DECODER: + if (event.err.has_value()) { + ESP_LOGE(TAG, "Decoder encountered an error: %s", esp_err_to_name(event.err.value())); + } + + if (event.audio_stream_info.has_value()) { + ESP_LOGD(TAG, "Decoded audio has %d channels, %" PRId32 " Hz sample rate, and %d bits per sample", + event.audio_stream_info.value().channels, event.audio_stream_info.value().sample_rate, + event.audio_stream_info.value().bits_per_sample); + } + + if (event.decoding_err.has_value()) { + switch (event.decoding_err.value()) { + case DecodingError::FAILED_HEADER: + ESP_LOGE(TAG, "Failed to parse the file's header."); + break; + case DecodingError::INCOMPATIBLE_BITS_PER_SAMPLE: + ESP_LOGE(TAG, "Incompatible bits per sample. Only 16 bits per sample is supported"); + break; + case DecodingError::INCOMPATIBLE_CHANNELS: + ESP_LOGE(TAG, "Incompatible number of channels. 
Only 1 or 2 channel audio is supported.");
+                break;
+            }
+          }
+          break;
+        case InfoErrorSource::RESAMPLER:
+          if (event.err.has_value()) {
+            ESP_LOGE(TAG, "Resampler encountered an error: %s", esp_err_to_name(event.err.value()));
+          } else if (event.resample_info.has_value()) {
+            if (event.resample_info.value().resample) {
+              ESP_LOGD(TAG, "Converting the audio sample rate");
+            }
+            if (event.resample_info.value().mono_to_stereo) {
+              ESP_LOGD(TAG, "Converting mono channel audio to stereo channel audio");
+            }
+          }
+          break;
+      }
+    }
+  }
+
+  EventBits_t event_bits = xEventGroupGetBits(this->event_group_);
+  if (!this->read_task_handle_ && !this->decode_task_handle_ && !this->resample_task_handle_) {
+    return AudioPipelineState::STOPPED;
+  }
+
+  if ((event_bits & READER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, READER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_READING;
+  }
+
+  if ((event_bits & DECODER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, DECODER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_DECODING;
+  }
+
+  if ((event_bits & RESAMPLER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, RESAMPLER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_RESAMPLING;
+  }
+
+  if ((event_bits & READER_MESSAGE_FINISHED) && (event_bits & DECODER_MESSAGE_FINISHED) &&
+      (event_bits & RESAMPLER_MESSAGE_FINISHED)) {
+    return AudioPipelineState::STOPPED;
+  }
+
+  return AudioPipelineState::PLAYING;
+}
+
+esp_err_t AudioPipeline::stop() {
+  xEventGroupSetBits(this->event_group_, PIPELINE_COMMAND_STOP);
+
+  uint32_t event_group_bits = xEventGroupWaitBits(this->event_group_,
+                                                  FINISHED_BITS,        // Bit message to read
+                                                  pdFALSE,              // Clear the bits on exit
+                                                  pdTRUE,               // Wait for all the bits,
+                                                  pdMS_TO_TICKS(300));  // Duration to block/wait
+
+  if (!(event_group_bits & READER_MESSAGE_FINISHED)) {
+    // Reader failed to stop
+    xEventGroupSetBits(this->event_group_, EventGroupBits::READER_MESSAGE_ERROR);
+  }
+  if (!(event_group_bits & DECODER_MESSAGE_FINISHED)) {
+    // Decoder failed to stop
+    xEventGroupSetBits(this->event_group_, EventGroupBits::DECODER_MESSAGE_ERROR);
+  }
+  if (!(event_group_bits & RESAMPLER_MESSAGE_FINISHED)) {
+    // Resampler failed to stop
+    xEventGroupSetBits(this->event_group_, EventGroupBits::RESAMPLER_MESSAGE_ERROR);
+  }
+
+  if ((event_group_bits & FINISHED_BITS) != FINISHED_BITS) {
+    // Not all bits were set, so it timed out
+    return ESP_ERR_TIMEOUT;
+  }
+
+  // Clear the ring buffer in the mixer; avoids playing incorrect audio when starting a new file while paused
+  CommandEvent command_event;
+  if (this->pipeline_type_ == AudioPipelineType::MEDIA) {
+    command_event.command = CommandEventType::CLEAR_MEDIA;
+  } else {
+    command_event.command = CommandEventType::CLEAR_ANNOUNCEMENT;
+  }
+  this->mixer_->send_command(&command_event);
+
+  xEventGroupClearBits(this->event_group_, UNFINISHED_BITS);
+  this->reset_ring_buffers();
+
+  return ESP_OK;
+}
+
+void AudioPipeline::reset_ring_buffers() {
+  this->raw_file_ring_buffer_->reset();
+  this->decoded_ring_buffer_->reset();
+}
+
+void AudioPipeline::suspend_tasks() {
+  if (this->read_task_handle_ != nullptr) {
+    vTaskSuspend(this->read_task_handle_);
+  }
+  if (this->decode_task_handle_ != nullptr) {
+    vTaskSuspend(this->decode_task_handle_);
+  }
+  if (this->resample_task_handle_ != nullptr) {
+    vTaskSuspend(this->resample_task_handle_);
+  }
+}
+
+void AudioPipeline::resume_tasks() {
+  if (this->read_task_handle_ != nullptr) {
+    vTaskResume(this->read_task_handle_);
+  }
+  if
(this->decode_task_handle_ != nullptr) { + vTaskResume(this->decode_task_handle_); + } + if (this->resample_task_handle_ != nullptr) { + vTaskResume(this->resample_task_handle_); + } +} + +void AudioPipeline::read_task_(void *params) { + AudioPipeline *this_pipeline = (AudioPipeline *) params; + + while (true) { + xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::READER_MESSAGE_FINISHED); + + // Wait until the pipeline notifies us the source of the media file + EventBits_t event_bits = + xEventGroupWaitBits(this_pipeline->event_group_, + READER_COMMAND_INIT_FILE | READER_COMMAND_INIT_HTTP, // Bit message to read + pdTRUE, // Clear the bit on exit + pdFALSE, // Wait for all the bits, + portMAX_DELAY); // Block indefinitely until bit is set + + xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::READER_MESSAGE_FINISHED); + + { + InfoErrorEvent event; + event.source = InfoErrorSource::READER; + esp_err_t err = ESP_OK; + + AudioReader reader = AudioReader(this_pipeline->raw_file_ring_buffer_.get(), FILE_BUFFER_SIZE); + + if (event_bits & READER_COMMAND_INIT_FILE) { + err = reader.start(this_pipeline->current_media_file_, this_pipeline->current_media_file_type_); + } else { + err = reader.start(this_pipeline->current_uri_, this_pipeline->current_media_file_type_); + } + if (err != ESP_OK) { + // Send specific error message + event.err = err; + xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY); + + // Setting up the reader failed, stop the pipeline + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::READER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + } else { + // Send the file type to the pipeline + event.file_type = this_pipeline->current_media_file_type_; + xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY); + + // Inform the decoder that the media type is available + xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::READER_MESSAGE_LOADED_MEDIA_TYPE); + } + + while (true) { + event_bits = xEventGroupGetBits(this_pipeline->event_group_); + + if (event_bits & PIPELINE_COMMAND_STOP) { + break; + } + + AudioReaderState reader_state = reader.read(); + + if (reader_state == AudioReaderState::FINISHED) { + break; + } else if (reader_state == AudioReaderState::FAILED) { + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::READER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + break; + } + } + } + } +} + +void AudioPipeline::decode_task_(void *params) { + AudioPipeline *this_pipeline = (AudioPipeline *) params; + + while (true) { + xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_FINISHED); + + // Wait until the reader notifies us that the media type is available + EventBits_t event_bits = xEventGroupWaitBits(this_pipeline->event_group_, + READER_MESSAGE_LOADED_MEDIA_TYPE, // Bit message to read + pdTRUE, // Clear the bit on exit + pdFALSE, // Wait for all the bits, + portMAX_DELAY); // Block indefinitely until bit is set + + xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_FINISHED); + + { + InfoErrorEvent event; + event.source = InfoErrorSource::DECODER; + + std::unique_ptr decoder = make_unique( + this_pipeline->raw_file_ring_buffer_.get(), this_pipeline->decoded_ring_buffer_.get(), FILE_BUFFER_SIZE); + esp_err_t err = decoder->start(this_pipeline->current_media_file_type_); + + if (err != ESP_OK) { + // Send specific error message + event.err = err; + xQueueSend(this_pipeline->info_error_queue_, &event, 
portMAX_DELAY); + + // Setting up the decoder failed, stop the pipeline + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + } + + bool has_stream_info = false; + + while (true) { + event_bits = xEventGroupGetBits(this_pipeline->event_group_); + + if (event_bits & PIPELINE_COMMAND_STOP) { + break; + } + + // Stop gracefully if the reader has finished + AudioDecoderState decoder_state = decoder->decode(event_bits & READER_MESSAGE_FINISHED); + + if (decoder_state == AudioDecoderState::FINISHED) { + break; + } else if (decoder_state == AudioDecoderState::FAILED) { + if (!has_stream_info) { + event.decoding_err = DecodingError::FAILED_HEADER; + xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY); + } + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + break; + } + + if (!has_stream_info && decoder->get_audio_stream_info().has_value()) { + has_stream_info = true; + + this_pipeline->current_audio_stream_info_ = decoder->get_audio_stream_info().value(); + + // Send the stream information to the pipeline + event.audio_stream_info = this_pipeline->current_audio_stream_info_; + + if (this_pipeline->current_audio_stream_info_.bits_per_sample != 16) { + // Error state, incompatible bits per sample + event.decoding_err = DecodingError::INCOMPATIBLE_BITS_PER_SAMPLE; + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + } else if ((this_pipeline->current_audio_stream_info_.channels > 2)) { + // Error state, incompatible number of channels + event.decoding_err = DecodingError::INCOMPATIBLE_CHANNELS; + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + } else { + // Inform the resampler that the stream information is available + xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_LOADED_STREAM_INFO); + } + + xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY); + } + } + } + } +} + +void AudioPipeline::resample_task_(void *params) { + AudioPipeline *this_pipeline = (AudioPipeline *) params; + + while (true) { + xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::RESAMPLER_MESSAGE_FINISHED); + + // Wait until the decoder notifies us that the stream information is available + EventBits_t event_bits = xEventGroupWaitBits(this_pipeline->event_group_, + DECODER_MESSAGE_LOADED_STREAM_INFO, // Bit message to read + pdTRUE, // Clear the bit on exit + pdFALSE, // Wait for all the bits, + portMAX_DELAY); // Block indefinitely until bit is set + + xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::RESAMPLER_MESSAGE_FINISHED); + + { + InfoErrorEvent event; + event.source = InfoErrorSource::RESAMPLER; + + RingBuffer *output_ring_buffer = nullptr; + + if (this_pipeline->pipeline_type_ == AudioPipelineType::MEDIA) { + output_ring_buffer = this_pipeline->mixer_->get_media_ring_buffer(); + } else { + output_ring_buffer = this_pipeline->mixer_->get_announcement_ring_buffer(); + } + + AudioResampler resampler = + AudioResampler(this_pipeline->decoded_ring_buffer_.get(), output_ring_buffer, BUFFER_SIZE_SAMPLES); + + esp_err_t err = resampler.start(this_pipeline->current_audio_stream_info_, this_pipeline->target_sample_rate_, + this_pipeline->current_resample_info_); + + if (err != ESP_OK) { + // Send specific error message 
+ event.err = err; + xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY); + + // Setting up the resampler failed, stop the pipeline + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::RESAMPLER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + } else { + event.resample_info = this_pipeline->current_resample_info_; + xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY); + } + + while (true) { + event_bits = xEventGroupGetBits(this_pipeline->event_group_); + + if (event_bits & PIPELINE_COMMAND_STOP) { + break; + } + + // Stop gracefully if the decoder is done + AudioResamplerState resampler_state = resampler.resample(event_bits & DECODER_MESSAGE_FINISHED); + + if (resampler_state == AudioResamplerState::FINISHED) { + break; + } else if (resampler_state == AudioResamplerState::FAILED) { + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::RESAMPLER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + break; + } + } + } + } +} + +} // namespace nabu +} // namespace esphome +#endif diff --git a/esphome/components/nabu/audio_pipeline.h b/esphome/components/nabu/audio_pipeline.h new file mode 100644 index 0000000000..03d88b76fc --- /dev/null +++ b/esphome/components/nabu/audio_pipeline.h @@ -0,0 +1,153 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "audio_reader.h" +#include "audio_decoder.h" +#include "audio_resampler.h" +#include "audio_mixer.h" +#include "nabu_media_helpers.h" + +#include "esphome/components/audio/audio.h" + +#include "esphome/core/hal.h" +#include "esphome/core/helpers.h" +#include "esphome/core/ring_buffer.h" + +#include +#include +#include + +namespace esphome { +namespace nabu { + +enum class AudioPipelineType : uint8_t { + MEDIA, + ANNOUNCEMENT, +}; + +enum class AudioPipelineState : uint8_t { + PLAYING, + STOPPED, + ERROR_READING, + ERROR_DECODING, + ERROR_RESAMPLING, +}; + +enum class InfoErrorSource : uint8_t { + READER = 0, + DECODER, + RESAMPLER, +}; + +enum class DecodingError : uint8_t { + FAILED_HEADER = 0, + INCOMPATIBLE_BITS_PER_SAMPLE, + INCOMPATIBLE_CHANNELS, +}; + +// Used to pass information from each task. +struct InfoErrorEvent { + InfoErrorSource source; + optional err; + optional file_type; + optional audio_stream_info; + optional resample_info; + optional decoding_err; +}; + +class AudioPipeline { + public: + AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type); + + /// @brief Starts an audio pipeline given a media url + /// @param uri media file url + /// @param target_sample_rate the desired sample rate of the audio stream + /// @param task_name FreeRTOS task name + /// @param priority FreeRTOS task priority + /// @return ESP_OK if successful or an appropriate error if not + esp_err_t start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name, + UBaseType_t priority = 1); + + /// @brief Starts an audio pipeline given a MediaFile pointer + /// @param media_file pointer to a MediaFile object + /// @param target_sample_rate the desired sample rate of the audio stream + /// @param task_name FreeRTOS task name + /// @param priority FreeRTOS task priority + /// @return ESP_OK if successful or an appropriate error if not + esp_err_t start(MediaFile *media_file, uint32_t target_sample_rate, const std::string &task_name, + UBaseType_t priority = 1); + + /// @brief Stops the pipeline. Sends a stop signal to each task (if running) and clears the ring buffers. 
+ /// @return ESP_OK if successful or ESP_ERR_TIMEOUT if the tasks did not indicate they stopped + esp_err_t stop(); + + /// @brief Gets the state of the audio pipeline based on the info_error_queue_ and event_group_ + /// @return AudioPipelineState + AudioPipelineState get_state(); + + /// @brief Resets the ring buffers, discarding any existing data + void reset_ring_buffers(); + + /// @brief Suspends any running tasks + void suspend_tasks(); + /// @brief Resumes any running tasks + void resume_tasks(); + + protected: + /// @brief Allocates the ring buffers, event group, and info error queue. + /// @return ESP_OK if successful or ESP_ERR_NO_MEM if it is unable to allocate all parts + esp_err_t allocate_buffers_(); + + /// @brief Common start code for the pipeline, regardless if the source is a file or url. + /// @param target_sample_rate the desired sample rate of the audio stream + /// @param task_name FreeRTOS task name + /// @param priority FreeRTOS task priority + /// @return ESP_OK if successful or an appropriate error if not + esp_err_t common_start_(uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority); + + // Pointer to the media player's mixer object. The resample task feeds the appropriate ring buffer directly + AudioMixer *mixer_; + + std::string current_uri_{}; + MediaFile *current_media_file_{nullptr}; + + MediaFileType current_media_file_type_; + audio::AudioStreamInfo current_audio_stream_info_; + ResampleInfo current_resample_info_; + uint32_t target_sample_rate_; + + AudioPipelineType pipeline_type_; + + std::unique_ptr raw_file_ring_buffer_; + std::unique_ptr decoded_ring_buffer_; + + // Handles basic control/state of the three tasks + EventGroupHandle_t event_group_{nullptr}; + + // Receives detailed info (file type, stream info, resampling info) or specific errors from the three tasks + QueueHandle_t info_error_queue_{nullptr}; + + // Handles reading the media file from flash or a url + static void read_task_(void *params); + TaskHandle_t read_task_handle_{nullptr}; + StaticTask_t read_task_stack_; + StackType_t *read_task_stack_buffer_{nullptr}; + + // Decodes the media file into PCM audio + static void decode_task_(void *params); + TaskHandle_t decode_task_handle_{nullptr}; + StaticTask_t decode_task_stack_; + StackType_t *decode_task_stack_buffer_{nullptr}; + + // Resamples the audio to match the specified target sample rate. Converts mono audio to stereo audio if necessary. 
+ static void resample_task_(void *params); + TaskHandle_t resample_task_handle_{nullptr}; + StaticTask_t resample_task_stack_; + StackType_t *resample_task_stack_buffer_{nullptr}; +}; + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_reader.cpp b/esphome/components/nabu/audio_reader.cpp new file mode 100644 index 0000000000..d179a9a8ff --- /dev/null +++ b/esphome/components/nabu/audio_reader.cpp @@ -0,0 +1,210 @@ +#ifdef USE_ESP_IDF + +#include "audio_reader.h" + +#include "esphome/core/helpers.h" +#include "esphome/core/ring_buffer.h" + +#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE +#include "esp_crt_bundle.h" +#endif + +namespace esphome { +namespace nabu { + +static const size_t READ_WRITE_TIMEOUT_MS = 20; + +// The number of times the http read times out with no data before throwing an error +static const size_t ERROR_COUNT_NO_DATA_READ_TIMEOUT = 10; + +AudioReader::AudioReader(esphome::RingBuffer *output_ring_buffer, size_t transfer_buffer_size) { + this->output_ring_buffer_ = output_ring_buffer; + this->transfer_buffer_size_ = transfer_buffer_size; +} + +AudioReader::~AudioReader() { + if (this->transfer_buffer_ != nullptr) { + ExternalRAMAllocator allocator(ExternalRAMAllocator::ALLOW_FAILURE); + allocator.deallocate(this->transfer_buffer_, this->transfer_buffer_size_); + } + + this->cleanup_connection_(); +} + +esp_err_t AudioReader::allocate_buffers_() { + ExternalRAMAllocator allocator(ExternalRAMAllocator::ALLOW_FAILURE); + if (this->transfer_buffer_ == nullptr) + this->transfer_buffer_ = allocator.allocate(this->transfer_buffer_size_); + + if (this->transfer_buffer_ == nullptr) + return ESP_ERR_NO_MEM; + + return ESP_OK; +} + +esp_err_t AudioReader::start(MediaFile *media_file, MediaFileType &file_type) { + file_type = MediaFileType::NONE; + + esp_err_t err = this->allocate_buffers_(); + if (err != ESP_OK) { + return err; + } + + this->current_media_file_ = media_file; + + this->transfer_buffer_current_ = media_file->data; + this->transfer_buffer_length_ = media_file->length; + file_type = media_file->file_type; + + return ESP_OK; +} + +esp_err_t AudioReader::start(const std::string &uri, MediaFileType &file_type) { + file_type = MediaFileType::NONE; + + esp_err_t err = this->allocate_buffers_(); + if (err != ESP_OK) { + return err; + } + + this->cleanup_connection_(); + + if (uri.empty()) { + return ESP_ERR_INVALID_ARG; + } + + esp_http_client_config_t client_config = {}; + + client_config.url = uri.c_str(); + client_config.cert_pem = nullptr; + client_config.disable_auto_redirect = false; + client_config.max_redirection_count = 10; + client_config.buffer_size = 512; + client_config.keep_alive_enable = true; + client_config.timeout_ms = 5000; // Doesn't raise an error if exceeded in esp-idf v4.4, it just prevents the + // http_client_read command from blocking for too long + +#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE + if (uri.find("https:") != std::string::npos) { + client_config.crt_bundle_attach = esp_crt_bundle_attach; + } +#endif + + this->client_ = esp_http_client_init(&client_config); + + if (this->client_ == nullptr) { + return ESP_FAIL; + } + + if ((err = esp_http_client_open(this->client_, 0)) != ESP_OK) { + this->cleanup_connection_(); + return err; + } + + int content_length = esp_http_client_fetch_headers(this->client_); + + char url[500]; + err = esp_http_client_get_url(this->client_, url, 500); + if (err != ESP_OK) { + this->cleanup_connection_(); + return err; + } + + std::string url_string = url; + + if 
(str_endswith(url_string, ".wav")) { + file_type = MediaFileType::WAV; + } else if (str_endswith(url_string, ".mp3")) { + file_type = MediaFileType::MP3; + } else if (str_endswith(url_string, ".flac")) { + file_type = MediaFileType::FLAC; + } else { + file_type = MediaFileType::NONE; + this->cleanup_connection_(); + return ESP_ERR_NOT_SUPPORTED; + } + + this->transfer_buffer_current_ = this->transfer_buffer_; + this->transfer_buffer_length_ = 0; + this->no_data_read_count_ = 0; + + return ESP_OK; +} + +AudioReaderState AudioReader::read() { + if (this->client_ != nullptr) { + return this->http_read_(); + } else if (this->current_media_file_ != nullptr) { + return this->file_read_(); + } + + return AudioReaderState::FAILED; +} + +AudioReaderState AudioReader::file_read_() { + if (this->transfer_buffer_length_ > 0) { + size_t bytes_written = this->output_ring_buffer_->write_without_replacement( + (void *) this->transfer_buffer_current_, this->transfer_buffer_length_, pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + this->transfer_buffer_length_ -= bytes_written; + this->transfer_buffer_current_ += bytes_written; + + return AudioReaderState::READING; + } + return AudioReaderState::FINISHED; +} + +AudioReaderState AudioReader::http_read_() { + if (this->transfer_buffer_length_ > 0) { + size_t bytes_written = this->output_ring_buffer_->write_without_replacement( + (void *) this->transfer_buffer_, this->transfer_buffer_length_, pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + this->transfer_buffer_length_ -= bytes_written; + + // Shift remaining data to the start of the transfer buffer + memmove(this->transfer_buffer_, this->transfer_buffer_ + bytes_written, this->transfer_buffer_length_); + } + + if (esp_http_client_is_complete_data_received(this->client_)) { + if (this->transfer_buffer_length_ == 0) { + this->cleanup_connection_(); + return AudioReaderState::FINISHED; + } + } else { + size_t bytes_to_read = this->transfer_buffer_size_ - this->transfer_buffer_length_; + int received_len = esp_http_client_read( + this->client_, (char *) this->transfer_buffer_ + this->transfer_buffer_length_, bytes_to_read); + + if (received_len > 0) { + this->transfer_buffer_length_ += received_len; + this->no_data_read_count_ = 0; + } else if (received_len < 0) { + // HTTP read error + this->cleanup_connection_(); + return AudioReaderState::FAILED; + } else { + if (bytes_to_read > 0) { + // Read timed out + ++this->no_data_read_count_; + if (this->no_data_read_count_ >= ERROR_COUNT_NO_DATA_READ_TIMEOUT) { + // Timed out with no data read too many times, so the http read has failed + this->cleanup_connection_(); + return AudioReaderState::FAILED; + } + } + } + } + + return AudioReaderState::READING; +} + +void AudioReader::cleanup_connection_() { + if (this->client_ != nullptr) { + esp_http_client_close(this->client_); + esp_http_client_cleanup(this->client_); + this->client_ = nullptr; + } +} + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_reader.h b/esphome/components/nabu/audio_reader.h new file mode 100644 index 0000000000..232a409960 --- /dev/null +++ b/esphome/components/nabu/audio_reader.h @@ -0,0 +1,54 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "nabu_media_helpers.h" +#include "esphome/core/ring_buffer.h" + +#include + +namespace esphome { +namespace nabu { + +enum class AudioReaderState : uint8_t { + READING = 0, + FINISHED, + FAILED, +}; + +class AudioReader { + public: + AudioReader(esphome::RingBuffer *output_ring_buffer, size_t transfer_buffer_size); + 
~AudioReader(); + + esp_err_t start(const std::string &uri, MediaFileType &file_type); + esp_err_t start(MediaFile *media_file, MediaFileType &file_type); + + AudioReaderState read(); + + protected: + esp_err_t allocate_buffers_(); + + AudioReaderState file_read_(); + AudioReaderState http_read_(); + + void cleanup_connection_(); + + esphome::RingBuffer *output_ring_buffer_; + + size_t transfer_buffer_length_; // Amount of data currently stored in transfer buffer (in bytes) + size_t transfer_buffer_size_; // Capacity of transfer buffer (in bytes) + + ssize_t no_data_read_count_; + + uint8_t *transfer_buffer_{nullptr}; + const uint8_t *transfer_buffer_current_{nullptr}; + + esp_http_client_handle_t client_{nullptr}; + + MediaFile *current_media_file_{nullptr}; +}; +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_resampler.cpp b/esphome/components/nabu/audio_resampler.cpp new file mode 100644 index 0000000000..8e29edf9a4 --- /dev/null +++ b/esphome/components/nabu/audio_resampler.cpp @@ -0,0 +1,317 @@ +#ifdef USE_ESP_IDF + +#include "audio_resampler.h" + +#include "esphome/core/ring_buffer.h" +#include "esphome/core/helpers.h" + +namespace esphome { +namespace nabu { + +static const size_t NUM_TAPS = 32; +static const size_t NUM_FILTERS = 32; +static const bool USE_PRE_POST_FILTER = true; + +// These output parameters are currently hardcoded in the elements further down the pipeline (mixer and speaker) +static const uint8_t OUTPUT_CHANNELS = 2; +static const uint8_t OUTPUT_BITS_PER_SAMPLE = 16; + +static const size_t READ_WRITE_TIMEOUT_MS = 20; + +AudioResampler::AudioResampler(RingBuffer *input_ring_buffer, RingBuffer *output_ring_buffer, + size_t internal_buffer_samples) { + this->input_ring_buffer_ = input_ring_buffer; + this->output_ring_buffer_ = output_ring_buffer; + this->internal_buffer_samples_ = internal_buffer_samples; +} + +AudioResampler::~AudioResampler() { + ExternalRAMAllocator int16_allocator(ExternalRAMAllocator::ALLOW_FAILURE); + ExternalRAMAllocator float_allocator(ExternalRAMAllocator::ALLOW_FAILURE); + + if (this->input_buffer_ != nullptr) { + int16_allocator.deallocate(this->input_buffer_, this->internal_buffer_samples_); + } + if (this->output_buffer_ != nullptr) { + int16_allocator.deallocate(this->output_buffer_, this->internal_buffer_samples_); + } + if (this->float_input_buffer_ != nullptr) { + float_allocator.deallocate(this->float_input_buffer_, this->internal_buffer_samples_); + } + if (this->float_output_buffer_ != nullptr) { + float_allocator.deallocate(this->float_output_buffer_, this->internal_buffer_samples_); + } + if (this->resampler_ != nullptr) { + resampleFree(this->resampler_); + this->resampler_ = nullptr; + } +} + +esp_err_t AudioResampler::allocate_buffers_() { + ExternalRAMAllocator int16_allocator(ExternalRAMAllocator::ALLOW_FAILURE); + ExternalRAMAllocator float_allocator(ExternalRAMAllocator::ALLOW_FAILURE); + + if (this->input_buffer_ == nullptr) + this->input_buffer_ = int16_allocator.allocate(this->internal_buffer_samples_); + if (this->output_buffer_ == nullptr) + this->output_buffer_ = int16_allocator.allocate(this->internal_buffer_samples_); + + if (this->float_input_buffer_ == nullptr) + this->float_input_buffer_ = float_allocator.allocate(this->internal_buffer_samples_); + + if (this->float_output_buffer_ == nullptr) + this->float_output_buffer_ = float_allocator.allocate(this->internal_buffer_samples_); + + if ((this->input_buffer_ == nullptr) || (this->output_buffer_ == nullptr) || 
(this->float_input_buffer_ == nullptr) || + (this->float_output_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + return ESP_OK; +} + +esp_err_t AudioResampler::start(audio::AudioStreamInfo &stream_info, uint32_t target_sample_rate, + ResampleInfo &resample_info) { + esp_err_t err = this->allocate_buffers_(); + if (err != ESP_OK) { + return err; + } + + this->stream_info_ = stream_info; + + this->input_buffer_current_ = this->input_buffer_; + this->input_buffer_length_ = 0; + this->float_input_buffer_current_ = this->float_input_buffer_; + this->float_input_buffer_length_ = 0; + + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ = 0; + this->float_output_buffer_current_ = this->float_output_buffer_; + this->float_output_buffer_length_ = 0; + + resample_info.mono_to_stereo = (stream_info.channels != 2); + + if ((stream_info.channels > OUTPUT_CHANNELS) || (stream_info_.bits_per_sample != OUTPUT_BITS_PER_SAMPLE)) { + return ESP_ERR_NOT_SUPPORTED; + } + + if (stream_info.channels > 0) { + this->channel_factor_ = 2 / stream_info.channels; + } + + if (stream_info.sample_rate != target_sample_rate) { + int flags = 0; + + resample_info.resample = true; + + this->sample_ratio_ = static_cast(target_sample_rate) / static_cast(stream_info.sample_rate); + + if (this->sample_ratio_ < 1.0) { + this->lowpass_ratio_ -= (10.24 / 16); + + if (this->lowpass_ratio_ < 0.84) { + this->lowpass_ratio_ = 0.84; + } + + if (this->lowpass_ratio_ < this->sample_ratio_) { + // avoid discontinuities near unity sample ratios + this->lowpass_ratio_ = this->sample_ratio_; + } + } + if (this->lowpass_ratio_ * this->sample_ratio_ < 0.98 && USE_PRE_POST_FILTER) { + float cutoff = this->lowpass_ratio_ * this->sample_ratio_ / 2.0; + biquad_lowpass(&this->lowpass_coeff_, cutoff); + this->pre_filter_ = true; + } + + if (this->lowpass_ratio_ / this->sample_ratio_ < 0.98 && USE_PRE_POST_FILTER && !this->pre_filter_) { + float cutoff = this->lowpass_ratio_ / this->sample_ratio_ / 2.0; + biquad_lowpass(&this->lowpass_coeff_, cutoff); + this->post_filter_ = true; + } + + if (this->pre_filter_ || this->post_filter_) { + for (int i = 0; i < stream_info.channels; ++i) { + biquad_init(&this->lowpass_[i][0], &this->lowpass_coeff_, 1.0); + biquad_init(&this->lowpass_[i][1], &this->lowpass_coeff_, 1.0); + } + } + + if (this->sample_ratio_ < 1.0) { + this->resampler_ = resampleInit(stream_info.channels, NUM_TAPS, NUM_FILTERS, + this->sample_ratio_ * this->lowpass_ratio_, flags | INCLUDE_LOWPASS); + } else if (this->lowpass_ratio_ < 1.0) { + this->resampler_ = + resampleInit(stream_info.channels, NUM_TAPS, NUM_FILTERS, this->lowpass_ratio_, flags | INCLUDE_LOWPASS); + } else { + this->resampler_ = resampleInit(stream_info.channels, NUM_TAPS, NUM_FILTERS, 1.0, flags); + } + + resampleAdvancePosition(this->resampler_, NUM_TAPS / 2.0); + + } else { + resample_info.resample = false; + } + + this->resample_info_ = resample_info; + return ESP_OK; +} + +AudioResamplerState AudioResampler::resample(bool stop_gracefully) { + if (stop_gracefully) { + if ((this->input_ring_buffer_->available() == 0) && (this->output_ring_buffer_->available() == 0) && + (this->input_buffer_length_ == 0) && (this->output_buffer_length_ == 0)) { + return AudioResamplerState::FINISHED; + } + } + + if (this->output_buffer_length_ > 0) { + size_t bytes_to_write = this->output_buffer_length_; + + if (bytes_to_write > 0) { + size_t bytes_written = this->output_ring_buffer_->write_without_replacement( + (void *) this->output_buffer_current_, 
bytes_to_write, pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + this->output_buffer_current_ += bytes_written / sizeof(int16_t); + this->output_buffer_length_ -= bytes_written; + } + + return AudioResamplerState::RESAMPLING; + } + + // Copy audio data directly to output_buffer if resampling isn't required + if (!this->resample_info_.resample && !this->resample_info_.mono_to_stereo) { + size_t bytes_read = + this->input_ring_buffer_->read((void *) this->output_buffer_, this->internal_buffer_samples_ * sizeof(int16_t), + pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ += bytes_read; + + return AudioResamplerState::RESAMPLING; + } + + ////// + // Refill input buffer + ////// + + // Depending on whether we are converting mono to stereo or upsampling, we may need to restrict how many input + // samples we transfer + size_t max_input_samples = this->internal_buffer_samples_; + + // Mono to stereo -> cut in half + max_input_samples /= (2 / this->stream_info_.channels); + + if (this->sample_ratio_ > 1.0) { + // Upsampling -> reduce by a factor of the ceiling of sample_ratio_ + uint32_t upsampling_factor = std::ceil(this->sample_ratio_); + max_input_samples /= upsampling_factor; + } + + // Move old data to the start of the buffer + if (this->input_buffer_length_ > 0) { + memmove((void *) this->input_buffer_, (void *) this->input_buffer_current_, this->input_buffer_length_); + } + this->input_buffer_current_ = this->input_buffer_; + + // Copy new data to the end of the buffer + size_t bytes_to_read = max_input_samples * sizeof(int16_t) - this->input_buffer_length_; + + if (bytes_to_read > 0) { + int16_t *new_input_buffer_data = this->input_buffer_ + this->input_buffer_length_ / sizeof(int16_t); + size_t bytes_read = this->input_ring_buffer_->read((void *) new_input_buffer_data, bytes_to_read, + pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + this->input_buffer_length_ += bytes_read; + } + + if (this->input_buffer_length_ == 0) { + return AudioResamplerState::RESAMPLING; + } + + if (this->resample_info_.resample) { + if (this->input_buffer_length_ > 0) { + // Samples are individual int16 values. Frames include 1 sample for mono and 2 samples for stereo + // Be careful converting between bytes, samples, and frames!
+ // 1 sample = 2 bytes = sizeof(int16_t) + // if mono: + // 1 frame = 1 sample + // if stereo: + // 1 frame = 2 samples (left and right) + + size_t samples_read = this->input_buffer_length_ / sizeof(int16_t); + + for (int i = 0; i < samples_read; ++i) { + this->float_input_buffer_[i] = static_cast(this->input_buffer_[i]) / 32768.0f; + } + + size_t frames_read = samples_read / this->stream_info_.channels; + + if (this->pre_filter_) { + for (int i = 0; i < this->stream_info_.channels; ++i) { + biquad_apply_buffer(&this->lowpass_[i][0], this->float_input_buffer_ + i, frames_read, + this->stream_info_.channels); + biquad_apply_buffer(&this->lowpass_[i][1], this->float_input_buffer_ + i, frames_read, + this->stream_info_.channels); + } + } + + ResampleResult res; + + res = resampleProcessInterleaved(this->resampler_, this->float_input_buffer_, frames_read, + this->float_output_buffer_, + this->internal_buffer_samples_ / this->channel_factor_, this->sample_ratio_); + + size_t frames_used = res.input_used; + size_t samples_used = frames_used * this->stream_info_.channels; + + size_t frames_generated = res.output_generated; + if (this->post_filter_) { + for (int i = 0; i < this->stream_info_.channels; ++i) { + biquad_apply_buffer(&this->lowpass_[i][0], this->float_output_buffer_ + i, frames_generated, + this->stream_info_.channels); + biquad_apply_buffer(&this->lowpass_[i][1], this->float_output_buffer_ + i, frames_generated, + this->stream_info_.channels); + } + } + + size_t samples_generated = frames_generated * this->stream_info_.channels; + + for (int i = 0; i < samples_generated; ++i) { + this->output_buffer_[i] = static_cast(this->float_output_buffer_[i] * 32767); + } + + this->input_buffer_current_ += samples_used; + this->input_buffer_length_ -= samples_used * sizeof(int16_t); + + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ += samples_generated * sizeof(int16_t); + } + } else { + size_t bytes_to_transfer = + std::min(this->internal_buffer_samples_ / this->channel_factor_ * sizeof(int16_t), this->input_buffer_length_); + std::memcpy((void *) this->output_buffer_, (void *) this->input_buffer_current_, bytes_to_transfer); + + this->input_buffer_current_ += bytes_to_transfer / sizeof(int16_t); + this->input_buffer_length_ -= bytes_to_transfer; + + this->output_buffer_current_ = this->output_buffer_; + this->output_buffer_length_ += bytes_to_transfer; + } + + if (this->resample_info_.mono_to_stereo) { + // Convert mono to stereo + for (int i = this->output_buffer_length_ / (sizeof(int16_t)) - 1; i >= 0; --i) { + this->output_buffer_[2 * i] = this->output_buffer_[i]; + this->output_buffer_[2 * i + 1] = this->output_buffer_[i]; + } + + this->output_buffer_length_ *= 2; // double the bytes for stereo samples + } + return AudioResamplerState::RESAMPLING; +} + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/audio_resampler.h b/esphome/components/nabu/audio_resampler.h new file mode 100644 index 0000000000..46521e41c2 --- /dev/null +++ b/esphome/components/nabu/audio_resampler.h @@ -0,0 +1,82 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "biquad.h" +#include "resampler.h" + +#include "esphome/components/audio/audio.h" +#include "esphome/core/ring_buffer.h" + +namespace esphome { +namespace nabu { + +enum class AudioResamplerState : uint8_t { + INITIALIZED = 0, + RESAMPLING, + FINISHED, + FAILED, +}; + +struct ResampleInfo { + bool resample; + bool mono_to_stereo; +}; + +class AudioResampler { + public: + 
AudioResampler(esphome::RingBuffer *input_ring_buffer, esphome::RingBuffer *output_ring_buffer, + size_t internal_buffer_samples); + ~AudioResampler(); + + /// @brief Sets up the various bits necessary to resample + /// @param stream_info the incoming sample rate, bits per sample, and number of channels + /// @param target_sample_rate the necessary sample rate to convert to + /// @return ESP_OK if it is able to convert the incoming stream or an error otherwise + esp_err_t start(audio::AudioStreamInfo &stream_info, uint32_t target_sample_rate, ResampleInfo &resample_info); + + AudioResamplerState resample(bool stop_gracefully); + + protected: + esp_err_t allocate_buffers_(); + + esphome::RingBuffer *input_ring_buffer_; + esphome::RingBuffer *output_ring_buffer_; + size_t internal_buffer_samples_; + + int16_t *input_buffer_{nullptr}; + int16_t *input_buffer_current_{nullptr}; + size_t input_buffer_length_; + + int16_t *output_buffer_{nullptr}; + int16_t *output_buffer_current_{nullptr}; + size_t output_buffer_length_; + + float *float_input_buffer_{nullptr}; + float *float_input_buffer_current_{nullptr}; + size_t float_input_buffer_length_; + + float *float_output_buffer_{nullptr}; + float *float_output_buffer_current_{nullptr}; + size_t float_output_buffer_length_; + + audio::AudioStreamInfo stream_info_; + ResampleInfo resample_info_; + + Resample *resampler_{nullptr}; + + Biquad lowpass_[2][2]; + BiquadCoefficients lowpass_coeff_; + + float sample_ratio_{1.0}; + float lowpass_ratio_{1.0}; + uint8_t channel_factor_{1}; + + bool pre_filter_{false}; + bool post_filter_{false}; +}; + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/automation.h b/esphome/components/nabu/automation.h new file mode 100644 index 0000000000..29926c0ddc --- /dev/null +++ b/esphome/components/nabu/automation.h @@ -0,0 +1,44 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "esphome/core/automation.h" +#include "nabu_media_player.h" + +namespace esphome { +namespace nabu { + +template class DuckingSetAction : public Action, public Parented { + TEMPLATABLE_VALUE(uint8_t, decibel_reduction) + TEMPLATABLE_VALUE(float, duration) + void play(Ts... x) override { + this->parent_->set_ducking_reduction(this->decibel_reduction_.value(x...), this->duration_.value(x...)); + } +}; + +template class PlayLocalMediaAction : public Action, public Parented { + TEMPLATABLE_VALUE(MediaFile *, media_file) + TEMPLATABLE_VALUE(bool, announcement) + void play(Ts... x) override { + this->parent_->play_file(this->media_file_.value(x...), this->announcement_.value(x...)); + } +}; + +template class StopPipelineAction : public Action, public Parented { + TEMPLATABLE_VALUE(AudioPipelineType, pipeline_type) + void play(Ts... x) override { + bool announcement = false; + if (this->pipeline_type_.value(x...) 
== AudioPipelineType::ANNOUNCEMENT) { + announcement = true; + } + this->parent_->make_call() + .set_command(media_player::MediaPlayerCommand::MEDIA_PLAYER_COMMAND_STOP) + .set_announcement(announcement) + .perform(); + } +}; + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/media_player.py b/esphome/components/nabu/media_player.py new file mode 100644 index 0000000000..23b8eeeee8 --- /dev/null +++ b/esphome/components/nabu/media_player.py @@ -0,0 +1,382 @@ +"""Nabu Media Player Setup.""" + +import hashlib +import logging +from pathlib import Path + +from esphome import automation, external_files +import esphome.codegen as cg +from esphome.components import esp32, media_player, speaker +import esphome.config_validation as cv +from esphome.const import ( + CONF_DURATION, + CONF_FILE, + CONF_FILES, + CONF_ID, + CONF_PATH, + CONF_RAW_DATA_ID, + CONF_SAMPLE_RATE, + CONF_SPEAKER, + CONF_TYPE, + CONF_URL, +) +from esphome.core import CORE, HexInt +from esphome.external_files import download_content + +_LOGGER = logging.getLogger(__name__) + +AUTO_LOAD = ["audio", "psram"] + +CODEOWNERS = ["@synesthesiam", "@kahrendt"] +DEPENDENCIES = ["media_player"] +DOMAIN = "file" + +TYPE_LOCAL = "local" +TYPE_WEB = "web" + +CONF_DECIBEL_REDUCTION = "decibel_reduction" + +CONF_ANNOUNCEMENT = "announcement" +CONF_MEDIA_FILE = "media_file" +CONF_PIPELINE = "pipeline" +CONF_VOLUME_INCREMENT = "volume_increment" +CONF_VOLUME_MIN = "volume_min" +CONF_VOLUME_MAX = "volume_max" + +CONF_ON_MUTE = "on_mute" +CONF_ON_UNMUTE = "on_unmute" +CONF_ON_VOLUME = "on_volume" + +nabu_ns = cg.esphome_ns.namespace("nabu") +NabuMediaPlayer = nabu_ns.class_("NabuMediaPlayer") +NabuMediaPlayer = nabu_ns.class_( + "NabuMediaPlayer", + NabuMediaPlayer, + media_player.MediaPlayer, + cg.Component, +) + +MediaFile = nabu_ns.struct("MediaFile") +MediaFileType = nabu_ns.enum("MediaFileType", is_class=True) +MEDIA_FILE_TYPE_ENUM = { + "NONE": MediaFileType.NONE, + "WAV": MediaFileType.WAV, + "MP3": MediaFileType.MP3, + "FLAC": MediaFileType.FLAC, +} + +PipelineType = nabu_ns.enum("AudioPipelineType", is_class=True) +PIPELINE_TYPE_ENUM = { + "MEDIA": PipelineType.MEDIA, + "ANNOUNCEMENT": PipelineType.ANNOUNCEMENT, +} + +PlayLocalMediaAction = nabu_ns.class_( + "PlayLocalMediaAction", automation.Action, cg.Parented.template(NabuMediaPlayer) +) +StopPipelineAction = nabu_ns.class_( + "StopPipelineAction", automation.Action, cg.Parented.template(NabuMediaPlayer) +) +DuckingSetAction = nabu_ns.class_( + "DuckingSetAction", automation.Action, cg.Parented.template(NabuMediaPlayer) +) + + +def _compute_local_file_path(value: dict) -> Path: + url = value[CONF_URL] + h = hashlib.new("sha256") + h.update(url.encode()) + key = h.hexdigest()[:8] + base_dir = external_files.compute_local_file_dir(DOMAIN) + _LOGGER.debug("_compute_local_file_path: base_dir=%s", base_dir / key) + return base_dir / key + + +def _download_web_file(value): + url = value[CONF_URL] + path = _compute_local_file_path(value) + + download_content(url, path) + _LOGGER.debug("download_web_file: path=%s", path) + return value + + +def _validate_file_shorthand(value): + value = cv.string_strict(value) + if value.startswith("http://") or value.startswith("https://"): + return _file_schema( + { + CONF_TYPE: TYPE_WEB, + CONF_URL: value, + } + ) + return _file_schema( + { + CONF_TYPE: TYPE_LOCAL, + CONF_PATH: value, + } + ) + + +def _file_schema(value): + if isinstance(value, str): + return _validate_file_shorthand(value) + return 
TYPED_FILE_SCHEMA(value) + + +def _read_audio_file_and_type(file_config): + conf_file = file_config[CONF_FILE] + file_source = conf_file[CONF_TYPE] + if file_source == TYPE_LOCAL: + path = CORE.relative_config_path(conf_file[CONF_PATH]) + elif file_source == TYPE_WEB: + path = _compute_local_file_path(conf_file) + else: + raise cv.Invalid("Unsupported file source.") + + with open(path, "rb") as f: + data = f.read() + + try: + import puremagic + + file_type: str = puremagic.from_string(data) + except ImportError: + try: + from magic import Magic + + magic = Magic(mime=True) + file_type: str = magic.from_buffer(data) + except ImportError as exc: + raise cv.Invalid("Please install puremagic") from exc + if file_type.startswith("."): + file_type = file_type[1:] + + media_file_type = MEDIA_FILE_TYPE_ENUM["NONE"] + if file_type in ("wav"): + media_file_type = MEDIA_FILE_TYPE_ENUM["WAV"] + elif file_type in ("mp3", "mpeg", "mpga"): + media_file_type = MEDIA_FILE_TYPE_ENUM["MP3"] + elif file_type in ("flac"): + media_file_type = MEDIA_FILE_TYPE_ENUM["FLAC"] + + return data, media_file_type + + +def _supported_local_file_validate(config): + if files_list := config.get(CONF_FILES): + for file_config in files_list: + _, media_file_type = _read_audio_file_and_type(file_config) + if str(media_file_type) == str(MEDIA_FILE_TYPE_ENUM["NONE"]): + raise cv.Invalid("Unsupported local media file.") + + +LOCAL_SCHEMA = cv.Schema( + { + cv.Required(CONF_PATH): cv.file_, + } +) + +WEB_SCHEMA = cv.All( + { + cv.Required(CONF_URL): cv.url, + }, + _download_web_file, +) + + +TYPED_FILE_SCHEMA = cv.typed_schema( + { + TYPE_LOCAL: LOCAL_SCHEMA, + TYPE_WEB: WEB_SCHEMA, + }, +) + + +MEDIA_FILE_TYPE_SCHEMA = cv.Schema( + { + cv.Required(CONF_ID): cv.declare_id(MediaFile), + cv.Required(CONF_FILE): _file_schema, + cv.GenerateID(CONF_RAW_DATA_ID): cv.declare_id(cg.uint8), + } +) + + +CONFIG_SCHEMA = cv.All( + media_player.MEDIA_PLAYER_SCHEMA.extend( + { + cv.GenerateID(): cv.declare_id(NabuMediaPlayer), + cv.Required(CONF_SPEAKER): cv.use_id(speaker.Speaker), + cv.Optional(CONF_SAMPLE_RATE, default=16000): cv.int_range(min=1), + cv.Optional(CONF_VOLUME_INCREMENT, default=0.05): cv.percentage, + cv.Optional(CONF_VOLUME_MAX, default=1.0): cv.percentage, + cv.Optional(CONF_VOLUME_MIN, default=0.0): cv.percentage, + cv.Optional(CONF_FILES): cv.ensure_list(MEDIA_FILE_TYPE_SCHEMA), + cv.Optional(CONF_ON_MUTE): automation.validate_automation(single=True), + cv.Optional(CONF_ON_UNMUTE): automation.validate_automation(single=True), + cv.Optional(CONF_ON_VOLUME): automation.validate_automation(single=True), + } + ), + cv.only_with_esp_idf, +) +FINAL_VALIDATE_SCHEMA = _supported_local_file_validate + + +async def to_code(config): + cg.add_library("https://github.com/esphome/esp-audio-libs", "1.0.0") + + # Wifi settings based on https://github.com/espressif/esp-adf/issues/297#issuecomment-783811702 + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_STATIC_RX_BUFFER_NUM", 16) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_DYNAMIC_RX_BUFFER_NUM", 512) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_STATIC_TX_BUFFER", True) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_TX_BUFFER_TYPE", 0) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_STATIC_TX_BUFFER_NUM", 8) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_CACHE_TX_BUFFER_NUM", 32) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_AMPDU_TX_ENABLED", True) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_TX_BA_WIN", 16) + 
esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_AMPDU_RX_ENABLED", True) + esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_RX_BA_WIN", 32) + esp32.add_idf_sdkconfig_option("CONFIG_LWIP_MAX_ACTIVE_TCP", 16) + esp32.add_idf_sdkconfig_option("CONFIG_LWIP_MAX_LISTENING_TCP", 16) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_MAXRTX", 12) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_SYNMAXRTX", 6) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_MSS", 1436) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_MSL", 60000) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_SND_BUF_DEFAULT", 5840) + esp32.add_idf_sdkconfig_option( + "CONFIG_TCP_WND_DEFAULT", 65535 + ) # Adjusted from referenced settings to avoid compilation error + esp32.add_idf_sdkconfig_option("CONFIG_TCP_RECVMBOX_SIZE", 512) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_QUEUE_OOSEQ", True) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_OVERSIZE_MSS", True) + esp32.add_idf_sdkconfig_option("CONFIG_LWIP_WND_SCALE", True) + esp32.add_idf_sdkconfig_option("CONFIG_TCP_RCV_SCALE", 3) + esp32.add_idf_sdkconfig_option("CONFIG_LWIP_TCPIP_RECVMBOX_SIZE", 512) + + var = cg.new_Pvariable(config[CONF_ID]) + await cg.register_component(var, config) + await media_player.register_media_player(var, config) + + cg.add_define("USE_OTA_STATE_CALLBACK") + + cg.add(var.set_sample_rate(config[CONF_SAMPLE_RATE])) + + cg.add(var.set_volume_increment(config[CONF_VOLUME_INCREMENT])) + cg.add(var.set_volume_max(config[CONF_VOLUME_MAX])) + cg.add(var.set_volume_min(config[CONF_VOLUME_MIN])) + + spkr = await cg.get_variable(config[CONF_SPEAKER]) + cg.add(var.set_speaker(spkr)) + + if on_mute := config.get(CONF_ON_MUTE): + await automation.build_automation( + var.get_mute_trigger(), + [], + on_mute, + ) + if on_unmute := config.get(CONF_ON_UNMUTE): + await automation.build_automation( + var.get_unmute_trigger(), + [], + on_unmute, + ) + if on_volume := config.get(CONF_ON_VOLUME): + await automation.build_automation( + var.get_volume_trigger(), + [(cg.float_, "x")], + on_volume, + ) + + if files_list := config.get(CONF_FILES): + for file_config in files_list: + data, media_file_type = _read_audio_file_and_type(file_config) + + rhs = [HexInt(x) for x in data] + prog_arr = cg.progmem_array(file_config[CONF_RAW_DATA_ID], rhs) + + media_files_struct = cg.StructInitializer( + MediaFile, + ( + "data", + prog_arr, + ), + ( + "length", + len(rhs), + ), + ( + "file_type", + media_file_type, + ), + ) + + cg.new_Pvariable( + file_config[CONF_ID], + media_files_struct, + ) + + +@automation.register_action( + "nabu.play_local_media_file", + PlayLocalMediaAction, + cv.maybe_simple_value( + { + cv.GenerateID(): cv.use_id(NabuMediaPlayer), + cv.Required(CONF_MEDIA_FILE): cv.use_id(MediaFile), + cv.Optional(CONF_ANNOUNCEMENT, default=False): cv.boolean, + }, + key=CONF_MEDIA_FILE, + ), +) +async def nabu_play_local_media_media_action(config, action_id, template_arg, args): + var = cg.new_Pvariable(action_id, template_arg) + await cg.register_parented(var, config[CONF_ID]) + media_file = await cg.get_variable(config[CONF_MEDIA_FILE]) + cg.add(var.set_media_file(media_file)) + cg.add(var.set_announcement(config[CONF_ANNOUNCEMENT])) + return var + + +@automation.register_action( + "nabu.stop_pipeline", + StopPipelineAction, + cv.maybe_simple_value( + { + cv.GenerateID(): cv.use_id(NabuMediaPlayer), + cv.Required(CONF_PIPELINE): cv.enum(PIPELINE_TYPE_ENUM, upper=True), + }, + key=CONF_PIPELINE, + ), +) +async def nabu_stop_pipeline_action(config, action_id, template_arg, args): + var = 
cg.new_Pvariable(action_id, template_arg) + await cg.register_parented(var, config[CONF_ID]) + cg.add(var.set_pipeline_type(config[CONF_PIPELINE])) + return var + + +@automation.register_action( + "nabu.set_ducking", + DuckingSetAction, + cv.Schema( + { + cv.GenerateID(): cv.use_id(NabuMediaPlayer), + cv.Required(CONF_DECIBEL_REDUCTION): cv.templatable( + cv.int_range(min=0, max=51) + ), + cv.Optional(CONF_DURATION, default="0.0s"): cv.templatable( + cv.positive_time_period_seconds + ), + } + ), +) +async def ducking_set_to_code(config, action_id, template_arg, args): + var = cg.new_Pvariable(action_id, template_arg) + await cg.register_parented(var, config[CONF_ID]) + decibel_reduction = await cg.templatable( + config[CONF_DECIBEL_REDUCTION], args, cg.uint8 + ) + cg.add(var.set_decibel_reduction(decibel_reduction)) + duration = await cg.templatable(config[CONF_DURATION], args, cg.float_) + cg.add(var.set_duration(duration)) + return var diff --git a/esphome/components/nabu/nabu_media_helpers.h b/esphome/components/nabu/nabu_media_helpers.h new file mode 100644 index 0000000000..91b7ad3539 --- /dev/null +++ b/esphome/components/nabu/nabu_media_helpers.h @@ -0,0 +1,28 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include +#include + +namespace esphome { +namespace nabu { + +enum class MediaFileType : uint8_t { + NONE = 0, + WAV, + MP3, + FLAC, +}; +const char *media_player_file_type_to_string(MediaFileType file_type); + +struct MediaFile { + const uint8_t *data; + size_t length; + MediaFileType file_type; +}; + +} // namespace nabu +} // namespace esphome + +#endif diff --git a/esphome/components/nabu/nabu_media_player.cpp b/esphome/components/nabu/nabu_media_player.cpp new file mode 100644 index 0000000000..0bd4854ae6 --- /dev/null +++ b/esphome/components/nabu/nabu_media_player.cpp @@ -0,0 +1,475 @@ +#ifdef USE_ESP_IDF + +#include "nabu_media_player.h" + +#include "esphome/components/audio/audio.h" + +#include "esphome/core/hal.h" +#include "esphome/core/log.h" + +#ifdef USE_OTA +#include "esphome/components/ota/ota_backend.h" +#endif + +namespace esphome { +namespace nabu { + +// Framework: +// - Media player that can handle two streams; one for media and one for announcements +// - If played together, they are mixed with the announcement stream staying at full volume +// - The media audio is scaled, if necessary, to avoid clipping when mixing an announcement stream +// - The media audio can be further ducked via the ``set_ducking_reduction`` function +// - Each stream is handled by an ``AudioPipeline`` object with three parts/tasks +// - ``AudioReader`` handles reading from an HTTP source or from a PROGMEM flash set at compile time +// - ``AudioDecoder`` handles decoding the audio file. All formats are limited to two channels and 16 bits per sample +// - FLAC +// - WAV +// - MP3 (based on the libhelix decoder - a random mp3 file may be incompatible) +// - ``AudioResampler`` handles converting the sample rate to the configured output sample rate and converting mono +// to stereo +// - The quality is not good, and it is slow! Please use audio at the configured sample rate to avoid these issues +// - Each task will always run once started, but they will not doing anything until they are needed +// - FreeRTOS Event Groups make up the inter-task communication +// - The ``AudioPipeline`` sets up an output ring buffer for the Reader and Decoder parts. 
The next part/task + // automatically pulls from the previous ring buffer + // - The streams are mixed together in the ``AudioMixer`` task + // - Each stream has a corresponding input buffer that the ``AudioResampler`` feeds directly + // - Pausing the media stream is done here + // - Media stream ducking is done here + // - The output ring buffer feeds the configured speaker the audio directly + // - Generic media player commands are received by the ``control`` function. The commands are added to the + // ``media_control_command_queue_`` to be processed in the component's loop + // - Local file playback is initiated with ``play_file``, which adds it to the ``media_control_command_queue_`` + // - Starting a stream initializes the appropriate pipeline or stops it if it is already running + // - Volume and mute commands are handled by the ``mute``, ``unmute``, ``set_volume`` functions. The speaker + // component handles the implementation details. + // - Volume commands are ignored if the media control queue is full to avoid crashing when the track wheel is spun + // fast + // - Pausing is sent to the ``AudioMixer`` task. It only affects the media stream. + // - The component's main loop performs housekeeping: + // - It reads the media control queue and processes it directly + // - It watches the state of speaker and mixer tasks + // - It determines the overall state of the media player by considering the state of each pipeline + // - announcement playback takes highest priority + // - All logging happens in the main loop task to reduce task stack memory usage. + + static const size_t QUEUE_LENGTH = 20; + + static const uint8_t NUMBER_OF_CHANNELS = 2; // Hard-coded expectation of stereo (2 channel) audio + + static const UBaseType_t MEDIA_PIPELINE_TASK_PRIORITY = 1; + static const UBaseType_t ANNOUNCEMENT_PIPELINE_TASK_PRIORITY = 1; + static const UBaseType_t MIXER_TASK_PRIORITY = 10; + + static const size_t TASK_DELAY_MS = 10; + + static const float FIRST_BOOT_DEFAULT_VOLUME = 0.5f; + + static const char *const TAG = "nabu_media_player"; + + const char *media_player_file_type_to_string(MediaFileType file_type) { + switch (file_type) { + case MediaFileType::FLAC: + return "FLAC"; + case MediaFileType::MP3: + return "MP3"; + case MediaFileType::WAV: + return "WAV"; + default: + return "unknown"; + } + } + + void NabuMediaPlayer::setup() { + state = media_player::MEDIA_PLAYER_STATE_IDLE; + + this->media_control_command_queue_ = xQueueCreate(QUEUE_LENGTH, sizeof(MediaCallCommand)); + + this->pref_ = global_preferences->make_preference(this->get_object_id_hash()); + + VolumeRestoreState volume_restore_state; + if (this->pref_.load(&volume_restore_state)) { + this->set_volume_(volume_restore_state.volume); + this->set_mute_state_(volume_restore_state.is_muted); + } else { + this->set_volume_(FIRST_BOOT_DEFAULT_VOLUME); + this->set_mute_state_(false); + } + +#ifdef USE_OTA + ota::get_global_ota_callback()->add_on_state_callback( + [this](ota::OTAState state, float progress, uint8_t error, ota::OTAComponent *comp) { + if (state == ota::OTA_STARTED) { + if (this->audio_mixer_ != nullptr) { + this->audio_mixer_->suspend_task(); + } + if (this->media_pipeline_ != nullptr) { + this->media_pipeline_->suspend_tasks(); + } + if (this->announcement_pipeline_ != nullptr) { + this->announcement_pipeline_->suspend_tasks(); + } + } else if (state == ota::OTA_ERROR) { + if (this->audio_mixer_ != nullptr) { + this->audio_mixer_->resume_task(); + } + if (this->media_pipeline_ != nullptr) { + this->media_pipeline_->resume_tasks(); + } + if
(this->announcement_pipeline_ != nullptr) { + this->announcement_pipeline_->resume_tasks(); + } + } + }); +#endif + + ESP_LOGI(TAG, "Set up nabu media player"); +} + +esp_err_t NabuMediaPlayer::start_pipeline_(AudioPipelineType type, bool url) { + esp_err_t err = ESP_OK; + + if (this->speaker_ != nullptr) { + audio::AudioStreamInfo audio_stream_info; + audio_stream_info.channels = 2; + audio_stream_info.bits_per_sample = 16; + audio_stream_info.sample_rate = this->sample_rate_; + + this->speaker_->set_audio_stream_info(audio_stream_info); + } + + if (this->audio_mixer_ == nullptr) { + this->audio_mixer_ = make_unique(); + err = this->audio_mixer_->start(this->speaker_, "mixer", MIXER_TASK_PRIORITY); + if (err != ESP_OK) { + return err; + } + } + + if (type == AudioPipelineType::MEDIA) { + if (this->media_pipeline_ == nullptr) { + this->media_pipeline_ = make_unique(this->audio_mixer_.get(), type); + } + + if (url) { + err = this->media_pipeline_->start(this->media_url_.value(), this->sample_rate_, "media", + MEDIA_PIPELINE_TASK_PRIORITY); + } else { + err = this->media_pipeline_->start(this->media_file_.value(), this->sample_rate_, "media", + MEDIA_PIPELINE_TASK_PRIORITY); + } + + if (this->is_paused_) { + CommandEvent command_event; + command_event.command = CommandEventType::RESUME_MEDIA; + this->audio_mixer_->send_command(&command_event); + } + this->is_paused_ = false; + } else if (type == AudioPipelineType::ANNOUNCEMENT) { + if (this->announcement_pipeline_ == nullptr) { + this->announcement_pipeline_ = make_unique(this->audio_mixer_.get(), type); + } + + if (url) { + err = this->announcement_pipeline_->start(this->announcement_url_.value(), this->sample_rate_, "ann", + ANNOUNCEMENT_PIPELINE_TASK_PRIORITY); + } else { + err = this->announcement_pipeline_->start(this->announcement_file_.value(), this->sample_rate_, "ann", + ANNOUNCEMENT_PIPELINE_TASK_PRIORITY); + } + } + + return err; +} + +void NabuMediaPlayer::watch_media_commands_() { + MediaCallCommand media_command; + CommandEvent command_event; + esp_err_t err = ESP_OK; + + if (xQueueReceive(this->media_control_command_queue_, &media_command, 0) == pdTRUE) { + if (media_command.new_url.has_value() && media_command.new_url.value()) { + if (media_command.announce.has_value() && media_command.announce.value()) { + err = this->start_pipeline_(AudioPipelineType::ANNOUNCEMENT, true); + } else { + err = this->start_pipeline_(AudioPipelineType::MEDIA, true); + } + } + + if (media_command.new_file.has_value() && media_command.new_file.value()) { + if (media_command.announce.has_value() && media_command.announce.value()) { + err = this->start_pipeline_(AudioPipelineType::ANNOUNCEMENT, false); + } else { + err = this->start_pipeline_(AudioPipelineType::MEDIA, false); + } + } + + if (err != ESP_OK) { + ESP_LOGE(TAG, "Error starting the audio pipeline: %s", esp_err_to_name(err)); + this->status_set_error(); + } else { + this->status_clear_error(); + } + + if (media_command.volume.has_value()) { + this->set_volume_(media_command.volume.value()); + this->publish_state(); + } + + if (media_command.command.has_value()) { + switch (media_command.command.value()) { + case media_player::MEDIA_PLAYER_COMMAND_PLAY: + if ((this->audio_mixer_ != nullptr) && this->is_paused_) { + command_event.command = CommandEventType::RESUME_MEDIA; + this->audio_mixer_->send_command(&command_event); + } + this->is_paused_ = false; + break; + case media_player::MEDIA_PLAYER_COMMAND_PAUSE: + if ((this->audio_mixer_ != nullptr) && !this->is_paused_) { + 
command_event.command = CommandEventType::PAUSE_MEDIA; + this->audio_mixer_->send_command(&command_event); + } + this->is_paused_ = true; + break; + case media_player::MEDIA_PLAYER_COMMAND_STOP: + command_event.command = CommandEventType::STOP; + if (media_command.announce.has_value() && media_command.announce.value()) { + if (this->announcement_pipeline_ != nullptr) { + this->announcement_pipeline_->stop(); + } + } else { + if (this->media_pipeline_ != nullptr) { + this->media_pipeline_->stop(); + } + } + break; + case media_player::MEDIA_PLAYER_COMMAND_TOGGLE: + if ((this->audio_mixer_ != nullptr) && this->is_paused_) { + command_event.command = CommandEventType::RESUME_MEDIA; + this->audio_mixer_->send_command(&command_event); + this->is_paused_ = false; + } else if (this->audio_mixer_ != nullptr) { + command_event.command = CommandEventType::PAUSE_MEDIA; + this->audio_mixer_->send_command(&command_event); + this->is_paused_ = true; + } + break; + case media_player::MEDIA_PLAYER_COMMAND_MUTE: { + this->set_mute_state_(true); + + this->publish_state(); + break; + } + case media_player::MEDIA_PLAYER_COMMAND_UNMUTE: + this->set_mute_state_(false); + this->publish_state(); + break; + case media_player::MEDIA_PLAYER_COMMAND_VOLUME_UP: + this->set_volume_(std::min(1.0f, this->volume + this->volume_increment_)); + this->publish_state(); + break; + case media_player::MEDIA_PLAYER_COMMAND_VOLUME_DOWN: + this->set_volume_(std::max(0.0f, this->volume - this->volume_increment_)); + this->publish_state(); + break; + default: + break; + } + } + } +} + +void NabuMediaPlayer::watch_mixer_() { + TaskEvent event; + if (this->audio_mixer_ != nullptr) { + while (this->audio_mixer_->read_event(&event)) + if (event.type == EventType::WARNING) { + ESP_LOGD(TAG, "Mixer encountered an error: %s", esp_err_to_name(event.err)); + this->status_set_error(); + } + } +} + +void NabuMediaPlayer::loop() { + this->watch_media_commands_(); + this->watch_mixer_(); + + // Determine state of the media player + media_player::MediaPlayerState old_state = this->state; + + if (this->announcement_pipeline_ != nullptr) + this->announcement_pipeline_state_ = this->announcement_pipeline_->get_state(); + + if (this->media_pipeline_ != nullptr) + this->media_pipeline_state_ = this->media_pipeline_->get_state(); + + if (this->media_pipeline_state_ == AudioPipelineState::ERROR_READING) { + ESP_LOGE(TAG, "The media pipeline's file reader encountered an error."); + } else if (this->media_pipeline_state_ == AudioPipelineState::ERROR_DECODING) { + ESP_LOGE(TAG, "The media pipeline's audio decoder encountered an error."); + } else if (this->media_pipeline_state_ == AudioPipelineState::ERROR_RESAMPLING) { + ESP_LOGE(TAG, "The media pipeline's audio resampler encountered an error."); + } + + if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_READING) { + ESP_LOGE(TAG, "The announcement pipeline's file reader encountered an error."); + } else if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_DECODING) { + ESP_LOGE(TAG, "The announcement pipeline's audio decoder encountered an error."); + } else if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_RESAMPLING) { + ESP_LOGE(TAG, "The announcement pipeline's audio resampler encountered an error."); + } + + if (this->announcement_pipeline_state_ != AudioPipelineState::STOPPED) { + this->state = media_player::MEDIA_PLAYER_STATE_ANNOUNCING; + } else { + if (this->media_pipeline_state_ == AudioPipelineState::STOPPED) { + this->state = 
media_player::MEDIA_PLAYER_STATE_IDLE; + } else if (this->is_paused_) { + this->state = media_player::MEDIA_PLAYER_STATE_PAUSED; + } else { + this->state = media_player::MEDIA_PLAYER_STATE_PLAYING; + } + } + + if (this->state != old_state) { + this->publish_state(); + } +} + +void NabuMediaPlayer::set_ducking_reduction(uint8_t decibel_reduction, float duration) { + if (this->audio_mixer_ != nullptr) { + CommandEvent command_event; + command_event.command = CommandEventType::DUCK; + command_event.decibel_reduction = decibel_reduction; + + // Convert the duration in seconds to number of samples, accounting for the sample rate and number of channels + command_event.transition_samples = static_cast(duration * this->sample_rate_ * NUMBER_OF_CHANNELS); + this->audio_mixer_->send_command(&command_event); + } +} + +void NabuMediaPlayer::play_file(MediaFile *media_file, bool announcement) { + if (!this->is_ready()) { + // Ignore any commands sent before the media player is setup + return; + } + + MediaCallCommand media_command; + + media_command.new_file = true; + if (announcement) { + this->announcement_file_ = media_file; + media_command.announce = true; + } else { + this->media_file_ = media_file; + media_command.announce = false; + } + xQueueSend(this->media_control_command_queue_, &media_command, portMAX_DELAY); +} + +void NabuMediaPlayer::control(const media_player::MediaPlayerCall &call) { + if (!this->is_ready()) { + // Ignore any commands sent before the media player is setup + return; + } + + MediaCallCommand media_command; + + if (call.get_announcement().has_value() && call.get_announcement().value()) { + media_command.announce = true; + } else { + media_command.announce = false; + } + + if (call.get_media_url().has_value()) { + std::string new_uri = call.get_media_url().value(); + + media_command.new_url = true; + if (call.get_announcement().has_value() && call.get_announcement().value()) { + this->announcement_url_ = new_uri; + } else { + this->media_url_ = new_uri; + } + xQueueSend(this->media_control_command_queue_, &media_command, portMAX_DELAY); + return; + } + + if (call.get_volume().has_value()) { + media_command.volume = call.get_volume().value(); + // Wait 0 ticks for queue to be free, volume sets aren't that important! + xQueueSend(this->media_control_command_queue_, &media_command, 0); + return; + } + + if (call.get_command().has_value()) { + media_command.command = call.get_command().value(); + TickType_t ticks_to_wait = portMAX_DELAY; + if ((call.get_command().value() == media_player::MEDIA_PLAYER_COMMAND_VOLUME_UP) || + (call.get_command().value() == media_player::MEDIA_PLAYER_COMMAND_VOLUME_DOWN)) { + ticks_to_wait = 0; // Wait 0 ticks for queue to be free, volume sets aren't that important! 
+ } + xQueueSend(this->media_control_command_queue_, &media_command, ticks_to_wait); + return; + } +} + +media_player::MediaPlayerTraits NabuMediaPlayer::get_traits() { + auto traits = media_player::MediaPlayerTraits(); + traits.set_supports_pause(true); + traits.get_supported_formats().push_back( + media_player::MediaPlayerSupportedFormat{.format = "flac", + .sample_rate = this->sample_rate_, + .num_channels = 2, + .purpose = media_player::MediaPlayerFormatPurpose::PURPOSE_DEFAULT, + .sample_bytes = 2}); + traits.get_supported_formats().push_back( + media_player::MediaPlayerSupportedFormat{.format = "flac", + .sample_rate = this->sample_rate_, + .num_channels = 1, + .purpose = media_player::MediaPlayerFormatPurpose::PURPOSE_ANNOUNCEMENT, + .sample_bytes = 2}); + return traits; +}; + +void NabuMediaPlayer::save_volume_restore_state_() { + VolumeRestoreState volume_restore_state; + volume_restore_state.volume = this->volume; + volume_restore_state.is_muted = this->is_muted_; + this->pref_.save(&volume_restore_state); +} + +void NabuMediaPlayer::set_mute_state_(bool mute_state) { + this->speaker_->set_mute_state(mute_state); + + bool old_mute_state = this->is_muted_; + this->is_muted_ = mute_state; + + this->save_volume_restore_state_(); + + if (old_mute_state != mute_state) { + if (mute_state) { + this->defer([this]() { this->mute_trigger_->trigger(); }); + } else { + this->defer([this]() { this->unmute_trigger_->trigger(); }); + } + } +} + +void NabuMediaPlayer::set_volume_(float volume, bool publish) { + // Remap the volume to fit with in the configured limits + float bounded_volume = remap(volume, 0.0f, 1.0f, this->volume_min_, this->volume_max_); + + this->speaker_->set_volume(bounded_volume); + + if (publish) { + this->volume = volume; + this->save_volume_restore_state_(); + } + + this->defer([this, volume]() { this->volume_trigger_->trigger(volume); }); +} + +} // namespace nabu +} // namespace esphome +#endif diff --git a/esphome/components/nabu/nabu_media_player.h b/esphome/components/nabu/nabu_media_player.h new file mode 100644 index 0000000000..a901ad5aa4 --- /dev/null +++ b/esphome/components/nabu/nabu_media_player.h @@ -0,0 +1,133 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "audio_mixer.h" +#include "audio_pipeline.h" + +#include "nabu_media_helpers.h" + +#include "esphome/components/media_player/media_player.h" +#include "esphome/components/speaker/speaker.h" + +#include "esphome/core/automation.h" +#include "esphome/core/component.h" +#include "esphome/core/preferences.h" + +#include +#include + +#include + +namespace esphome { +namespace nabu { + +struct MediaCallCommand { + optional command; + optional volume; + optional announce; + optional new_url; + optional new_file; +}; + +struct VolumeRestoreState { + float volume; + bool is_muted; +}; + +class NabuMediaPlayer : public Component, public media_player::MediaPlayer { + public: + float get_setup_priority() const override { return esphome::setup_priority::LATE; } + void setup() override; + void loop() override; + + // MediaPlayer implementations + media_player::MediaPlayerTraits get_traits() override; + bool is_muted() const override { return this->is_muted_; } + + /// @brief Sets the ducking level for the media stream in the mixer + /// @param decibel_reduction (uint8_t) The dB reduction level. 
For example, 0 is no change, 10 is a reduction by 10 dB + /// @param duration (float) The duration (in seconds) for transitioning to the new ducking level + void set_ducking_reduction(uint8_t decibel_reduction, float duration); + + void set_sample_rate(uint32_t sample_rate) { this->sample_rate_ = sample_rate; } + + // Percentage to increase or decrease the volume for volume up or volume down commands + void set_volume_increment(float volume_increment) { this->volume_increment_ = volume_increment; } + + void set_volume_max(float volume_max) { this->volume_max_ = volume_max; } + void set_volume_min(float volume_min) { this->volume_min_ = volume_min; } + + void set_speaker(speaker::Speaker *speaker) { this->speaker_ = speaker; } + + Trigger<> *get_mute_trigger() const { return this->mute_trigger_; } + Trigger<> *get_unmute_trigger() const { return this->unmute_trigger_; } + Trigger *get_volume_trigger() const { return this->volume_trigger_; } + + void play_file(MediaFile *media_file, bool announcement); + + protected: + // Receives commands from HA or from the voice assistant component + // Sends commands to the media_control_command_queue_ + void control(const media_player::MediaPlayerCall &call) override; + + /// @brief Updates this->volume and saves volume/mute state to flash for restoration if publish is true. + void set_volume_(float volume, bool publish = true); + + /// @brief Sets the mute state. Restores previous volume if unmuting. Always saves volume/mute state to flash for + /// restoration. + /// @param mute_state If true, audio will be muted. If false, audio will be unmuted + void set_mute_state_(bool mute_state); + + /// @brief Saves the current volume and mute state to the flash for restoration. + void save_volume_restore_state_(); + + // Reads commands from media_control_command_queue_. Starts pipelines and mixer if necessary. + void watch_media_commands_(); + + std::unique_ptr media_pipeline_; + std::unique_ptr announcement_pipeline_; + std::unique_ptr audio_mixer_; + + speaker::Speaker *speaker_{nullptr}; + + // Monitors the mixer task + void watch_mixer_(); + + // Starts the ``type`` pipeline with a ``url`` or file. Starts the mixer, pipeline, and speaker tasks if necessary. + // Unpauses if starting media in paused state + esp_err_t start_pipeline_(AudioPipelineType type, bool url); + + AudioPipelineState media_pipeline_state_{AudioPipelineState::STOPPED}; + AudioPipelineState announcement_pipeline_state_{AudioPipelineState::STOPPED}; + + optional media_url_{}; // only modified by control function + optional announcement_url_{}; // only modified by control function + optional media_file_{}; // only modified by play_file function + optional announcement_file_{}; // only modified by play_file function + + QueueHandle_t media_control_command_queue_; + + uint32_t sample_rate_; + + bool is_paused_{false}; + bool is_muted_{false}; + + // The amount to change the volume on volume up/down commands + float volume_increment_; + + float volume_max_; + float volume_min_; + + // Used to save volume/mute state for restoration on reboot + ESPPreferenceObject pref_; + + Trigger<> *mute_trigger_ = new Trigger<>(); + Trigger<> *unmute_trigger_ = new Trigger<>(); + Trigger *volume_trigger_ = new Trigger(); +}; + +} // namespace nabu +} // namespace esphome + +#endif
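A note on the ducking arithmetic used above: ``NabuMediaPlayer::set_ducking_reduction`` converts the requested transition duration into a count of interleaved samples (``duration * sample_rate_ * NUMBER_OF_CHANNELS``), and a reduction expressed in decibels corresponds to a linear gain of 10^(-dB/20). The sketch below is illustrative only; the ``AudioMixer`` implementation is not part of this patch excerpt, and the helper names here are hypothetical rather than taken from the component.

// Illustrative sketch only -- not part of the nabu component sources above.
// Shows how a ducking request of (decibel_reduction, duration) maps to a linear
// gain factor and a transition length in interleaved samples.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>

static const uint8_t NUMBER_OF_CHANNELS = 2;  // matches the hard-coded stereo output

// Standard dB-to-linear conversion: a 10 dB reduction scales samples by ~0.316.
float ducking_gain_from_db(uint8_t decibel_reduction) {
  return std::pow(10.0f, -static_cast<float>(decibel_reduction) / 20.0f);
}

// Mirrors the conversion in NabuMediaPlayer::set_ducking_reduction: seconds to
// interleaved int16 samples at the configured output rate.
size_t transition_samples(float duration_s, uint32_t sample_rate) {
  return static_cast<size_t>(duration_s * sample_rate * NUMBER_OF_CHANNELS);
}

int main() {
  // e.g. duck media by 20 dB over 0.5 s at a 48 kHz output rate
  std::printf("gain=%.3f, transition=%zu samples\n", ducking_gain_from_db(20), transition_samples(0.5f, 48000));
  return 0;
}

The ``nabu.set_ducking`` action's schema caps ``decibel_reduction`` at 51 dB, which by the same relation is a linear gain of roughly 0.003, i.e. the media stream becomes effectively inaudible.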