From 6f71363d9b9b2b35f13ff6bdde5870e9135e4977 Mon Sep 17 00:00:00 2001 From: Jesse Hills <3060199+jesserockz@users.noreply.github.com> Date: Mon, 8 Apr 2024 16:19:22 +1200 Subject: [PATCH] Send/Receive Voice Assistant audio via API (#6471) Co-authored-by: Michael Hansen --- esphome/components/api/api.proto | 19 ++- esphome/components/api/api_connection.cpp | 27 ++++- esphome/components/api/api_connection.h | 1 + esphome/components/api/api_pb2.cpp | 83 ++++++++++++- esphome/components/api/api_pb2.h | 21 +++- esphome/components/api/api_pb2_service.cpp | 19 +++ esphome/components/api/api_pb2_service.h | 4 + .../voice_assistant/voice_assistant.cpp | 109 +++++++++++++----- .../voice_assistant/voice_assistant.h | 40 ++++++- 9 files changed, 275 insertions(+), 48 deletions(-) diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto index 7efc7aef64..6b685b8974 100644 --- a/esphome/components/api/api.proto +++ b/esphome/components/api/api.proto @@ -217,7 +217,8 @@ message DeviceInfoResponse { string friendly_name = 13; - uint32 voice_assistant_version = 14; + uint32 legacy_voice_assistant_version = 14; + uint32 voice_assistant_feature_flags = 17; string suggested_area = 16; } @@ -1422,12 +1423,18 @@ message BluetoothDeviceClearCacheResponse { } // ==================== PUSH TO TALK ==================== +enum VoiceAssistantSubscribeFlag { + VOICE_ASSISTANT_SUBSCRIBE_NONE = 0; + VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1; +} + message SubscribeVoiceAssistantRequest { option (id) = 89; option (source) = SOURCE_CLIENT; option (ifdef) = "USE_VOICE_ASSISTANT"; bool subscribe = 1; + uint32 flags = 2; } enum VoiceAssistantRequestFlag { @@ -1495,6 +1502,16 @@ message VoiceAssistantEventResponse { repeated VoiceAssistantEventData data = 2; } +message VoiceAssistantAudio { + option (id) = 106; + option (source) = SOURCE_BOTH; + option (ifdef) = "USE_VOICE_ASSISTANT"; + + bytes data = 1; + bool end = 2; +} + + // ==================== ALARM CONTROL PANEL ==================== enum AlarmControlPanelState { ALARM_STATE_DISARMED = 0; diff --git a/esphome/components/api/api_connection.cpp b/esphome/components/api/api_connection.cpp index bd4790df95..e9607f7f77 100644 --- a/esphome/components/api/api_connection.cpp +++ b/esphome/components/api/api_connection.cpp @@ -1040,10 +1040,15 @@ void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &ms voice_assistant::global_voice_assistant->failed_to_start(); return; } - struct sockaddr_storage storage; - socklen_t len = sizeof(storage); - this->helper_->getpeername((struct sockaddr *) &storage, &len); - voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); + if (msg.port == 0) { + // Use API Audio + voice_assistant::global_voice_assistant->start_streaming(); + } else { + struct sockaddr_storage storage; + socklen_t len = sizeof(storage); + this->helper_->getpeername((struct sockaddr *) &storage, &len); + voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); + } } }; void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) { @@ -1055,6 +1060,15 @@ void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventR voice_assistant::global_voice_assistant->on_event(msg); } } +void APIConnection::on_voice_assistant_audio(const VoiceAssistantAudio &msg) { + if (voice_assistant::global_voice_assistant != nullptr) { + if (voice_assistant::global_voice_assistant->get_api_connection() != this) { + return; + } + + voice_assistant::global_voice_assistant->on_audio(msg); + } +}; #endif @@ -1142,7 +1156,7 @@ HelloResponse APIConnection::hello(const HelloRequest &msg) { HelloResponse resp; resp.api_version_major = 1; - resp.api_version_minor = 9; + resp.api_version_minor = 10; resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")"; resp.name = App.get_name(); @@ -1203,7 +1217,8 @@ DeviceInfoResponse APIConnection::device_info(const DeviceInfoRequest &msg) { resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags(); #endif #ifdef USE_VOICE_ASSISTANT - resp.voice_assistant_version = voice_assistant::global_voice_assistant->get_version(); + resp.legacy_voice_assistant_version = voice_assistant::global_voice_assistant->get_legacy_version(); + resp.voice_assistant_feature_flags = voice_assistant::global_voice_assistant->get_feature_flags(); #endif return resp; } diff --git a/esphome/components/api/api_connection.h b/esphome/components/api/api_connection.h index 6fe6e0d509..c19c209969 100644 --- a/esphome/components/api/api_connection.h +++ b/esphome/components/api/api_connection.h @@ -134,6 +134,7 @@ class APIConnection : public APIServerConnection { void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override; void on_voice_assistant_response(const VoiceAssistantResponse &msg) override; void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override; + void on_voice_assistant_audio(const VoiceAssistantAudio &msg) override; #endif #ifdef USE_ALARM_CONTROL_PANEL diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp index 32654f3148..f6325d0854 100644 --- a/esphome/components/api/api_pb2.cpp +++ b/esphome/components/api/api_pb2.cpp @@ -410,6 +410,19 @@ const char *proto_enum_to_string(enums::Bluet } #endif #ifdef HAS_PROTO_MESSAGE_DUMP +template<> +const char *proto_enum_to_string(enums::VoiceAssistantSubscribeFlag value) { + switch (value) { + case enums::VOICE_ASSISTANT_SUBSCRIBE_NONE: + return "VOICE_ASSISTANT_SUBSCRIBE_NONE"; + case enums::VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO: + return "VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO"; + default: + return "UNKNOWN"; + } +} +#endif +#ifdef HAS_PROTO_MESSAGE_DUMP template<> const char *proto_enum_to_string(enums::VoiceAssistantRequestFlag value) { switch (value) { case enums::VOICE_ASSISTANT_REQUEST_NONE: @@ -716,7 +729,11 @@ bool DeviceInfoResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { return true; } case 14: { - this->voice_assistant_version = value.as_uint32(); + this->legacy_voice_assistant_version = value.as_uint32(); + return true; + } + case 17: { + this->voice_assistant_feature_flags = value.as_uint32(); return true; } default: @@ -784,7 +801,8 @@ void DeviceInfoResponse::encode(ProtoWriteBuffer buffer) const { buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags); buffer.encode_string(12, this->manufacturer); buffer.encode_string(13, this->friendly_name); - buffer.encode_uint32(14, this->voice_assistant_version); + buffer.encode_uint32(14, this->legacy_voice_assistant_version); + buffer.encode_uint32(17, this->voice_assistant_feature_flags); buffer.encode_string(16, this->suggested_area); } #ifdef HAS_PROTO_MESSAGE_DUMP @@ -850,8 +868,13 @@ void DeviceInfoResponse::dump_to(std::string &out) const { out.append("'").append(this->friendly_name).append("'"); out.append("\n"); - out.append(" voice_assistant_version: "); - sprintf(buffer, "%" PRIu32, this->voice_assistant_version); + out.append(" legacy_voice_assistant_version: "); + sprintf(buffer, "%" PRIu32, this->legacy_voice_assistant_version); + out.append(buffer); + out.append("\n"); + + out.append(" voice_assistant_feature_flags: "); + sprintf(buffer, "%" PRIu32, this->voice_assistant_feature_flags); out.append(buffer); out.append("\n"); @@ -6514,11 +6537,18 @@ bool SubscribeVoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarIn this->subscribe = value.as_bool(); return true; } + case 2: { + this->flags = value.as_uint32(); + return true; + } default: return false; } } -void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->subscribe); } +void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { + buffer.encode_bool(1, this->subscribe); + buffer.encode_uint32(2, this->flags); +} #ifdef HAS_PROTO_MESSAGE_DUMP void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; @@ -6526,6 +6556,11 @@ void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { out.append(" subscribe: "); out.append(YESNO(this->subscribe)); out.append("\n"); + + out.append(" flags: "); + sprintf(buffer, "%" PRIu32, this->flags); + out.append(buffer); + out.append("\n"); out.append("}"); } #endif @@ -6752,6 +6787,44 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const { out.append("}"); } #endif +bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) { + switch (field_id) { + case 2: { + this->end = value.as_bool(); + return true; + } + default: + return false; + } +} +bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) { + switch (field_id) { + case 1: { + this->data = value.as_string(); + return true; + } + default: + return false; + } +} +void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const { + buffer.encode_string(1, this->data); + buffer.encode_bool(2, this->end); +} +#ifdef HAS_PROTO_MESSAGE_DUMP +void VoiceAssistantAudio::dump_to(std::string &out) const { + __attribute__((unused)) char buffer[64]; + out.append("VoiceAssistantAudio {\n"); + out.append(" data: "); + out.append("'").append(this->data).append("'"); + out.append("\n"); + + out.append(" end: "); + out.append(YESNO(this->end)); + out.append("\n"); + out.append("}"); +} +#endif bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { switch (field_id) { case 6: { diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h index f9847a6a19..e361e6b8e1 100644 --- a/esphome/components/api/api_pb2.h +++ b/esphome/components/api/api_pb2.h @@ -165,6 +165,10 @@ enum BluetoothDeviceRequestType : uint32_t { BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5, BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6, }; +enum VoiceAssistantSubscribeFlag : uint32_t { + VOICE_ASSISTANT_SUBSCRIBE_NONE = 0, + VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1, +}; enum VoiceAssistantRequestFlag : uint32_t { VOICE_ASSISTANT_REQUEST_NONE = 0, VOICE_ASSISTANT_REQUEST_USE_VAD = 1, @@ -327,7 +331,8 @@ class DeviceInfoResponse : public ProtoMessage { uint32_t bluetooth_proxy_feature_flags{0}; std::string manufacturer{}; std::string friendly_name{}; - uint32_t voice_assistant_version{0}; + uint32_t legacy_voice_assistant_version{0}; + uint32_t voice_assistant_feature_flags{0}; std::string suggested_area{}; void encode(ProtoWriteBuffer buffer) const override; #ifdef HAS_PROTO_MESSAGE_DUMP @@ -1674,6 +1679,7 @@ class BluetoothDeviceClearCacheResponse : public ProtoMessage { class SubscribeVoiceAssistantRequest : public ProtoMessage { public: bool subscribe{false}; + uint32_t flags{0}; void encode(ProtoWriteBuffer buffer) const override; #ifdef HAS_PROTO_MESSAGE_DUMP void dump_to(std::string &out) const override; @@ -1749,6 +1755,19 @@ class VoiceAssistantEventResponse : public ProtoMessage { bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; bool decode_varint(uint32_t field_id, ProtoVarInt value) override; }; +class VoiceAssistantAudio : public ProtoMessage { + public: + std::string data{}; + bool end{false}; + void encode(ProtoWriteBuffer buffer) const override; +#ifdef HAS_PROTO_MESSAGE_DUMP + void dump_to(std::string &out) const override; +#endif + + protected: + bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; + bool decode_varint(uint32_t field_id, ProtoVarInt value) override; +}; class ListEntitiesAlarmControlPanelResponse : public ProtoMessage { public: std::string object_id{}; diff --git a/esphome/components/api/api_pb2_service.cpp b/esphome/components/api/api_pb2_service.cpp index 4e61893bae..2067f530ce 100644 --- a/esphome/components/api/api_pb2_service.cpp +++ b/esphome/components/api/api_pb2_service.cpp @@ -476,6 +476,14 @@ bool APIServerConnectionBase::send_voice_assistant_request(const VoiceAssistantR #endif #ifdef USE_VOICE_ASSISTANT #endif +#ifdef USE_VOICE_ASSISTANT +bool APIServerConnectionBase::send_voice_assistant_audio(const VoiceAssistantAudio &msg) { +#ifdef HAS_PROTO_MESSAGE_DUMP + ESP_LOGVV(TAG, "send_voice_assistant_audio: %s", msg.dump().c_str()); +#endif + return this->send_message_(msg, 106); +} +#endif #ifdef USE_ALARM_CONTROL_PANEL bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response( const ListEntitiesAlarmControlPanelResponse &msg) { @@ -971,6 +979,17 @@ bool APIServerConnectionBase::read_message(uint32_t msg_size, uint32_t msg_type, ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str()); #endif this->on_date_command_request(msg); +#endif + break; + } + case 106: { +#ifdef USE_VOICE_ASSISTANT + VoiceAssistantAudio msg; + msg.decode(msg_data, msg_size); +#ifdef HAS_PROTO_MESSAGE_DUMP + ESP_LOGVV(TAG, "on_voice_assistant_audio: %s", msg.dump().c_str()); +#endif + this->on_voice_assistant_audio(msg); #endif break; } diff --git a/esphome/components/api/api_pb2_service.h b/esphome/components/api/api_pb2_service.h index a3c53a7534..effcfc30f4 100644 --- a/esphome/components/api/api_pb2_service.h +++ b/esphome/components/api/api_pb2_service.h @@ -240,6 +240,10 @@ class APIServerConnectionBase : public ProtoService { #ifdef USE_VOICE_ASSISTANT virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){}; #endif +#ifdef USE_VOICE_ASSISTANT + bool send_voice_assistant_audio(const VoiceAssistantAudio &msg); + virtual void on_voice_assistant_audio(const VoiceAssistantAudio &value){}; +#endif #ifdef USE_ALARM_CONTROL_PANEL bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg); #endif diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 49b8fdc959..34a26eec01 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -24,28 +24,24 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE; float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } -void VoiceAssistant::setup() { - ESP_LOGCONFIG(TAG, "Setting up Voice Assistant..."); - - global_voice_assistant = this; - +bool VoiceAssistant::start_udp_socket_() { this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); - if (socket_ == nullptr) { - ESP_LOGW(TAG, "Could not create socket"); + if (this->socket_ == nullptr) { + ESP_LOGE(TAG, "Could not create socket"); this->mark_failed(); - return; + return false; } int enable = 1; - int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); if (err != 0) { ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err); // we can still continue } - err = socket_->setblocking(false); + err = this->socket_->setblocking(false); if (err != 0) { - ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err); + ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err); this->mark_failed(); - return; + return false; } #ifdef USE_SPEAKER @@ -54,18 +50,30 @@ void VoiceAssistant::setup() { socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055); if (sl == 0) { - ESP_LOGW(TAG, "Socket unable to set sockaddr: errno %d", errno); + ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno); this->mark_failed(); - return; + return false; } - err = socket_->bind((struct sockaddr *) &server, sizeof(server)); + err = this->socket_->bind((struct sockaddr *) &server, sizeof(server)); if (err != 0) { - ESP_LOGW(TAG, "Socket unable to bind: errno %d", errno); + ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno); this->mark_failed(); - return; + return false; } + } +#endif + this->udp_socket_running_ = true; + return true; +} +void VoiceAssistant::setup() { + ESP_LOGCONFIG(TAG, "Setting up Voice Assistant..."); + + global_voice_assistant = this; + +#ifdef USE_SPEAKER + if (this->speaker_ != nullptr) { ExternalRAMAllocator speaker_allocator(ExternalRAMAllocator::ALLOW_FAILURE); this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); if (this->speaker_buffer_ == nullptr) { @@ -238,8 +246,20 @@ void VoiceAssistant::loop() { size_t available = this->ring_buffer_->available(); while (available >= SEND_BUFFER_SIZE) { size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); - this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, - sizeof(this->dest_addr_)); + if (this->audio_mode_ == AUDIO_MODE_API) { + api::VoiceAssistantAudio msg; + msg.data.assign((char *) this->send_buffer_, read_bytes); + this->api_client_->send_voice_assistant_audio(msg); + } else { + if (!this->udp_socket_running_) { + if (!this->start_udp_socket_()) { + this->set_state_(State::STOP_MICROPHONE, State::IDLE); + break; + } + } + this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, + sizeof(this->dest_addr_)); + } available = this->ring_buffer_->available(); } @@ -268,22 +288,25 @@ void VoiceAssistant::loop() { #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { ssize_t received_len = 0; - if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { - received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); - if (received_len > 0) { - this->speaker_buffer_index_ += received_len; - this->speaker_buffer_size_ += received_len; - this->speaker_bytes_received_ += received_len; + if (this->audio_mode_ == AUDIO_MODE_UDP) { + if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { + received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); + if (received_len > 0) { + this->speaker_buffer_index_ += received_len; + this->speaker_buffer_size_ += received_len; + this->speaker_bytes_received_ += received_len; + } + } else { + ESP_LOGD(TAG, "Receive buffer full"); } - } else { - ESP_LOGD(TAG, "Receive buffer full"); } // Build a small buffer of audio before sending to the speaker - if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4) + bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0); + if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream) this->write_speaker_(); if (this->wait_for_stream_end_) { this->cancel_timeout("playing"); - if (this->stream_ended_ && received_len < 0) { + if (end_of_stream) { ESP_LOGD(TAG, "End of audio stream received"); this->cancel_timeout("speaker-timeout"); this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); @@ -428,6 +451,22 @@ void VoiceAssistant::failed_to_start() { this->set_state_(State::STOP_MICROPHONE, State::IDLE); } +void VoiceAssistant::start_streaming() { + if (this->state_ != State::STARTING_PIPELINE) { + this->signal_stop_(); + return; + } + + ESP_LOGD(TAG, "Client started, streaming microphone"); + this->audio_mode_ = AUDIO_MODE_API; + + if (this->mic_->is_running()) { + this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); + } else { + this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); + } +} + void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) { if (this->state_ != State::STARTING_PIPELINE) { this->signal_stop_(); @@ -435,6 +474,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por } ESP_LOGD(TAG, "Client started, streaming microphone"); + this->audio_mode_ = AUDIO_MODE_UDP; memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_)); if (this->dest_addr_.ss_family == AF_INET) { @@ -688,6 +728,17 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { } } +void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) { + if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) { + memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length()); + this->speaker_buffer_index_ += msg.data.length(); + this->speaker_buffer_size_ += msg.data.length(); + this->speaker_bytes_received_ += msg.data.length(); + } else { + ESP_LOGE(TAG, "Cannot receive audio, buffer is full"); + } +} + VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) } // namespace voice_assistant diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index 14352bf3ae..d6b1502381 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -29,9 +29,14 @@ namespace voice_assistant { // Version 1: Initial version // Version 2: Adds raw speaker support -// Version 3: Unused/skip -static const uint32_t INITIAL_VERSION = 1; -static const uint32_t SPEAKER_SUPPORT = 2; +static const uint32_t LEGACY_INITIAL_VERSION = 1; +static const uint32_t LEGACY_SPEAKER_SUPPORT = 2; + +enum VoiceAssistantFeature : uint32_t { + FEATURE_VOICE_ASSISTANT = 1 << 0, + FEATURE_SPEAKER = 1 << 1, + FEATURE_API_AUDIO = 1 << 2, +}; enum class State { IDLE, @@ -49,11 +54,17 @@ enum class State { RESPONSE_FINISHED, }; +enum AudioMode : uint8_t { + AUDIO_MODE_UDP, + AUDIO_MODE_API, +}; + class VoiceAssistant : public Component { public: void setup() override; void loop() override; float get_setup_priority() const override; + void start_streaming(); void start_streaming(struct sockaddr_storage *addr, uint16_t port); void failed_to_start(); @@ -71,19 +82,32 @@ class VoiceAssistant : public Component { } #endif - uint32_t get_version() const { + uint32_t get_legacy_version() const { #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { - return SPEAKER_SUPPORT; + return LEGACY_SPEAKER_SUPPORT; } #endif - return INITIAL_VERSION; + return LEGACY_INITIAL_VERSION; + } + + uint32_t get_feature_flags() const { + uint32_t flags = 0; + flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT; +#ifdef USE_SPEAKER + if (this->speaker_ != nullptr) { + flags |= VoiceAssistantFeature::FEATURE_SPEAKER; + flags |= VoiceAssistantFeature::FEATURE_API_AUDIO; + } +#endif + return flags; } void request_start(bool continuous, bool silence_detection); void request_stop(); void on_event(const api::VoiceAssistantEventResponse &msg); + void on_audio(const api::VoiceAssistantAudio &msg); bool is_running() const { return this->state_ != State::IDLE; } void set_continuous(bool continuous) { this->continuous_ = continuous; } @@ -201,6 +225,10 @@ class VoiceAssistant : public Component { State state_{State::IDLE}; State desired_state_{State::IDLE}; + + AudioMode audio_mode_{AUDIO_MODE_UDP}; + bool udp_socket_running_{false}; + bool start_udp_socket_(); }; template class StartAction : public Action, public Parented {