mirror of
https://github.com/esphome/esphome.git
synced 2024-12-22 13:34:54 +01:00
Send/Receive Voice Assistant audio via API (#6471)
Co-authored-by: Michael Hansen <mike@rhasspy.org>
This commit is contained in:
parent
97ff87b718
commit
6f71363d9b
9 changed files with 275 additions and 48 deletions
|
@ -217,7 +217,8 @@ message DeviceInfoResponse {
|
||||||
|
|
||||||
string friendly_name = 13;
|
string friendly_name = 13;
|
||||||
|
|
||||||
uint32 voice_assistant_version = 14;
|
uint32 legacy_voice_assistant_version = 14;
|
||||||
|
uint32 voice_assistant_feature_flags = 17;
|
||||||
|
|
||||||
string suggested_area = 16;
|
string suggested_area = 16;
|
||||||
}
|
}
|
||||||
|
@ -1422,12 +1423,18 @@ message BluetoothDeviceClearCacheResponse {
|
||||||
}
|
}
|
||||||
|
|
||||||
// ==================== PUSH TO TALK ====================
|
// ==================== PUSH TO TALK ====================
|
||||||
|
enum VoiceAssistantSubscribeFlag {
|
||||||
|
VOICE_ASSISTANT_SUBSCRIBE_NONE = 0;
|
||||||
|
VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1;
|
||||||
|
}
|
||||||
|
|
||||||
message SubscribeVoiceAssistantRequest {
|
message SubscribeVoiceAssistantRequest {
|
||||||
option (id) = 89;
|
option (id) = 89;
|
||||||
option (source) = SOURCE_CLIENT;
|
option (source) = SOURCE_CLIENT;
|
||||||
option (ifdef) = "USE_VOICE_ASSISTANT";
|
option (ifdef) = "USE_VOICE_ASSISTANT";
|
||||||
|
|
||||||
bool subscribe = 1;
|
bool subscribe = 1;
|
||||||
|
uint32 flags = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
enum VoiceAssistantRequestFlag {
|
enum VoiceAssistantRequestFlag {
|
||||||
|
@ -1495,6 +1502,16 @@ message VoiceAssistantEventResponse {
|
||||||
repeated VoiceAssistantEventData data = 2;
|
repeated VoiceAssistantEventData data = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message VoiceAssistantAudio {
|
||||||
|
option (id) = 106;
|
||||||
|
option (source) = SOURCE_BOTH;
|
||||||
|
option (ifdef) = "USE_VOICE_ASSISTANT";
|
||||||
|
|
||||||
|
bytes data = 1;
|
||||||
|
bool end = 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// ==================== ALARM CONTROL PANEL ====================
|
// ==================== ALARM CONTROL PANEL ====================
|
||||||
enum AlarmControlPanelState {
|
enum AlarmControlPanelState {
|
||||||
ALARM_STATE_DISARMED = 0;
|
ALARM_STATE_DISARMED = 0;
|
||||||
|
|
|
@ -1040,11 +1040,16 @@ void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &ms
|
||||||
voice_assistant::global_voice_assistant->failed_to_start();
|
voice_assistant::global_voice_assistant->failed_to_start();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (msg.port == 0) {
|
||||||
|
// Use API Audio
|
||||||
|
voice_assistant::global_voice_assistant->start_streaming();
|
||||||
|
} else {
|
||||||
struct sockaddr_storage storage;
|
struct sockaddr_storage storage;
|
||||||
socklen_t len = sizeof(storage);
|
socklen_t len = sizeof(storage);
|
||||||
this->helper_->getpeername((struct sockaddr *) &storage, &len);
|
this->helper_->getpeername((struct sockaddr *) &storage, &len);
|
||||||
voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port);
|
voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) {
|
void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) {
|
||||||
if (voice_assistant::global_voice_assistant != nullptr) {
|
if (voice_assistant::global_voice_assistant != nullptr) {
|
||||||
|
@ -1055,6 +1060,15 @@ void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventR
|
||||||
voice_assistant::global_voice_assistant->on_event(msg);
|
voice_assistant::global_voice_assistant->on_event(msg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
void APIConnection::on_voice_assistant_audio(const VoiceAssistantAudio &msg) {
|
||||||
|
if (voice_assistant::global_voice_assistant != nullptr) {
|
||||||
|
if (voice_assistant::global_voice_assistant->get_api_connection() != this) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
voice_assistant::global_voice_assistant->on_audio(msg);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1142,7 +1156,7 @@ HelloResponse APIConnection::hello(const HelloRequest &msg) {
|
||||||
|
|
||||||
HelloResponse resp;
|
HelloResponse resp;
|
||||||
resp.api_version_major = 1;
|
resp.api_version_major = 1;
|
||||||
resp.api_version_minor = 9;
|
resp.api_version_minor = 10;
|
||||||
resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")";
|
resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")";
|
||||||
resp.name = App.get_name();
|
resp.name = App.get_name();
|
||||||
|
|
||||||
|
@ -1203,7 +1217,8 @@ DeviceInfoResponse APIConnection::device_info(const DeviceInfoRequest &msg) {
|
||||||
resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags();
|
resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags();
|
||||||
#endif
|
#endif
|
||||||
#ifdef USE_VOICE_ASSISTANT
|
#ifdef USE_VOICE_ASSISTANT
|
||||||
resp.voice_assistant_version = voice_assistant::global_voice_assistant->get_version();
|
resp.legacy_voice_assistant_version = voice_assistant::global_voice_assistant->get_legacy_version();
|
||||||
|
resp.voice_assistant_feature_flags = voice_assistant::global_voice_assistant->get_feature_flags();
|
||||||
#endif
|
#endif
|
||||||
return resp;
|
return resp;
|
||||||
}
|
}
|
||||||
|
|
|
@ -134,6 +134,7 @@ class APIConnection : public APIServerConnection {
|
||||||
void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override;
|
void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override;
|
||||||
void on_voice_assistant_response(const VoiceAssistantResponse &msg) override;
|
void on_voice_assistant_response(const VoiceAssistantResponse &msg) override;
|
||||||
void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override;
|
void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override;
|
||||||
|
void on_voice_assistant_audio(const VoiceAssistantAudio &msg) override;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef USE_ALARM_CONTROL_PANEL
|
#ifdef USE_ALARM_CONTROL_PANEL
|
||||||
|
|
|
@ -410,6 +410,19 @@ const char *proto_enum_to_string<enums::BluetoothDeviceRequestType>(enums::Bluet
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_PROTO_MESSAGE_DUMP
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
template<>
|
||||||
|
const char *proto_enum_to_string<enums::VoiceAssistantSubscribeFlag>(enums::VoiceAssistantSubscribeFlag value) {
|
||||||
|
switch (value) {
|
||||||
|
case enums::VOICE_ASSISTANT_SUBSCRIBE_NONE:
|
||||||
|
return "VOICE_ASSISTANT_SUBSCRIBE_NONE";
|
||||||
|
case enums::VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO:
|
||||||
|
return "VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO";
|
||||||
|
default:
|
||||||
|
return "UNKNOWN";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
template<> const char *proto_enum_to_string<enums::VoiceAssistantRequestFlag>(enums::VoiceAssistantRequestFlag value) {
|
template<> const char *proto_enum_to_string<enums::VoiceAssistantRequestFlag>(enums::VoiceAssistantRequestFlag value) {
|
||||||
switch (value) {
|
switch (value) {
|
||||||
case enums::VOICE_ASSISTANT_REQUEST_NONE:
|
case enums::VOICE_ASSISTANT_REQUEST_NONE:
|
||||||
|
@ -716,7 +729,11 @@ bool DeviceInfoResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
case 14: {
|
case 14: {
|
||||||
this->voice_assistant_version = value.as_uint32();
|
this->legacy_voice_assistant_version = value.as_uint32();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
case 17: {
|
||||||
|
this->voice_assistant_feature_flags = value.as_uint32();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
@ -784,7 +801,8 @@ void DeviceInfoResponse::encode(ProtoWriteBuffer buffer) const {
|
||||||
buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags);
|
buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags);
|
||||||
buffer.encode_string(12, this->manufacturer);
|
buffer.encode_string(12, this->manufacturer);
|
||||||
buffer.encode_string(13, this->friendly_name);
|
buffer.encode_string(13, this->friendly_name);
|
||||||
buffer.encode_uint32(14, this->voice_assistant_version);
|
buffer.encode_uint32(14, this->legacy_voice_assistant_version);
|
||||||
|
buffer.encode_uint32(17, this->voice_assistant_feature_flags);
|
||||||
buffer.encode_string(16, this->suggested_area);
|
buffer.encode_string(16, this->suggested_area);
|
||||||
}
|
}
|
||||||
#ifdef HAS_PROTO_MESSAGE_DUMP
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
@ -850,8 +868,13 @@ void DeviceInfoResponse::dump_to(std::string &out) const {
|
||||||
out.append("'").append(this->friendly_name).append("'");
|
out.append("'").append(this->friendly_name).append("'");
|
||||||
out.append("\n");
|
out.append("\n");
|
||||||
|
|
||||||
out.append(" voice_assistant_version: ");
|
out.append(" legacy_voice_assistant_version: ");
|
||||||
sprintf(buffer, "%" PRIu32, this->voice_assistant_version);
|
sprintf(buffer, "%" PRIu32, this->legacy_voice_assistant_version);
|
||||||
|
out.append(buffer);
|
||||||
|
out.append("\n");
|
||||||
|
|
||||||
|
out.append(" voice_assistant_feature_flags: ");
|
||||||
|
sprintf(buffer, "%" PRIu32, this->voice_assistant_feature_flags);
|
||||||
out.append(buffer);
|
out.append(buffer);
|
||||||
out.append("\n");
|
out.append("\n");
|
||||||
|
|
||||||
|
@ -6514,11 +6537,18 @@ bool SubscribeVoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarIn
|
||||||
this->subscribe = value.as_bool();
|
this->subscribe = value.as_bool();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
case 2: {
|
||||||
|
this->flags = value.as_uint32();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->subscribe); }
|
void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const {
|
||||||
|
buffer.encode_bool(1, this->subscribe);
|
||||||
|
buffer.encode_uint32(2, this->flags);
|
||||||
|
}
|
||||||
#ifdef HAS_PROTO_MESSAGE_DUMP
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const {
|
void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const {
|
||||||
__attribute__((unused)) char buffer[64];
|
__attribute__((unused)) char buffer[64];
|
||||||
|
@ -6526,6 +6556,11 @@ void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const {
|
||||||
out.append(" subscribe: ");
|
out.append(" subscribe: ");
|
||||||
out.append(YESNO(this->subscribe));
|
out.append(YESNO(this->subscribe));
|
||||||
out.append("\n");
|
out.append("\n");
|
||||||
|
|
||||||
|
out.append(" flags: ");
|
||||||
|
sprintf(buffer, "%" PRIu32, this->flags);
|
||||||
|
out.append(buffer);
|
||||||
|
out.append("\n");
|
||||||
out.append("}");
|
out.append("}");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -6752,6 +6787,44 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const {
|
||||||
out.append("}");
|
out.append("}");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) {
|
||||||
|
switch (field_id) {
|
||||||
|
case 2: {
|
||||||
|
this->end = value.as_bool();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) {
|
||||||
|
switch (field_id) {
|
||||||
|
case 1: {
|
||||||
|
this->data = value.as_string();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const {
|
||||||
|
buffer.encode_string(1, this->data);
|
||||||
|
buffer.encode_bool(2, this->end);
|
||||||
|
}
|
||||||
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
void VoiceAssistantAudio::dump_to(std::string &out) const {
|
||||||
|
__attribute__((unused)) char buffer[64];
|
||||||
|
out.append("VoiceAssistantAudio {\n");
|
||||||
|
out.append(" data: ");
|
||||||
|
out.append("'").append(this->data).append("'");
|
||||||
|
out.append("\n");
|
||||||
|
|
||||||
|
out.append(" end: ");
|
||||||
|
out.append(YESNO(this->end));
|
||||||
|
out.append("\n");
|
||||||
|
out.append("}");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
|
bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
|
||||||
switch (field_id) {
|
switch (field_id) {
|
||||||
case 6: {
|
case 6: {
|
||||||
|
|
|
@ -165,6 +165,10 @@ enum BluetoothDeviceRequestType : uint32_t {
|
||||||
BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5,
|
BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5,
|
||||||
BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6,
|
BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6,
|
||||||
};
|
};
|
||||||
|
enum VoiceAssistantSubscribeFlag : uint32_t {
|
||||||
|
VOICE_ASSISTANT_SUBSCRIBE_NONE = 0,
|
||||||
|
VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1,
|
||||||
|
};
|
||||||
enum VoiceAssistantRequestFlag : uint32_t {
|
enum VoiceAssistantRequestFlag : uint32_t {
|
||||||
VOICE_ASSISTANT_REQUEST_NONE = 0,
|
VOICE_ASSISTANT_REQUEST_NONE = 0,
|
||||||
VOICE_ASSISTANT_REQUEST_USE_VAD = 1,
|
VOICE_ASSISTANT_REQUEST_USE_VAD = 1,
|
||||||
|
@ -327,7 +331,8 @@ class DeviceInfoResponse : public ProtoMessage {
|
||||||
uint32_t bluetooth_proxy_feature_flags{0};
|
uint32_t bluetooth_proxy_feature_flags{0};
|
||||||
std::string manufacturer{};
|
std::string manufacturer{};
|
||||||
std::string friendly_name{};
|
std::string friendly_name{};
|
||||||
uint32_t voice_assistant_version{0};
|
uint32_t legacy_voice_assistant_version{0};
|
||||||
|
uint32_t voice_assistant_feature_flags{0};
|
||||||
std::string suggested_area{};
|
std::string suggested_area{};
|
||||||
void encode(ProtoWriteBuffer buffer) const override;
|
void encode(ProtoWriteBuffer buffer) const override;
|
||||||
#ifdef HAS_PROTO_MESSAGE_DUMP
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
@ -1674,6 +1679,7 @@ class BluetoothDeviceClearCacheResponse : public ProtoMessage {
|
||||||
class SubscribeVoiceAssistantRequest : public ProtoMessage {
|
class SubscribeVoiceAssistantRequest : public ProtoMessage {
|
||||||
public:
|
public:
|
||||||
bool subscribe{false};
|
bool subscribe{false};
|
||||||
|
uint32_t flags{0};
|
||||||
void encode(ProtoWriteBuffer buffer) const override;
|
void encode(ProtoWriteBuffer buffer) const override;
|
||||||
#ifdef HAS_PROTO_MESSAGE_DUMP
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
void dump_to(std::string &out) const override;
|
void dump_to(std::string &out) const override;
|
||||||
|
@ -1749,6 +1755,19 @@ class VoiceAssistantEventResponse : public ProtoMessage {
|
||||||
bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override;
|
bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override;
|
||||||
bool decode_varint(uint32_t field_id, ProtoVarInt value) override;
|
bool decode_varint(uint32_t field_id, ProtoVarInt value) override;
|
||||||
};
|
};
|
||||||
|
class VoiceAssistantAudio : public ProtoMessage {
|
||||||
|
public:
|
||||||
|
std::string data{};
|
||||||
|
bool end{false};
|
||||||
|
void encode(ProtoWriteBuffer buffer) const override;
|
||||||
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
void dump_to(std::string &out) const override;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
protected:
|
||||||
|
bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override;
|
||||||
|
bool decode_varint(uint32_t field_id, ProtoVarInt value) override;
|
||||||
|
};
|
||||||
class ListEntitiesAlarmControlPanelResponse : public ProtoMessage {
|
class ListEntitiesAlarmControlPanelResponse : public ProtoMessage {
|
||||||
public:
|
public:
|
||||||
std::string object_id{};
|
std::string object_id{};
|
||||||
|
|
|
@ -476,6 +476,14 @@ bool APIServerConnectionBase::send_voice_assistant_request(const VoiceAssistantR
|
||||||
#endif
|
#endif
|
||||||
#ifdef USE_VOICE_ASSISTANT
|
#ifdef USE_VOICE_ASSISTANT
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef USE_VOICE_ASSISTANT
|
||||||
|
bool APIServerConnectionBase::send_voice_assistant_audio(const VoiceAssistantAudio &msg) {
|
||||||
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
ESP_LOGVV(TAG, "send_voice_assistant_audio: %s", msg.dump().c_str());
|
||||||
|
#endif
|
||||||
|
return this->send_message_<VoiceAssistantAudio>(msg, 106);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#ifdef USE_ALARM_CONTROL_PANEL
|
#ifdef USE_ALARM_CONTROL_PANEL
|
||||||
bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response(
|
bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response(
|
||||||
const ListEntitiesAlarmControlPanelResponse &msg) {
|
const ListEntitiesAlarmControlPanelResponse &msg) {
|
||||||
|
@ -971,6 +979,17 @@ bool APIServerConnectionBase::read_message(uint32_t msg_size, uint32_t msg_type,
|
||||||
ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str());
|
ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str());
|
||||||
#endif
|
#endif
|
||||||
this->on_date_command_request(msg);
|
this->on_date_command_request(msg);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 106: {
|
||||||
|
#ifdef USE_VOICE_ASSISTANT
|
||||||
|
VoiceAssistantAudio msg;
|
||||||
|
msg.decode(msg_data, msg_size);
|
||||||
|
#ifdef HAS_PROTO_MESSAGE_DUMP
|
||||||
|
ESP_LOGVV(TAG, "on_voice_assistant_audio: %s", msg.dump().c_str());
|
||||||
|
#endif
|
||||||
|
this->on_voice_assistant_audio(msg);
|
||||||
#endif
|
#endif
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -240,6 +240,10 @@ class APIServerConnectionBase : public ProtoService {
|
||||||
#ifdef USE_VOICE_ASSISTANT
|
#ifdef USE_VOICE_ASSISTANT
|
||||||
virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){};
|
virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){};
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef USE_VOICE_ASSISTANT
|
||||||
|
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg);
|
||||||
|
virtual void on_voice_assistant_audio(const VoiceAssistantAudio &value){};
|
||||||
|
#endif
|
||||||
#ifdef USE_ALARM_CONTROL_PANEL
|
#ifdef USE_ALARM_CONTROL_PANEL
|
||||||
bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg);
|
bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -24,28 +24,24 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
|
||||||
|
|
||||||
float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
|
float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
|
||||||
|
|
||||||
void VoiceAssistant::setup() {
|
bool VoiceAssistant::start_udp_socket_() {
|
||||||
ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
|
|
||||||
|
|
||||||
global_voice_assistant = this;
|
|
||||||
|
|
||||||
this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
|
this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
|
||||||
if (socket_ == nullptr) {
|
if (this->socket_ == nullptr) {
|
||||||
ESP_LOGW(TAG, "Could not create socket");
|
ESP_LOGE(TAG, "Could not create socket");
|
||||||
this->mark_failed();
|
this->mark_failed();
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
int enable = 1;
|
int enable = 1;
|
||||||
int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
|
int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
|
ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
|
||||||
// we can still continue
|
// we can still continue
|
||||||
}
|
}
|
||||||
err = socket_->setblocking(false);
|
err = this->socket_->setblocking(false);
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err);
|
ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
|
||||||
this->mark_failed();
|
this->mark_failed();
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
|
@ -54,18 +50,30 @@ void VoiceAssistant::setup() {
|
||||||
|
|
||||||
socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
|
socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
|
||||||
if (sl == 0) {
|
if (sl == 0) {
|
||||||
ESP_LOGW(TAG, "Socket unable to set sockaddr: errno %d", errno);
|
ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
|
||||||
this->mark_failed();
|
this->mark_failed();
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
err = socket_->bind((struct sockaddr *) &server, sizeof(server));
|
err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
ESP_LOGW(TAG, "Socket unable to bind: errno %d", errno);
|
ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
|
||||||
this->mark_failed();
|
this->mark_failed();
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
this->udp_socket_running_ = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VoiceAssistant::setup() {
|
||||||
|
ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
|
||||||
|
|
||||||
|
global_voice_assistant = this;
|
||||||
|
|
||||||
|
#ifdef USE_SPEAKER
|
||||||
|
if (this->speaker_ != nullptr) {
|
||||||
ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||||
this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
|
this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
|
||||||
if (this->speaker_buffer_ == nullptr) {
|
if (this->speaker_buffer_ == nullptr) {
|
||||||
|
@ -238,8 +246,20 @@ void VoiceAssistant::loop() {
|
||||||
size_t available = this->ring_buffer_->available();
|
size_t available = this->ring_buffer_->available();
|
||||||
while (available >= SEND_BUFFER_SIZE) {
|
while (available >= SEND_BUFFER_SIZE) {
|
||||||
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
|
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
|
||||||
|
if (this->audio_mode_ == AUDIO_MODE_API) {
|
||||||
|
api::VoiceAssistantAudio msg;
|
||||||
|
msg.data.assign((char *) this->send_buffer_, read_bytes);
|
||||||
|
this->api_client_->send_voice_assistant_audio(msg);
|
||||||
|
} else {
|
||||||
|
if (!this->udp_socket_running_) {
|
||||||
|
if (!this->start_udp_socket_()) {
|
||||||
|
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
|
this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
|
||||||
sizeof(this->dest_addr_));
|
sizeof(this->dest_addr_));
|
||||||
|
}
|
||||||
available = this->ring_buffer_->available();
|
available = this->ring_buffer_->available();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -268,6 +288,7 @@ void VoiceAssistant::loop() {
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
if (this->speaker_ != nullptr) {
|
if (this->speaker_ != nullptr) {
|
||||||
ssize_t received_len = 0;
|
ssize_t received_len = 0;
|
||||||
|
if (this->audio_mode_ == AUDIO_MODE_UDP) {
|
||||||
if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
|
if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
|
||||||
received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
|
received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
|
||||||
if (received_len > 0) {
|
if (received_len > 0) {
|
||||||
|
@ -278,12 +299,14 @@ void VoiceAssistant::loop() {
|
||||||
} else {
|
} else {
|
||||||
ESP_LOGD(TAG, "Receive buffer full");
|
ESP_LOGD(TAG, "Receive buffer full");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// Build a small buffer of audio before sending to the speaker
|
// Build a small buffer of audio before sending to the speaker
|
||||||
if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4)
|
bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
|
||||||
|
if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
|
||||||
this->write_speaker_();
|
this->write_speaker_();
|
||||||
if (this->wait_for_stream_end_) {
|
if (this->wait_for_stream_end_) {
|
||||||
this->cancel_timeout("playing");
|
this->cancel_timeout("playing");
|
||||||
if (this->stream_ended_ && received_len < 0) {
|
if (end_of_stream) {
|
||||||
ESP_LOGD(TAG, "End of audio stream received");
|
ESP_LOGD(TAG, "End of audio stream received");
|
||||||
this->cancel_timeout("speaker-timeout");
|
this->cancel_timeout("speaker-timeout");
|
||||||
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
|
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
|
||||||
|
@ -428,6 +451,22 @@ void VoiceAssistant::failed_to_start() {
|
||||||
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VoiceAssistant::start_streaming() {
|
||||||
|
if (this->state_ != State::STARTING_PIPELINE) {
|
||||||
|
this->signal_stop_();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ESP_LOGD(TAG, "Client started, streaming microphone");
|
||||||
|
this->audio_mode_ = AUDIO_MODE_API;
|
||||||
|
|
||||||
|
if (this->mic_->is_running()) {
|
||||||
|
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||||
|
} else {
|
||||||
|
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
|
void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
|
||||||
if (this->state_ != State::STARTING_PIPELINE) {
|
if (this->state_ != State::STARTING_PIPELINE) {
|
||||||
this->signal_stop_();
|
this->signal_stop_();
|
||||||
|
@ -435,6 +474,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
|
||||||
}
|
}
|
||||||
|
|
||||||
ESP_LOGD(TAG, "Client started, streaming microphone");
|
ESP_LOGD(TAG, "Client started, streaming microphone");
|
||||||
|
this->audio_mode_ = AUDIO_MODE_UDP;
|
||||||
|
|
||||||
memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
|
memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
|
||||||
if (this->dest_addr_.ss_family == AF_INET) {
|
if (this->dest_addr_.ss_family == AF_INET) {
|
||||||
|
@ -688,6 +728,17 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) {
|
||||||
|
if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
|
||||||
|
memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
|
||||||
|
this->speaker_buffer_index_ += msg.data.length();
|
||||||
|
this->speaker_buffer_size_ += msg.data.length();
|
||||||
|
this->speaker_bytes_received_ += msg.data.length();
|
||||||
|
} else {
|
||||||
|
ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
|
VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
|
|
||||||
} // namespace voice_assistant
|
} // namespace voice_assistant
|
||||||
|
|
|
@ -29,9 +29,14 @@ namespace voice_assistant {
|
||||||
|
|
||||||
// Version 1: Initial version
|
// Version 1: Initial version
|
||||||
// Version 2: Adds raw speaker support
|
// Version 2: Adds raw speaker support
|
||||||
// Version 3: Unused/skip
|
static const uint32_t LEGACY_INITIAL_VERSION = 1;
|
||||||
static const uint32_t INITIAL_VERSION = 1;
|
static const uint32_t LEGACY_SPEAKER_SUPPORT = 2;
|
||||||
static const uint32_t SPEAKER_SUPPORT = 2;
|
|
||||||
|
enum VoiceAssistantFeature : uint32_t {
|
||||||
|
FEATURE_VOICE_ASSISTANT = 1 << 0,
|
||||||
|
FEATURE_SPEAKER = 1 << 1,
|
||||||
|
FEATURE_API_AUDIO = 1 << 2,
|
||||||
|
};
|
||||||
|
|
||||||
enum class State {
|
enum class State {
|
||||||
IDLE,
|
IDLE,
|
||||||
|
@ -49,11 +54,17 @@ enum class State {
|
||||||
RESPONSE_FINISHED,
|
RESPONSE_FINISHED,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum AudioMode : uint8_t {
|
||||||
|
AUDIO_MODE_UDP,
|
||||||
|
AUDIO_MODE_API,
|
||||||
|
};
|
||||||
|
|
||||||
class VoiceAssistant : public Component {
|
class VoiceAssistant : public Component {
|
||||||
public:
|
public:
|
||||||
void setup() override;
|
void setup() override;
|
||||||
void loop() override;
|
void loop() override;
|
||||||
float get_setup_priority() const override;
|
float get_setup_priority() const override;
|
||||||
|
void start_streaming();
|
||||||
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
||||||
void failed_to_start();
|
void failed_to_start();
|
||||||
|
|
||||||
|
@ -71,19 +82,32 @@ class VoiceAssistant : public Component {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
uint32_t get_version() const {
|
uint32_t get_legacy_version() const {
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
if (this->speaker_ != nullptr) {
|
if (this->speaker_ != nullptr) {
|
||||||
return SPEAKER_SUPPORT;
|
return LEGACY_SPEAKER_SUPPORT;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return INITIAL_VERSION;
|
return LEGACY_INITIAL_VERSION;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t get_feature_flags() const {
|
||||||
|
uint32_t flags = 0;
|
||||||
|
flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT;
|
||||||
|
#ifdef USE_SPEAKER
|
||||||
|
if (this->speaker_ != nullptr) {
|
||||||
|
flags |= VoiceAssistantFeature::FEATURE_SPEAKER;
|
||||||
|
flags |= VoiceAssistantFeature::FEATURE_API_AUDIO;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
void request_start(bool continuous, bool silence_detection);
|
void request_start(bool continuous, bool silence_detection);
|
||||||
void request_stop();
|
void request_stop();
|
||||||
|
|
||||||
void on_event(const api::VoiceAssistantEventResponse &msg);
|
void on_event(const api::VoiceAssistantEventResponse &msg);
|
||||||
|
void on_audio(const api::VoiceAssistantAudio &msg);
|
||||||
|
|
||||||
bool is_running() const { return this->state_ != State::IDLE; }
|
bool is_running() const { return this->state_ != State::IDLE; }
|
||||||
void set_continuous(bool continuous) { this->continuous_ = continuous; }
|
void set_continuous(bool continuous) { this->continuous_ = continuous; }
|
||||||
|
@ -201,6 +225,10 @@ class VoiceAssistant : public Component {
|
||||||
|
|
||||||
State state_{State::IDLE};
|
State state_{State::IDLE};
|
||||||
State desired_state_{State::IDLE};
|
State desired_state_{State::IDLE};
|
||||||
|
|
||||||
|
AudioMode audio_mode_{AUDIO_MODE_UDP};
|
||||||
|
bool udp_socket_running_{false};
|
||||||
|
bool start_udp_socket_();
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
|
template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
|
||||||
|
|
Loading…
Reference in a new issue