roboattic Lab
Published © CC BY-NC-SA

Desk buddy | Companion robot on wheels & Speech Recognition

In this project I built a desktop companion robot using an ESP32 s3, OLED display, and four gear motors.

IntermediateFull instructions provided8 hours14
Desk buddy | Companion robot on wheels & Speech Recognition

Things used in this project

Hardware components

Seeed Studio XIAO ESP32S3 Sense
Seeed Studio XIAO ESP32S3 Sense
×1

Software apps and online services

Arduino IDE
Arduino IDE
Fusion
Autodesk Fusion
Tinkercad
Autodesk Tinkercad

Story

Read more

Custom parts and enclosures

Car Chassis

Box

Motor Clip

Schematics

Circuit Diagram

Code

Code

C/C++
/*
 * ============================================================
 *  Voice-Controlled Robot with Animated OLED Eyes
 *  Author    : Shahbaz Hashmi Ansari
 *  Copyright : Roboattic Lab — All Rights Reserved
 * ============================================================
 *
 */

#include <WiFi.h>
#include <HTTPClient.h>
#include <ArduinoJson.h>

#include <Wire.h>
#include <Adafruit_GFX.h>
#include <Adafruit_SSD1306.h>
#include <FluxGarage_RoboEyes.h>

#define SCREEN_WIDTH 128
#define SCREEN_HEIGHT 64
#define OLED_RESET -1
Adafruit_SSD1306 display(SCREEN_WIDTH, SCREEN_HEIGHT, &Wire, OLED_RESET);

RoboEyes<Adafruit_SSD1306> roboEyes(display);


// --- WiFi Credentials ---
const char* ssid = "***********";
const char* password = "**********";


// --- Pin Assignments ---
#define LED 21

#define LED_ON()  digitalWrite(LED, LOW)
#define LED_OFF() digitalWrite(LED, HIGH)


// --- L298N Motor Driver Pins (Seeed XIAO ESP32-S3) ---
#define ENA       43
#define MOTOR_IN1 44
#define MOTOR_IN2 7
#define ENB       8
#define MOTOR_IN3 9
#define MOTOR_IN4 1

#define MOTOR_PWM_FREQ 1000
#define MOTOR_PWM_RES  8

int motor_speed = 180;

#define MOTOR_MOVE_DURATION 2000


// --- PSRAM Audio Buffer ---
uint8_t* psram_audio_buffer   = NULL;
size_t   psram_audio_size     = 0;
size_t   psram_audio_capacity = 0;

#define PSRAM_AUDIO_MAX_SIZE (512 * 1024)


// --- Continuous Listening Settings ---
#define RECORD_DURATION_MS 5000


// --- ElevenLabs API ---
const char* elevenlabs_api_key = "API_KEY HERE";
const char* elevenlabs_stt_url = "https://api.elevenlabs.io/v1/speech-to-text";


// --- Function Declarations ---
bool   I2S_Record_Init();
bool   Record_Start(const char* filename);
bool   Record_Available(const char* filename, float* audiolength_sec);
String SpeechToText_ElevenLabs(const char* filename);
void   motor_init();
void   motor_forward();
void   motor_backward();
void   motor_left();
void   motor_right();
void   motor_stop();
void   process_voice_command(String transcription);



// ==============================================================
//                     Motor Control
// ==============================================================

void motor_init() {
  pinMode(MOTOR_IN1, OUTPUT);
  pinMode(MOTOR_IN2, OUTPUT);
  pinMode(MOTOR_IN3, OUTPUT);
  pinMode(MOTOR_IN4, OUTPUT);
  pinMode(ENA, OUTPUT);
  pinMode(ENB, OUTPUT);
  digitalWrite(MOTOR_IN1, LOW);
  digitalWrite(MOTOR_IN2, LOW);
  digitalWrite(MOTOR_IN3, LOW);
  digitalWrite(MOTOR_IN4, LOW);
  analogWrite(ENA, 0);
  analogWrite(ENB, 0);

  Serial.println("> Motors initialized and STOPPED. Default speed: " + String(motor_speed));
}

void motor_forward() {
  Serial.println("> MOTOR: Moving FORWARD (speed=" + String(motor_speed) + ")");

  roboEyes.setMood(HAPPY);
  roboEyes.setPosition(DEFAULT);

  digitalWrite(MOTOR_IN1, HIGH);
  digitalWrite(MOTOR_IN2, LOW);
  digitalWrite(MOTOR_IN3, HIGH);
  digitalWrite(MOTOR_IN4, LOW);
  analogWrite(ENA, motor_speed);
  analogWrite(ENB, motor_speed);

  uint32_t move_start = millis();
  while (millis() - move_start < MOTOR_MOVE_DURATION) {
    roboEyes.update();
    delay(5);
  }

  motor_stop();
}

void motor_backward() {
  Serial.println("> MOTOR: Moving BACKWARD (speed=" + String(motor_speed) + ")");

  roboEyes.setMood(TIRED);
  roboEyes.setPosition(DEFAULT);

  digitalWrite(MOTOR_IN1, LOW);
  digitalWrite(MOTOR_IN2, HIGH);
  digitalWrite(MOTOR_IN3, LOW);
  digitalWrite(MOTOR_IN4, HIGH);
  analogWrite(ENA, motor_speed);
  analogWrite(ENB, motor_speed);

  uint32_t move_start = millis();
  while (millis() - move_start < MOTOR_MOVE_DURATION) {
    roboEyes.update();
    delay(5);
  }

  motor_stop();
}

void motor_left() {
  Serial.println("> MOTOR: Turning LEFT (speed=" + String(motor_speed) + ")");

  roboEyes.setMood(ANGRY);
  roboEyes.setPosition(W);

  digitalWrite(MOTOR_IN1, HIGH);
  digitalWrite(MOTOR_IN2, LOW);
  digitalWrite(MOTOR_IN3, LOW);
  digitalWrite(MOTOR_IN4, HIGH);
  analogWrite(ENA, motor_speed);
  analogWrite(ENB, motor_speed);

  uint32_t move_start = millis();
  while (millis() - move_start < MOTOR_MOVE_DURATION) {
    roboEyes.update();
    delay(5);
  }

  motor_stop();
}

void motor_right() {
  Serial.println("> MOTOR: Turning RIGHT (speed=" + String(motor_speed) + ")");

  roboEyes.setMood(ANGRY);
  roboEyes.setPosition(E);

  digitalWrite(MOTOR_IN1, LOW);
  digitalWrite(MOTOR_IN2, HIGH);
  digitalWrite(MOTOR_IN3, HIGH);
  digitalWrite(MOTOR_IN4, LOW);
  analogWrite(ENA, motor_speed);
  analogWrite(ENB, motor_speed);

  uint32_t move_start = millis();
  while (millis() - move_start < MOTOR_MOVE_DURATION) {
    roboEyes.update();
    delay(5);
  }

  motor_stop();
}

void motor_stop() {
  digitalWrite(MOTOR_IN1, LOW);
  digitalWrite(MOTOR_IN2, LOW);
  digitalWrite(MOTOR_IN3, LOW);
  digitalWrite(MOTOR_IN4, LOW);
  analogWrite(ENA, 0);
  analogWrite(ENB, 0);

  roboEyes.setMood(DEFAULT);
  roboEyes.setPosition(DEFAULT);
  roboEyes.update();
}


// --- Strip punctuation and convert to lowercase for reliable command matching ---
String clean_text(String text) {
  String cleaned = "";
  for (int i = 0; i < (int)text.length(); i++) {
    char c = text.charAt(i);
    if (isAlpha(c) || isDigit(c) || c == ' ') {
      cleaned += (char)tolower(c);
    }
  }
  String result = "";
  bool last_was_space = false;
  for (int i = 0; i < (int)cleaned.length(); i++) {
    if (cleaned.charAt(i) == ' ') {
      if (!last_was_space && result.length() > 0) {
        result += ' ';
        last_was_space = true;
      }
    } else {
      result += cleaned.charAt(i);
      last_was_space = false;
    }
  }
  result.trim();
  return result;
}

// --- Extract a number from text — supports digits and English words ---
int extract_number(String text) {
  String num_str = "";
  bool found_digit = false;
  for (int i = 0; i < (int)text.length(); i++) {
    if (isDigit(text.charAt(i))) {
      num_str += text.charAt(i);
      found_digit = true;
    } else if (found_digit) {
      break;
    }
  }
  if (num_str.length() > 0) {
    return num_str.toInt();
  }

  int current = 0;
  bool found_word_num = false;

  int start = 0;
  while (start < text.length()) {
    int space_idx = text.indexOf(' ', start);
    if (space_idx == -1) space_idx = text.length();
    String word = text.substring(start, space_idx);

    int val = -1;
    if (word == "zero") val = 0;
    else if (word == "one") val = 1;
    else if (word == "two" || word == "to" || word == "too") val = 2;
    else if (word == "three") val = 3;
    else if (word == "four" || word == "for") val = 4;
    else if (word == "five") val = 5;
    else if (word == "six") val = 6;
    else if (word == "seven") val = 7;
    else if (word == "eight") val = 8;
    else if (word == "nine") val = 9;
    else if (word == "ten") val = 10;
    else if (word == "eleven") val = 11;
    else if (word == "twelve") val = 12;
    else if (word == "thirteen") val = 13;
    else if (word == "fourteen") val = 14;
    else if (word == "fifteen") val = 15;
    else if (word == "sixteen") val = 16;
    else if (word == "seventeen") val = 17;
    else if (word == "eighteen") val = 18;
    else if (word == "nineteen") val = 19;
    else if (word == "twenty") val = 20;
    else if (word == "thirty") val = 30;
    else if (word == "forty" || word == "fourty") val = 40;
    else if (word == "fifty") val = 50;
    else if (word == "sixty") val = 60;
    else if (word == "seventy") val = 70;
    else if (word == "eighty") val = 80;
    else if (word == "ninety") val = 90;
    else if (word == "hundred") val = 100;

    if (val != -1) {
      found_word_num = true;
      if (val == 100) {
        if (current == 0) current = 1;
        current *= 100;
      } else {
        current += val;
      }
    } else {
      if (found_word_num) break;
    }
    start = space_idx + 1;
  }

  if (found_word_num) {
    return current;
  }

  return -1;
}

// --- Parse transcription and execute the appropriate motor command ---
void process_voice_command(String transcription) {
  if (transcription.length() == 0) return;

  String cmd = clean_text(transcription);
  Serial.println("> Command (cleaned): [" + cmd + "]");

  if (cmd.indexOf("forward") >= 0) {
    motor_forward();
    return;
  }
  if (cmd.indexOf("backward") >= 0 || cmd.indexOf("back") >= 0 || cmd.indexOf("reverse") >= 0) {
    motor_backward();
    return;
  }
  if (cmd.indexOf("left") >= 0) {
    motor_left();
    return;
  }
  if (cmd.indexOf("right") >= 0) {
    motor_right();
    return;
  }
  if (cmd.indexOf("stop") >= 0) {
    motor_stop();
    Serial.println("> MOTOR: STOPPED");
    return;
  }

  if (cmd.indexOf("speed") >= 0) {
    roboEyes.setMood(HAPPY);
    roboEyes.setPosition(DEFAULT);

    uint32_t move_start = millis();
    while (millis() - move_start < 1000) {
      roboEyes.update();
      delay(5);
    }

    int new_speed = extract_number(cmd);
    if (new_speed >= 0 && new_speed <= 255) {
      motor_speed = new_speed;
      Serial.println("> MOTOR: Speed set to " + String(motor_speed));
    } else if (new_speed > 255) {
      motor_speed = 255;
      Serial.println("> MOTOR: Speed clamped to max 255");
    } else {
      Serial.println("> MOTOR: Could not parse speed value from command");
    }
    return;
  }

  Serial.println("> No motor command detected.");
}



// ==============================================================
//                     I2S Audio Recording
// ==============================================================

#include "driver/i2s_std.h"

#ifndef DEBUG
#define DEBUG true
#define DebugPrint(x)   ; if (DEBUG) { Serial.print(x); }
#define DebugPrintln(x) ; if (DEBUG) { Serial.println(x); }
#endif

// --- I2S Pin Assignments ---
#define I2S_WS  2
#define I2S_SD  4
#define I2S_SCK 3

#define SAMPLE_RATE      16000
#define BITS_PER_SAMPLE  16
#define GAIN_BOOSTER_I2S 4

// Set to true if L/R pin is tied to 3.3V, false if tied to GND
#define MIC_CHANNEL_RIGHT true

i2s_chan_handle_t rx_handle;

struct WAV_HEADER {
  char  riff[4]        = { 'R', 'I', 'F', 'F' };
  long  flength        = 0;
  char  wave[4]        = { 'W', 'A', 'V', 'E' };
  char  fmt[4]         = { 'f', 'm', 't', ' ' };
  long  chunk_size     = 16;
  short format_tag     = 1;
  short num_chans      = 1;
  long  srate          = SAMPLE_RATE;
  long  bytes_per_sec  = SAMPLE_RATE * (BITS_PER_SAMPLE / 8);
  short bytes_per_samp = (BITS_PER_SAMPLE / 8);
  short bits_per_samp  = BITS_PER_SAMPLE;
  char  dat[4]         = { 'd', 'a', 't', 'a' };
  long  dlength        = 0;
} myWAV_Header;

bool flg_is_recording     = false;
bool flg_I2S_initialized  = false;


bool I2S_Record_Init() {
  i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
  i2s_new_channel(&chan_cfg, NULL, &rx_handle);

  i2s_std_config_t std_cfg = {
    .clk_cfg  = I2S_STD_CLK_DEFAULT_CONFIG(SAMPLE_RATE),
    .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_STEREO),
    .gpio_cfg = {
      .mclk  = I2S_GPIO_UNUSED,
      .bclk  = (gpio_num_t)I2S_SCK,
      .ws    = (gpio_num_t)I2S_WS,
      .dout  = I2S_GPIO_UNUSED,
      .din   = (gpio_num_t)I2S_SD,
      .invert_flags = {
        .mclk_inv = false,
        .bclk_inv = false,
        .ws_inv   = false,
      },
    },
  };

  i2s_channel_init_std_mode(rx_handle, &std_cfg);
  i2s_channel_enable(rx_handle);

  flg_I2S_initialized = true;

  // Flush initial garbage samples
  int16_t dummy_buf[1024];
  size_t  dummy_read = 0;
  for (int i = 0; i < 10; i++) {
    i2s_channel_read(rx_handle, dummy_buf, sizeof(dummy_buf), &dummy_read, portMAX_DELAY);
  }

  // Microphone diagnostic test
  int16_t test_buf[1024];
  size_t  test_read = 0;
  i2s_channel_read(rx_handle, test_buf, sizeof(test_buf), &test_read, portMAX_DELAY);

  int total_stereo_samples = test_read / 2;
  int frames               = total_stereo_samples / 2;

  int   left_nonzero = 0;
  int16_t left_max   = 0;
  int   left_clipped = 0;
  for (int i = 0; i < frames; i++) {
    int16_t s = test_buf[i * 2];
    if (s != 0) left_nonzero++;
    if (abs(s) > abs(left_max)) left_max = s;
    if (s == 32767 || s == -32768) left_clipped++;
  }

  int   right_nonzero = 0;
  int16_t right_max   = 0;
  int   right_clipped = 0;
  for (int i = 0; i < frames; i++) {
    int16_t s = test_buf[i * 2 + 1];
    if (s != 0) right_nonzero++;
    if (abs(s) > abs(right_max)) right_max = s;
    if (s == 32767 || s == -32768) right_clipped++;
  }

  Serial.println("> I2S initialized and flushed.");
  Serial.println("> MIC TEST (" + String(frames) + " stereo frames):");
  Serial.println(">   LEFT  ch: " + String(left_nonzero) + " non-zero, max=" + String(left_max) + ", clipped=" + String(left_clipped));
  Serial.println(">   RIGHT ch: " + String(right_nonzero) + " non-zero, max=" + String(right_max) + ", clipped=" + String(right_clipped));
  Serial.println(">   Using: " + String(MIC_CHANNEL_RIGHT ? "RIGHT" : "LEFT") + " channel");

  Serial.print(">   First 8 stereo frames (L,R): ");
  for (int i = 0; i < 8 && i < frames; i++) {
    Serial.print("[" + String(test_buf[i * 2]) + "," + String(test_buf[i * 2 + 1]) + "] ");
  }
  Serial.println();

  return flg_I2S_initialized;
}


bool Record_Start(const char* audio_filename) {
  if (!flg_I2S_initialized) {
    Serial.println("ERROR in Record_Start() - I2S not initialized, call 'I2S_Record_Init()' missed");
    return false;
  }

  if (!flg_is_recording) {
    flg_is_recording  = true;
    psram_audio_size  = 0;

    memcpy(psram_audio_buffer, (uint8_t*)&myWAV_Header, 44);
    psram_audio_size = 44;

    DebugPrintln("\n> WAV Header generated in PSRAM, Audio Recording started ... ");
    return true;
  }

  if (flg_is_recording) {
    int16_t stereo_buffer[2048];
    size_t  bytes_read = 0;
    i2s_channel_read(rx_handle, stereo_buffer, sizeof(stereo_buffer), &bytes_read, portMAX_DELAY);

    int total_samples  = bytes_read / 2;
    int stereo_frames  = total_samples / 2;

    int16_t mono_buffer[1024];
    for (int i = 0; i < stereo_frames && i < 1024; i++) {
      if (MIC_CHANNEL_RIGHT)
        mono_buffer[i] = stereo_buffer[i * 2 + 1];
      else
        mono_buffer[i] = stereo_buffer[i * 2];
    }

    if (GAIN_BOOSTER_I2S > 1 && GAIN_BOOSTER_I2S <= 64) {
      for (int i = 0; i < stereo_frames && i < 1024; ++i) {
        int32_t amplified = (int32_t)mono_buffer[i] * GAIN_BOOSTER_I2S;
        if (amplified > 32767)  amplified = 32767;
        if (amplified < -32768) amplified = -32768;
        mono_buffer[i] = (int16_t)amplified;
      }
    }

    size_t write_bytes = stereo_frames * 2;
    if (write_bytes > 2048) write_bytes = 2048;

    if ((psram_audio_size + write_bytes) <= psram_audio_capacity) {
      memcpy(psram_audio_buffer + psram_audio_size, (uint8_t*)mono_buffer, write_bytes);
      psram_audio_size += write_bytes;
      return true;
    } else {
      Serial.println("ERROR in Record_Start() - PSRAM buffer full!");
      return false;
    }
  }
  return false;
}


bool Record_Available(const char* audio_filename, float* audiolength_sec) {
  if (!flg_is_recording)    return false;
  if (!flg_I2S_initialized) return false;

  if (flg_is_recording) {
    long filesize          = psram_audio_size;
    myWAV_Header.flength   = filesize;
    myWAV_Header.dlength   = (filesize - 44);
    memcpy(psram_audio_buffer, (uint8_t*)&myWAV_Header, 44);

    flg_is_recording = false;

    *audiolength_sec = (float)(filesize - 44) / (SAMPLE_RATE * BITS_PER_SAMPLE / 8);

    DebugPrintln("> ... Done. Audio Recording finished.");
    DebugPrint("> AUDIO in PSRAM, size [bytes]: " + (String)filesize);
    DebugPrintln(", length [sec]: " + (String)*audiolength_sec);

    if (filesize > 76) {
      DebugPrint("> First 16 mono samples (16bit signed): ");
      int16_t* samples = (int16_t*)(psram_audio_buffer + 44);
      for (int i = 0; i < 16; i++) {
        DebugPrint((String)samples[i] + " ");
      }
      DebugPrintln("");
    }

    return true;
  }
  return false;
}



// ==============================================================
//                  ElevenLabs Speech-to-Text
// ==============================================================

String SpeechToText_ElevenLabs(const char* audio_filename) {
  uint32_t t_start = millis();

  if (WiFi.status() != WL_CONNECTED) {
    Serial.println("ERROR - WiFi not connected, cannot send to ElevenLabs STT");
    return ("");
  }

  size_t audio_size = psram_audio_size;
  if (audio_size == 0) {
    Serial.println("ERROR - No audio data in PSRAM");
    return ("");
  }
  if (audio_size > 500000) {
    Serial.println("ERROR - Audio data too large for STT request (>500KB)");
    return ("");
  }
  DebugPrintln("> Audio data in PSRAM, size: " + (String)audio_size);

  uint32_t t_data_ready = millis();

  HTTPClient http;
  if (!http.begin(elevenlabs_stt_url)) {
    Serial.println("ERROR - Failed to initialize HTTP connection to ElevenLabs");
    return ("");
  }

  http.setTimeout(30000);
  http.setConnectTimeout(10000);
  http.addHeader("xi-api-key", elevenlabs_api_key);

  String boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW";
  http.addHeader("Content-Type", "multipart/form-data; boundary=" + boundary);

  String body_start = "--" + boundary + "\r\n";
  body_start += "Content-Disposition: form-data; name=\"model_id\"\r\n\r\n";
  body_start += "scribe_v2\r\n";
  body_start += "--" + boundary + "\r\n";
  body_start += "Content-Disposition: form-data; name=\"language_code\"\r\n\r\n";
  body_start += "eng\r\n";
  body_start += "--" + boundary + "\r\n";
  body_start += "Content-Disposition: form-data; name=\"file\"; filename=\"audio.wav\"\r\n";
  body_start += "Content-Type: audio/wav\r\n\r\n";

  String body_end = "\r\n--" + boundary + "--\r\n";

  size_t total_size = body_start.length() + audio_size + body_end.length();

  uint8_t* complete_body = (uint8_t*)malloc(total_size);
  if (!complete_body) {
    Serial.println("ERROR - Failed to allocate memory for HTTP body!");
    http.end();
    return ("");
  }

  memcpy(complete_body, body_start.c_str(), body_start.length());
  memcpy(complete_body + body_start.length(), psram_audio_buffer, audio_size);
  memcpy(complete_body + body_start.length() + audio_size, body_end.c_str(), body_end.length());

  uint32_t t_request_prepared = millis();

  DebugPrintln("> POST Request to ElevenLabs STT, sending " + String(total_size) + " bytes ...");

  uint32_t t_request_sent      = millis();
  int httpResponseCode          = http.POST(complete_body, total_size);
  uint32_t t_response_received = millis();

  free(complete_body);

  String transcription = "";
  String response      = http.getString();

  uint32_t t_response_parsed = millis();

  if (httpResponseCode == 200) {
    DebugPrintln("> HTTP 200 OK");
    DynamicJsonDocument doc(2048);
    if (deserializeJson(doc, response) == DeserializationError::Ok) {
      if (doc.containsKey("text")) {
        transcription = doc["text"].as<String>();
      }
    } else {
      Serial.println("ERROR - Failed to parse ElevenLabs JSON response");
    }
  } else {
    Serial.printf("ERROR - HTTP Response Code: %d\n", httpResponseCode);
    Serial.println("Response: " + response);
  }

  http.end();

  DebugPrintln("---------------------------------------------------");
  DebugPrintln("-> Audio data size [bytes]: " + (String)audio_size);
  DebugPrintln("-> Latency Data Ready [t_data_ready]:          " + (String)((float)((t_data_ready - t_start)) / 1000) + " sec");
  DebugPrintln("-> Latency Request Preparation:                " + (String)((float)((t_request_prepared - t_data_ready)) / 1000) + " sec");
  DebugPrintln("-> Latency ElevenLabs STT Response:            " + (String)((float)((t_response_received - t_request_sent)) / 1000) + " sec");
  DebugPrintln("-> Latency Response Parsing:                   " + (String)((float)((t_response_parsed - t_response_received)) / 1000) + " sec");
  DebugPrintln("=> TOTAL Duration [sec]: ..................... " + (String)((float)((t_response_parsed - t_start)) / 1000));
  DebugPrintln("=> Server response length [bytes]: " + (String)response.length());
  DebugPrintln("=> Transcription: [" + transcription + "]");
  DebugPrintln("---------------------------------------------------\n");

  return transcription;
}



// ==============================================================
//                        Setup & Loop
// ==============================================================

void setup() {
  motor_init();
  Serial.begin(115200);
  Serial.setTimeout(100);

  pinMode(LED, OUTPUT);
  LED_OFF();

  delay(500);
  Wire.setPins(5, 6);
  Wire.begin();

  bool display_found = false;
  if (display.begin(SSD1306_SWITCHCAPVCC, 0x3C)) {
    display_found = true;
  } else if (display.begin(SSD1306_SWITCHCAPVCC, 0x3D)) {
    display_found = true;
  }

  if (!display_found) {
    Serial.println(F("\nCRITICAL ERROR: OLED Display Not Found!"));
    Serial.println(F("Check I2C wiring: SDA to D4(GPIO5) & SCL to D5(GPIO6)"));
    while (true) { delay(100); }
  }

  display.clearDisplay();
  display.display();

  roboEyes.begin(SCREEN_WIDTH, SCREEN_HEIGHT, 100);
  roboEyes.setAutoblinker(true, 3, 2);
  roboEyes.setIdleMode(true, 2, 2);
  roboEyes.setMood(DEFAULT);
  roboEyes.setPosition(DEFAULT);

  Serial.println(VERSION);

  if (psramFound()) {
    psram_audio_buffer = (uint8_t*)ps_malloc(PSRAM_AUDIO_MAX_SIZE);
    if (psram_audio_buffer) {
      psram_audio_capacity = PSRAM_AUDIO_MAX_SIZE;
      Serial.println("PSRAM initialized. Allocated " + String(PSRAM_AUDIO_MAX_SIZE) + " bytes for audio buffer.");
    } else {
      Serial.println("ERROR - PSRAM allocation failed!");
      return;
    }
  } else {
    Serial.println("ERROR - No PSRAM found! XIAO ESP32-S3 PSRAM required.");
    return;
  }

  WiFi.mode(WIFI_STA);
  WiFi.begin(ssid, password);
  Serial.print("Connecting WLAN ");
  while (WiFi.status() != WL_CONNECTED) {
    Serial.print(".");
    roboEyes.update();
    delay(500);
  }
  Serial.println(". Done, device connected.");
  LED_OFF();

  I2S_Record_Init();

  Serial.println("> Continuous listening mode: recording " + String(RECORD_DURATION_MS / 1000) + " sec chunks, transcribing each ...");
  Serial.println("> Voice commands: 'go forward/backward/left/right/stop'");
  Serial.println("> Speed command:  'make speed 150' (0-255)");
}


void loop() {

  roboEyes.setMood(DEFAULT);
  roboEyes.setPosition(N);
  roboEyes.update();

  // --- Step 1: Record audio for configured duration ---
  LED_ON();
  motor_stop();

  uint32_t record_start = millis();

  Record_Start("PSRAM_AUDIO");

  while (millis() - record_start < RECORD_DURATION_MS) {
    Record_Start("PSRAM_AUDIO");
    roboEyes.update();

    display.setTextSize(1);
    display.setTextColor(SSD1306_WHITE);
    int16_t  x1, y1;
    uint16_t tw, th;
    display.getTextBounds("Listening...", 0, 0, &x1, &y1, &tw, &th);
    int textX = (SCREEN_WIDTH - tw) / 2;
    int textY = SCREEN_HEIGHT - th - 2;
    display.fillRect(textX - 2, textY - 1, tw + 4, th + 2, SSD1306_BLACK);
    display.setCursor(textX, textY);
    display.print("Listening...");
    display.display();

    delay(5);
  }

  float recorded_seconds = 0;
  Record_Available("PSRAM_AUDIO", &recorded_seconds);

  LED_OFF();

  // --- Step 2: Transcribe audio via ElevenLabs ---
  if (recorded_seconds > 0.4) {
    roboEyes.setMood(TIRED);
    roboEyes.setPosition(DEFAULT);
    roboEyes.update();

    String transcription = SpeechToText_ElevenLabs("PSRAM_AUDIO");

    if (transcription.length() > 0) {
      Serial.println(">> " + transcription);

      // --- Step 3: Execute the detected voice command ---
      process_voice_command(transcription);

    } else {
      Serial.println("> (silence or no speech detected)");
    }
  }

  // --- Step 4: Begin next recording cycle ---
}

Credits

roboattic Lab
20 projects • 14 followers
YouTube Content Creator Robotics Enthusiast

Comments