Published November 21, 2025 © MIT

DIY PC Speaker to AI Voice Assistant

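This project combines embedded systems and AI inference to create an end-to-end conversational assistant. The ESP32 handles real-time audio capture and playback, while a server handles speech recognition, response generation, and speech synthesis.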

Intermediate · Full instructions provided · 5 hours · 17

Things used in this project

Hardware components

Espressif ESP32 Development Board - Developer Edition

INMP441 I2S MEMS Microphone

LM386 Audio Amplifier Module

2inch 8Ohm 12W Midrange Speaker

12x12x12mm Tactile Push Button

TP4056 1A Li-ion Lithium Battery Charging Module with Current Protection (Type-C): https://robu.in/product/tp4056-1a-li-ion-lithium-battery-charging-module-with-current-protection-type-c/

1000mAh Rechargeable 3.7v Lithium Polymer Battery

Software apps and online services

Arduino IDE

Microsoft VS Code

Story

ESP32 Voice Assistant

GitHub repo: https://github.com/arpy8/ESP32_Voice_Assistant
YouTube video: https://www.youtube.com/@arpy8

Step 1: Wiring the circuit

Once you have all the components on hand, start wiring the connections. Refer to the following schematic:

Keep in mind:

The INMP441 must be powered from 3.3 V only
DAC output (GPIO 25) → LM386 → Speaker
TP4056 powers ESP32 via 5V out

Workflow:

It is important to understand the workflow of the whole system:

1. Button press: the ESP32 starts recording via the INMP441 microphone.
2. I2S audio capture: 16 kHz samples are streamed in real time over WebSocket.
3. AI processing (server):

Whisper converts audio → text
Gemini 2.5 Flash generates a contextual reply
Piper converts text → natural speech

4. Response playback: the server streams 8-bit PCM chunks back to the ESP32 for DAC output.
5. The user hears the AI voice: the LM386 amplifier drives the speaker.

Step 2: Flash Code to ESP32

First, make sure you have the WebSocketsClient library installed. For the Arduino IDE, you can simply download the library's zip file and install it through the IDE.

1. ESP32 Sketch

//esp-code.ino
#include <WiFi.h>
#include <driver/i2s.h>
#include <driver/dac.h>
#include <WebSocketsClient.h>
#include "secrets.h"
// I2S microphone wiring. These three are handed to i2s_set_pin() as the
// word-select, data-in and bit-clock pins respectively.
#define I2S_WS 15
#define I2S_SD 32
#define I2S_SCK 14
// I2S peripheral instance used for the microphone driver.
#define I2S_PORT I2S_NUM_0
// Push-to-talk input; setup() configures it with INPUT_PULLDOWN.
#define RECORD_BUTTON 26
// Indicator LED; loop() drives it HIGH while recording.
#define LED_BUILTIN 2
// DAC channel that feeds the amplifier (the article wires GPIO 25 to the LM386).
#define DAC_CHANNEL DAC_CHANNEL_1
// Used to derive the timer alarm period in setupDACOutput().
// NOTE: setupI2SMicrophone() hard-codes its own 16000; keep the two in sync.
#define SAMPLE_RATE 16000
// Not referenced anywhere in the visible sketch.
#define BUFFER_SIZE 4096
// WebSocket connection to the backend server.
WebSocketsClient webSocket;
// True between button press and release; loop() streams mic audio while set.
volatile bool isRecording = false;
// True once a playback stream has started; cleared on disconnect or new recording.
volatile bool isReceivingAudio = false;
// Hardware timer handle, created in setupDACOutput().
hw_timer_t* timer = NULL;
// Not referenced anywhere in the visible sketch.
portMUX_TYPE timerMux = portMUX_INITIALIZER_UNLOCKED;
// Timer interrupt handler attached in setupDACOutput(); intentionally empty.
void IRAM_ATTR onTimer() {} // kept for future use if needed
// Starts the Wi-Fi connection with the credentials from secrets.h and blocks
// until the station reports WL_CONNECTED, printing a dot every 500 ms.
// There is no timeout: with wrong credentials this never returns.
void setupWifi() {
  Serial.print("[ESP] WiFi");
  WiFi.begin(WIFI_SSID, WIFI_PASS);

  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    delay(500);
    Serial.print(".");
  }

  Serial.println(" ✓");
}
void setupI2SMicrophone() {
Serial.print("[ESP] Microphone...");
const i2s_config_t i2s_config = {
.mode = i2s_mode_t(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = 16000,
.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
.communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_STAND_I2S),
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 512,
.use_apll = false,
.tx_desc_auto_clear = false,
.fixed_mclk = 0
};
esp_err_t err = i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);
if (err != ESP_OK) {
Serial.printf("[ESP] Failed to install I2S driver: %d\n", err);
return;
}
const i2s_pin_config_t pin_config = {
.bck_io_num = I2S_SCK,
.ws_io_num = I2S_WS,
.data_out_num = I2S_PIN_NO_CHANGE,
.data_in_num = I2S_SD
};
err = i2s_set_pin(I2S_PORT, &pin_config);
if (err != ESP_OK) {
Serial.printf("[ESP] Failed to set I2S pins: %d\n", err);
return;
}
i2s_zero_dma_buffer(I2S_PORT);
Serial.println("✓");
}
// Enables the DAC channel, parks it at the idle level (128) and starts a
// periodic hardware timer whose interrupt handler, onTimer(), is currently
// empty.
void setupDACOutput() {
  Serial.print("[ESP] Speaker (DAC)...");

  // 128 is the same "silence" level the rest of the sketch writes when idle.
  const uint8_t idleLevel = 128;
  dac_output_enable(DAC_CHANNEL);
  dac_output_voltage(DAC_CHANNEL, idleLevel);

  // NOTE(review): presumably timer 0 with a divider of 80 gives a 1 MHz tick,
  // which would make the alarm value below a period in microseconds — confirm
  // against the Arduino-ESP32 core version in use.
  const uint32_t alarmTicks = 1000000 / SAMPLE_RATE;
  timer = timerBegin(0, 80, true);
  timerAttachInterrupt(timer, &onTimer, true);
  timerAlarmWrite(timer, alarmTicks, true);
  timerAlarmEnable(timer);

  Serial.println(" ✓");
}
void send_audio_chunk() {
const int samples = 1024;
int32_t buffer32[samples];
size_t bytes_read;
esp_err_t result = i2s_read(I2S_PORT, buffer32, samples * sizeof(int32_t),
&bytes_read, portMAX_DELAY);
if (result != ESP_OK) {
Serial.printf("[ESP] I2S read error: %d\n", result);
return;
}
int16_t pcm16[samples];
for (int i = 0; i < samples; i++) {
pcm16[i] = (int16_t)(buffer32[i] >> 16);
}
webSocket.sendBIN((uint8_t*)pcm16, sizeof(pcm16));
}
// Plays a 1 kHz sine burst on the DAC, `times` times in a row, with a 50 ms
// gap after each burst, then returns the DAC to its idle level (128).
//
//   durationMs  length of each burst in milliseconds
//   times       number of bursts
//
// The waveform is generated at 8 kHz, so each sample must be held for
// 1,000,000 / 8000 = 125 us. (The previous fixed 62 us delay emitted the 8 kHz
// samples about twice as fast, raising the pitch and halving the duration.)
// Timing is approximate: the cost of computing and writing each sample is not
// subtracted from the delay.
void playTestTone(int durationMs, int times) {
  Serial.println("[ESP] Playing 1kHz test tone...");

  const int toneSampleRate = 8000;
  const unsigned int samplePeriodUs = 1000000 / toneSampleRate;
  const int samples = (toneSampleRate * durationMs) / 1000;

  for (int burst = 0; burst < times; burst++) {
    for (int n = 0; n < samples; n++) {
      float t = (float)n / 8000.0;
      float sine = sin(2.0 * PI * 1000.0 * t);
      // Amplitude of +/-100 around mid-scale keeps the value inside 0..255.
      uint8_t value = (uint8_t)((sine * 100) + 128);
      dac_output_voltage(DAC_CHANNEL, value);
      delayMicroseconds(samplePeriodUs);
    }
    delay(50);
  }

  dac_output_voltage(DAC_CHANNEL, 128);
}
// WebSocket event callback registered in setup().
//
//   CONNECTED     sends "ping" to the server
//   DISCONNECTED  stops playback state and parks the DAC at idle (128)
//   TEXT          logs the message; "pong" is reported as a successful connect
//   BIN           treats the payload as 8-bit samples and writes them to the
//                 DAC one by one, blocking until the whole frame is played
//   ERROR         logs that an error occurred
//
// A binary frame of 100 bytes or fewer is ignored unless playback has already
// started.
void webSocketEvent(WStype_t type, uint8_t* payload, size_t length) {
  switch (type) {
    case WStype_CONNECTED:
      webSocket.sendTXT("ping");
      break;

    case WStype_DISCONNECTED:
      Serial.println("[WS] Disconnected");
      isReceivingAudio = false;
      dac_output_voltage(DAC_CHANNEL, 128);
      break;

    case WStype_TEXT: {
      const char* message = (const char*)payload;
      if (strcmp(message, "pong") != 0) {
        Serial.print("[WS] ");
        Serial.println(message);
      } else {
        Serial.println("[WS] Connected to server");
      }
      break;
    }

    case WStype_BIN: {
      // Small frames only count once a stream is already in progress.
      if (!isReceivingAudio && length <= 100) {
        break;
      }
      if (!isReceivingAudio) {
        Serial.println("[ESP] Starting playback");
        isReceivingAudio = true;
      }

      // NOTE(review): delayMicroseconds presumably takes an integer count, so
      // 62.5 would be truncated to 62 — confirm against the core in use.
      const uint8_t* const end = payload + length;
      for (const uint8_t* sample = payload; sample != end; ++sample) {
        dac_output_voltage(DAC_CHANNEL, *sample);
        delayMicroseconds(62.5);
      }

      // Progress indicator, rate-limited to one dot per 500 ms.
      static unsigned long lastDot = 0;
      if (millis() - lastDot > 500) {
        Serial.print(".");
        lastDot = millis();
      }
      break;
    }

    case WStype_ERROR:
      Serial.println("[WS] Error occurred");
      break;
  }
}
// One-time initialisation: serial console, GPIO modes, banner, Wi-Fi,
// microphone, DAC, WebSocket client, and a two-burst start-up tone.
void setup() {
  Serial.begin(115200);
  pinMode(LED_BUILTIN, OUTPUT);
  pinMode(RECORD_BUTTON, INPUT_PULLDOWN);

  static const char* const bannerLines[] = {
    "\n\n╔═══════════════════════════════════╗",
    "║ ESP32 Voice Assistant v1.0 ║",
    "╚═══════════════════════════════════╝\n",
  };
  for (const char* line : bannerLines) {
    Serial.println(line);
  }

  // Bring up the network first, then the audio input and output.
  setupWifi();
  setupI2SMicrophone();
  setupDACOutput();

  Serial.print("[ESP] WiFi addr: ");
  Serial.println(WiFi.localIP());

  Serial.print("[ESP] Server: ");
  Serial.print(WS_HOST);
  Serial.print(":");
  Serial.println(WS_PORT);

  // Connect to the backend; retry every 5000 ms if the link drops.
  webSocket.begin(WS_HOST, WS_PORT, WS_PATH);
  webSocket.onEvent(webSocketEvent);
  webSocket.setReconnectInterval(5000);

  // Disabled LED blink, kept from the original sketch:
  // digitalWrite(LED_BUILTIN, HIGH);
  // delay(3000);
  // digitalWrite(LED_BUILTIN, LOW);

  playTestTone(250, 2);
  Serial.println("[ESP] Setup complete! Press button to talk.");
}
// Main loop: services the WebSocket, turns button edges into start/stop
// events, and streams one microphone chunk per pass while recording.
//
//   press    LED on, recording starts, playback state is cleared, the DAC is
//            parked at idle and "pause" is sent to the server
//   release  LED off, recording stops and "stop" is sent to the server
void loop() {
  webSocket.loop();

  static bool previousLevel = LOW;
  const bool currentLevel = digitalRead(RECORD_BUTTON);
  const bool justPressed = (currentLevel == HIGH) && (previousLevel == LOW);
  const bool justReleased = (currentLevel == LOW) && (previousLevel == HIGH);
  previousLevel = currentLevel;

  if (justPressed) {
    digitalWrite(LED_BUILTIN, HIGH);
    Serial.println("[ESP] 🎤 Recording...");
    isRecording = true;
    isReceivingAudio = false;
    dac_output_voltage(DAC_CHANNEL, 128);
    webSocket.sendTXT("pause");
  } else if (justReleased) {
    digitalWrite(LED_BUILTIN, LOW);
    Serial.println("[ESP] ⏹ Stopped. Processing...");
    isRecording = false;
    webSocket.sendTXT("stop");
  }

  // Audio is only streamed while the socket is up.
  if (isRecording && webSocket.isConnected()) {
    send_audio_chunk();
  }

  delay(10);
}

2. Secrets file

You'll also require a secrets.h file:

//secrets.h
// Wi-Fi credentials and WebSocket server address used by the ESP32 sketch.
// Replace the placeholder strings with your own values.
//
// The include guard and the `const` pointers (internal linkage) let this
// header be included from more than one source file without redefinition or
// multiple-definition link errors.
#ifndef SECRETS_H
#define SECRETS_H

#include <stdint.h>

const char* const WIFI_SSID = "YOUR WIFI SSID";
const char* const WIFI_PASS = "YOUR WIFI PASSWORD";
const char* const WS_HOST = "YOUR WS HOST"; // ip addr
const uint16_t WS_PORT = 7860;
const char* const WS_PATH = "/ws";

#endif // SECRETS_H

Step 3: AI Backend Setup (Python)

For this I would recommend hosting the server on an AWS EC2 instance with a static IP address. Alternatively, you can run a local server on your laptop and connect your ESP32 through a hotspot. Think of this backend as the brain of the system, because this is where all the processing happens. You can find the source code under the /server directory. Here's how to set it up:

1) Install all the dependencies:

cd server
pip install uv
uv sync
uv run main.py

or using Docker:

docker build -t esp32-ws-server .
docker run -p 7860:7860 esp32-ws-server

2) Environment variables and models needed

GEMINI_API_KEY
You'll also need a piper voice model. Download it using the following command:

python -m piper.download_voices en_US-libritts_r-medium --data-dir tts_models

You can choose any other model as well, find them here.

And that's all! Your voice assistant should now respond to your queries once you press the push button!

Code

Credits

Arpit Sengar

1 project • 0 followers

Hardware engineer specializing in embedded systems, circuit design, PCB layout, and low-level integration for IoT.

Thanks to Weslei Prudencio.

DIY PC Speaker to AI Voice Assistant

Things used in this project

Hardware components

Software apps and online services

Story

ESP32 Voice Assistant

Workflow:

Step 2: Flash Code to ESP32

Step 3: AI Backend Setup (Python)

Schematics

ESP32 Voice Assistant Schematic

Code

ESP32 Voice Assistant Github Repo

Credits

Arpit Sengar

Comments

Embed the widget on your own site

DIY PC Speaker to AI Voice Assistant

DIY PC Speaker to AI Voice Assistant

Things used in this project

Hardware components

Software apps and online services

Story

ESP32 Voice Assistant

Workflow:

Step 2: Flash Code to ESP32

Step 3: AI Backend Setup (Python)

Schematics

ESP32 Voice Assistant Schematic

Code

ESP32 Voice Assistant Github Repo

Credits

Arpit Sengar

Comments

Related channels and tags