This project transforms a basic ESP32-CAM into a powerful AI vision assistant using OpenAI's GPT-4o. When a button is pressed, it captures an image, encodes it in Base64, sends it to ChatGPT-4o with the prompt "Summarize the context of this image", and displays the response on a compact OLED screen.
A lightweight, yet impressive example of combining edge hardware with AI services to make embedded devices smarter!
Features
Image capture using ESP32-CAM
Image encoding to Base64
Upload to OpenAI GPT-4o
Summary text generation with a fixed prompt
OLED display of AI response
Simple button-triggered interaction
AI Camera Mini in action:
Hardware Used
ESP32-CAM module
0.96" I2C OLED Display (SSD1306)
Push button (for image capture trigger)
Breadboard, jumper wires, and micro USB breakout
How It Works
Button Press: A hardware push button triggers the image capture process.
Image Capture + Base64: The ESP32-CAM captures a JPEG image and converts it into a Base64-encoded string.
API Request to OpenAI: The Base64 image is sent to the OpenAI GPT-4o Vision endpoint with the prompt: "Summarize the context of this image"
Text Response Handling: The summary generated by ChatGPT is extracted from the API response.
OLED Display: The OLED display shows the result in a readable format.
Use Case
This compact AI vision system can be used to:
Understand scenes or objects with minimal hardware
Assist visually impaired users with context description
Make smart assistants and IoT vision nodes more intelligent
Enhance edge devices with AI-driven image understanding
/*
  Tested with:
  - Arduino IDE version 2.3.2
  - ESP32 boards package version 3.0.0
  - Adafruit GFX library version 1.11.11
  - Adafruit SSD1306 library version 2.5.13
  - ArduinoJson library version 7.1.0
  - Base64 library (default version with ESP32 boards package)
*/
#include <WiFi.h>
#include <HTTPClient.h>
#include <Base64.h>
#include "esp_camera.h"
#include <Adafruit_GFX.h>
#include <Wire.h>
#include <Adafruit_SSD1306.h>
#include <ArduinoJson.h>
#include <vector>

// WiFi credentials
const char* ssid = "SSID";
const char* password = "PASSWORD";

// OpenAI API key
const String apiKey = "******";

#define SCREEN_WIDTH 128
#define SCREEN_HEIGHT 64
#define OLED_SCL 14
#define OLED_SDA 15
#define OLED_RESET -1
Adafruit_SSD1306 display(SCREEN_WIDTH, SCREEN_HEIGHT, &Wire, OLED_RESET);

// Pin definitions for ESP32-CAM AI-Thinker module
#define PWDN_GPIO_NUM 32
#define RESET_GPIO_NUM -1
#define XCLK_GPIO_NUM 0
#define SIOD_GPIO_NUM 26
#define SIOC_GPIO_NUM 27
#define Y9_GPIO_NUM 35
#define Y8_GPIO_NUM 34
#define Y7_GPIO_NUM 39
#define Y6_GPIO_NUM 36
#define Y5_GPIO_NUM 21
#define Y4_GPIO_NUM 19
#define Y3_GPIO_NUM 18
#define Y2_GPIO_NUM 5
#define VSYNC_GPIO_NUM 25
#define HREF_GPIO_NUM 23
#define PCLK_GPIO_NUM 22

#define BUTTON_PIN 13
#define BUZZER_PIN 2  // Buzzer connected to GPIO2

// Set to true only after esp_camera_init() succeeds, so that a button press
// never touches a camera driver that failed to start.
static bool cameraReady = false;

// Forward declarations for functions that are used before they are defined.
String encodeImageToBase64(const uint8_t* imageData, size_t imageSize);
void captureAndAnalyzeImage();
void AnalyzeImage(const String& base64Image);
bool sendPostRequest(const String& payload, String& result);
void beep();

/**
 * Word-wraps `text` into lines of at most `maxChars` characters.
 *
 * Words are separated by spaces; a '\n' in the input ends the current line.
 * The space that joins two words is counted against the limit, a word longer
 * than `maxChars` is split into `maxChars`-sized pieces, and empty lines are
 * never produced.
 *
 * @param text      Text to wrap.
 * @param maxChars  Maximum characters per line (values below 1 are treated as 1).
 * @return The wrapped lines, in order.
 */
static std::vector<String> wrapText(const String& text, size_t maxChars) {
  if (maxChars == 0) {
    maxChars = 1;  // avoid an endless loop when splitting long words
  }

  std::vector<String> lines;
  String line;
  String word;
  const size_t textLength = text.length();

  // Iterate one position past the end and treat it as a newline so the last
  // word and the last line are flushed by the same code path.
  for (size_t i = 0; i <= textLength; i++) {
    const char c = (i < textLength) ? text.charAt(i) : '\n';

    if (c != ' ' && c != '\n') {
      word += c;
      continue;
    }

    if (!word.isEmpty()) {
      // Hard-split a word that can never fit on one line.
      while (word.length() > maxChars) {
        if (!line.isEmpty()) {
          lines.push_back(line);
          line = "";
        }
        lines.push_back(word.substring(0, maxChars));
        word = word.substring(maxChars);
      }

      const size_t separator = line.isEmpty() ? 0 : 1;
      if (line.length() + separator + word.length() > maxChars) {
        // `line` cannot be empty here because `word` alone fits.
        lines.push_back(line);
        line = word;
      } else {
        if (separator) {
          line += ' ';
        }
        line += word;
      }
      word = "";
    }

    if (c == '\n' && !line.isEmpty()) {
      lines.push_back(line);
      line = "";
    }
  }

  return lines;
}

/**
 * Clears the OLED and shows `text` word-wrapped, with every line centered
 * horizontally and the whole block centered vertically.
 *
 * The number of characters per line is derived from the glyph width reported
 * by getTextBounds() for the current text size, so larger text sizes wrap
 * earlier instead of running off the screen.
 *
 * @param text      Message to show.
 * @param textSize  Adafruit GFX text size multiplier (values below 1 are treated as 1).
 */
void displayCenteredText(const String& text, int textSize = 1) {
  if (textSize < 1) {
    textSize = 1;
  }

  display.clearDisplay();
  display.setTextSize(textSize);
  display.setTextColor(SSD1306_WHITE);

  // Measure one glyph to get the character cell size at this text size.
  int16_t x1, y1;
  uint16_t charWidth, charHeight;
  display.getTextBounds("A", 0, 0, &x1, &y1, &charWidth, &charHeight);
  if (charWidth == 0) {
    charWidth = 1;  // defensive: never divide by zero below
  }

  const int lineHeight = charHeight + 2;
  const size_t maxLineLength = SCREEN_WIDTH / charWidth;
  const std::vector<String> lines = wrapText(text, maxLineLength);

  // Vertically center the block; if it is taller than the screen, start at the top.
  const int totalTextHeight = (int)lines.size() * lineHeight;
  int yPos = (SCREEN_HEIGHT - totalTextHeight) / 2;
  if (yPos < 0) {
    yPos = 0;
  }

  for (const String& line : lines) {
    // Signed arithmetic on purpose: an unsigned subtraction would wrap around
    // for a line wider than the screen.
    int xPos = (SCREEN_WIDTH - (int)line.length() * (int)charWidth) / 2;
    if (xPos < 0) {
      xPos = 0;
    }
    display.setCursor(xPos, yPos);
    display.print(line);
    yPos += lineHeight;
  }

  display.display();
}

/**
 * Encodes a raw image buffer as a Base64 string.
 *
 * @param imageData  Pointer to the image bytes.
 * @param imageSize  Number of bytes at `imageData`.
 * @return The Base64 text produced by base64::encode().
 */
String encodeImageToBase64(const uint8_t* imageData, size_t imageSize) {
  return base64::encode(imageData, imageSize);
}

/**
 * One-time initialisation: serial port, WiFi, GPIO, OLED and camera.
 *
 * Blocks until WiFi is connected. If the OLED cannot be started the sketch
 * halts; if the camera cannot be started an error is shown and `cameraReady`
 * stays false.
 */
void setup() {
  Serial.begin(115200);
  WiFi.begin(ssid, password);

  pinMode(BUTTON_PIN, INPUT_PULLUP);
  pinMode(BUZZER_PIN, OUTPUT);  // Set Buzzer pin as output

  Wire.begin(OLED_SDA, OLED_SCL);
  if (!display.begin(SSD1306_SWITCHCAPVCC, 0x3C)) {
    Serial.println("SSD1306 allocation failed");
    for (;;)
      ;
  }

  // Display the project title on power-on
  displayCenteredText("AI CAMERA MINI", 1);
  delay(3000);  // Hold the title screen for 3 seconds

  displayCenteredText("Connecting to WiFi...");
  while (WiFi.status() != WL_CONNECTED) {
    delay(1000);
    Serial.println("Connecting to WiFi...");
  }
  displayCenteredText("WiFi Connected!");
  delay(2000);

  // Zero-initialise so that every field not assigned below has a defined value.
  camera_config_t config = {};
  config.ledc_channel = LEDC_CHANNEL_0;
  config.ledc_timer = LEDC_TIMER_0;
  config.pin_d0 = Y2_GPIO_NUM;
  config.pin_d1 = Y3_GPIO_NUM;
  config.pin_d2 = Y4_GPIO_NUM;
  config.pin_d3 = Y5_GPIO_NUM;
  config.pin_d4 = Y6_GPIO_NUM;
  config.pin_d5 = Y7_GPIO_NUM;
  config.pin_d6 = Y8_GPIO_NUM;
  config.pin_d7 = Y9_GPIO_NUM;
  config.pin_xclk = XCLK_GPIO_NUM;
  config.pin_pclk = PCLK_GPIO_NUM;
  config.pin_vsync = VSYNC_GPIO_NUM;
  config.pin_href = HREF_GPIO_NUM;
  config.pin_sscb_sda = SIOD_GPIO_NUM;
  config.pin_sscb_scl = SIOC_GPIO_NUM;
  config.pin_pwdn = PWDN_GPIO_NUM;
  config.pin_reset = RESET_GPIO_NUM;
  config.xclk_freq_hz = 20000000;
  config.pixel_format = PIXFORMAT_JPEG;
  config.frame_size = FRAMESIZE_QVGA;
  config.jpeg_quality = 10;
  config.fb_count = 1;

  if (esp_camera_init(&config) != ESP_OK) {
    Serial.println("Camera init failed");
    displayCenteredText("Camera Init Failed");
    return;
  }
  cameraReady = true;

  displayCenteredText("Camera Initialized");
  delay(2000);
  displayCenteredText("Press button to capture");
}

/**
 * Captures a JPEG frame, Base64-encodes it, beeps, and hands the encoded
 * image to AnalyzeImage(). Shows an error on the OLED and returns early if
 * the camera is unavailable, a capture fails, or encoding yields no data.
 */
void captureAndAnalyzeImage() {
  if (!cameraReady) {
    Serial.println("Camera not initialized");
    displayCenteredText("Camera Init Failed");
    return;
  }

  Serial.println("Capturing image...");

  // Grab and immediately release one frame: the buffer may still hold the
  // frame from the previous capture, so it is discarded before the real shot.
  camera_fb_t* fb = esp_camera_fb_get();
  if (!fb) {
    Serial.println("Camera capture failed");
    displayCenteredText("Capture Failed");
    return;
  }
  esp_camera_fb_return(fb);

  // Now capture the image that is actually sent.
  fb = esp_camera_fb_get();
  if (!fb) {
    Serial.println("Camera capture failed");
    displayCenteredText("Capture Failed");
    return;
  }
  Serial.println("Image captured");

  String base64Image = encodeImageToBase64(fb->buf, fb->len);

  // The frame buffer is no longer needed once encoded; hand it back before
  // doing anything slow (the beep blocks for 300 ms).
  esp_camera_fb_return(fb);
  beep();

  if (base64Image.isEmpty()) {
    Serial.println("Failed to encode the image!");
    displayCenteredText("Encode Failed");
    return;
  }

  // Send the image to OpenAI for analysis
  AnalyzeImage(base64Image);
}

// Shows the generic API failure message in the top-left corner of the OLED.
static void showApiError() {
  display.clearDisplay();
  display.setTextSize(1);
  display.setTextColor(SSD1306_WHITE);
  display.setCursor(0, 0);
  display.print("API Error");
  display.display();
}

/**
 * Sends a Base64-encoded JPEG to the OpenAI chat completions endpoint with a
 * fixed prompt and scrolls the returned text on the OLED.
 *
 * On a transport error, a non-2xx status, an unparsable body or a response
 * without message content, "API Error" is shown instead.
 *
 * @param base64Image  Base64 text of the JPEG image to analyse.
 */
void AnalyzeImage(const String& base64Image) {
  Serial.println("Sending image for analysis...");
  displayCenteredText("Processing...");

  // Fixed JSON around the image data. The image is by far the largest part of
  // the request, so the payload is assembled once into a pre-sized buffer
  // instead of through several temporary copies.
  static const char kPayloadPrefix[] =
    "{"
    "\"model\": \"gpt-4o\", "
    "\"max_tokens\": 400, "
    "\"messages\": [{\"role\": \"user\", \"content\": "
    "[{\"type\": \"text\", \"text\": \"Summarize the context of this image?\"}, "
    "{\"type\": \"image_url\", \"image_url\": {\"url\": \""
    "data:image/jpeg;base64,";
  static const char kPayloadSuffix[] = "\", \"detail\": \"auto\"}}]}]}";

  String result;
  bool requestOk = false;
  {
    String payload;
    const size_t payloadSize =
      (sizeof(kPayloadPrefix) - 1) + base64Image.length() + (sizeof(kPayloadSuffix) - 1);
    if (!payload.reserve(payloadSize)) {
      Serial.println("[ChatGPT] Error: not enough memory for request payload");
      showApiError();
      return;
    }
    payload += kPayloadPrefix;
    payload += base64Image;
    payload += kPayloadSuffix;

    // Send request and validate response
    requestOk = sendPostRequest(payload, result);
  }  // payload goes out of scope here, before the response is parsed

  if (!requestOk) {
    Serial.print("[ChatGPT] Error: ");
    Serial.println(result);
    showApiError();
    return;
  }

  DynamicJsonDocument responseDoc(4096);
  const DeserializationError parseError = deserializeJson(responseDoc, result);
  if (parseError) {
    Serial.print("[ChatGPT] JSON parse failed: ");
    Serial.println(parseError.c_str());
    showApiError();
    return;
  }

  // Fall back to an empty string when the field is missing or null, so that
  // the literal text "null" is never shown to the user.
  const char* content = responseDoc["choices"][0]["message"]["content"] | "";
  String responseContent(content);
  if (responseContent.isEmpty()) {
    Serial.println("[ChatGPT] Error: response has no message content");
    showApiError();
    return;
  }
  Serial.println("[ChatGPT] Parsed response: " + responseContent);

  // Word-wrap the answer and scroll it one line at a time.
  const int lineHeight = 8;          // Height of each line in pixels
  const size_t maxLineChars = 21;    // Approx. max characters per line
  const size_t visibleLines = 7;
  const int scrollDelay = 2000;      // Delay for scrolling in milliseconds

  const std::vector<String> lines = wrapText(responseContent, maxLineChars);

  display.setTextSize(1);
  display.setTextColor(SSD1306_WHITE);
  for (size_t first = 0; first < lines.size(); first++) {
    display.clearDisplay();
    for (size_t row = 0; row < visibleLines && (first + row) < lines.size(); row++) {
      display.setCursor(0, row * lineHeight);
      display.print(lines[first + row]);
    }
    display.display();
    delay(scrollDelay);
  }

  // Clear display after the response
  display.clearDisplay();
  display.display();
  displayCenteredText("Press button to capture");
}

/**
 * POSTs a JSON payload to the OpenAI chat completions endpoint.
 *
 * @param payload  JSON request body.
 * @param result   Receives the response body, or an error description when
 *                 the request could not be sent.
 * @return true only when a response with a 2xx status was received; false on
 *         a transport error or any other HTTP status (in the latter case
 *         `result` still holds the response body for diagnostics).
 */
bool sendPostRequest(const String& payload, String& result) {
  HTTPClient http;
  if (!http.begin("https://api.openai.com/v1/chat/completions")) {
    result = "HTTP request failed, could not open connection";
    Serial.println(result);
    return false;
  }
  http.addHeader("Content-Type", "application/json");
  http.addHeader("Authorization", "Bearer " + apiKey);
  http.setTimeout(20000);

  Serial.print("Payload size: ");
  Serial.println(payload.length());

  // Use the pointer/length overload: POST(String) takes its argument by value
  // and would duplicate the whole image-bearing payload in RAM.
  const int httpResponseCode = http.POST(
    reinterpret_cast<uint8_t*>(const_cast<char*>(payload.c_str())), payload.length());

  if (httpResponseCode <= 0) {
    result = "HTTP request failed, response code: " + String(httpResponseCode);
    Serial.println("Error Code: " + String(httpResponseCode));
    Serial.println("Error Message: " + http.errorToString(httpResponseCode));
    http.end();
    return false;
  }

  result = http.getString();
  http.end();

  Serial.println("HTTP Response Code: " + String(httpResponseCode));
  Serial.print("Response Body: ");
  Serial.println(result);

  // A positive code only means the server answered; 4xx/5xx are still failures.
  return httpResponseCode >= 200 && httpResponseCode < 300;
}

/**
 * Main loop: when the button (active low, internal pull-up) is pressed,
 * capture an image and run the analysis.
 */
void loop() {
  if (digitalRead(BUTTON_PIN) == LOW) {
    Serial.println("Button pressed! Capturing image...");
    displayCenteredText("Capturing...");
    captureAndAnalyzeImage();
    delay(1000);  // Small delay to debounce button press
  }
}

// Drives the buzzer pin high for 300 ms.
void beep() {
  digitalWrite(BUZZER_PIN, HIGH);
  delay(300);
  digitalWrite(BUZZER_PIN, LOW);
}
Comments