Aditya Lambat

Shop-Sense: A Shopping Guide for Visually Impaired People

Smart spectacles designed to help visually impaired people navigate easily and shop independently in a mall.

Intermediate • Work in progress

Things used in this project

Hardware components

Seeed Studio XIAO ESP32S3 Sense ×1
Blues Notecarrier A ×1
Blues Notecard (Wi-Fi) ×1
SparkFun IMU Breakout - MPU-9250 ×1
STMicroelectronics - STM32F411 ×1

Software apps and online services

Arduino IDE
Microsoft Visual Studio 2017
JetBrains Datalore

Hand tools and fabrication machines

Soldering iron (generic)

Story


Custom parts and enclosures

Image

Blend file (Sketchfab embed)

STL File (Sketchfab embed)

Schematics

Schematics

Preliminary Design Concept

Code

Camera_Microphone.ino

C/C++
#define CAMERA_MODEL_XIAO_ESP32S3

#include <WiFi.h>
#include <HTTPClient.h>
#include <base64.h>
#include <esp_camera.h>
#include <Arduino.h>
#include <esp_system.h>
#include <esp_heap_caps.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include "camera_pins.h"
#include <ESP_I2S.h>
#include "FS.h"
#include "SD.h"
#include "SPI.h"
#include <vector>

I2SClass I2S;

#define SAMPLE_RATE 16000U
#define SAMPLE_BITS 16
#define WAV_HEADER_SIZE 44
#define BUFFER_SIZE 1024
#define RECORD_TIME 5   // seconds
#define WAV_FILE_NAME "arduino_rec_DI2S"

int16_t i2s_buffer[BUFFER_SIZE];

std::vector<uint8_t> captureImage();

const char* ssid = "your-ssid";
const char* password = "your-password";
const char* serverName = "your server/upload";
const char* currencyServer = "your server/currencyIdentification"; // Replace with your server's currency identification endpoint
const char* server = "your server/speechtoText";
const char* videoStreamServer = "your server/videoStream"; // Endpoint used by videoStreamTask below; fill in like the others

void setup() {
  Serial.begin(115200);
  WiFi.begin(ssid, password);

  while (WiFi.status() != WL_CONNECTED) {
    delay(1000);
    Serial.println("Connecting to WiFi...");
  }
  Serial.println("Connected to WiFi");

  camera_config_t config;
  Serial.println("Starting camera work!");
  config.ledc_channel = LEDC_CHANNEL_0;
  config.ledc_timer = LEDC_TIMER_0;
  config.pin_d0 = Y2_GPIO_NUM;
  config.pin_d1 = Y3_GPIO_NUM;
  config.pin_d2 = Y4_GPIO_NUM;
  config.pin_d3 = Y5_GPIO_NUM;
  config.pin_d4 = Y6_GPIO_NUM;
  config.pin_d5 = Y7_GPIO_NUM;
  config.pin_d6 = Y8_GPIO_NUM;
  config.pin_d7 = Y9_GPIO_NUM;
  config.pin_xclk = XCLK_GPIO_NUM;
  config.pin_pclk = PCLK_GPIO_NUM;
  config.pin_vsync = VSYNC_GPIO_NUM;
  config.pin_href = HREF_GPIO_NUM;
  config.pin_sscb_sda = SIOD_GPIO_NUM;
  config.pin_sscb_scl = SIOC_GPIO_NUM;
  config.pin_pwdn = PWDN_GPIO_NUM;
  config.pin_reset = RESET_GPIO_NUM;
  config.xclk_freq_hz = 20000000;
  config.pixel_format = PIXFORMAT_JPEG;
  config.frame_size = FRAMESIZE_UXGA;
  config.grab_mode = CAMERA_GRAB_WHEN_EMPTY;
  config.fb_location = CAMERA_FB_IN_PSRAM;
  config.jpeg_quality = 12;
  config.fb_count = 1;

  if(config.pixel_format == PIXFORMAT_JPEG){
    if(psramFound()){
      config.jpeg_quality = 10;
      config.fb_count = 2;
      config.grab_mode = CAMERA_GRAB_LATEST;
    } else {
      config.frame_size = FRAMESIZE_SVGA;
      config.fb_location = CAMERA_FB_IN_DRAM;
    }
  } else {
    config.frame_size = FRAMESIZE_240X240;
  #if CONFIG_IDF_TARGET_ESP32S3
    config.fb_count = 2;
  #endif
  }

  esp_err_t err = esp_camera_init(&config);
  if (err != ESP_OK) {
    Serial.printf("Camera init failed with error 0x%x", err);
    return;
  }

  Serial.println("I2S initializing");
  I2S.setPinsPdmRx(42, 41);
    if (!I2S.begin(I2S_MODE_PDM_RX, 16000, I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO)) {
        Serial.println("Failed to initialize I2S!");
        while (1) ;
    }
    Serial.println("I2S Successful");

    if (!SD.begin(21)) {
        Serial.println("Failed to mount SD Card!");
        while (1);
    }

  xTaskCreatePinnedToCore(
    videoStreamTask,     // Function to implement the task
    "VideoStreamTask",   // Name of the task
    4096,                // Stack size in words
    NULL,                // Task input parameter
    1,                   // Priority of the task
    NULL,                // Task handle
    1                    // Core where the task should run
  );
  
}

void loop() {
  String transcribed = record_wav();
  if (transcribed == "capture" || transcribed == "captured") {
    captureAndSendImage();
  } else if (transcribed == "value" || transcribed == "currency") {
    captureAndSendToCurrencyModule();
  }
  delay(60000);
}

void captureAndSendImage() {

  std::vector<uint8_t> imageData = captureImage();
  String imageBase64 = base64::encode(imageData.data(), imageData.size());

  if (WiFi.status() == WL_CONNECTED) {
    HTTPClient http;
    http.begin(serverName);
    http.addHeader("Content-Type", "application/json");

    String jsonPayload = "{\"image\":\"" + imageBase64 + "\"}";
    int httpResponseCode = http.POST(jsonPayload);

    if (httpResponseCode > 0) {
      String response = http.getString();
      Serial.println(httpResponseCode);
      Serial.println(response);
    } else {
      Serial.print("Error on sending POST: ");
      Serial.println(httpResponseCode);
    }

    http.end();
  } else {
    Serial.println("WiFi Disconnected");
  }
}

void captureAndSendToCurrencyModule() {
  std::vector<uint8_t> imageData = captureImage();
  String imageBase64 = base64::encode(imageData.data(), imageData.size());

  if (WiFi.status() == WL_CONNECTED) {
    HTTPClient http;
    http.begin(currencyServer); // Use the currency identification endpoint
    http.addHeader("Content-Type", "application/json");

    String jsonPayload = "{\"image\":\"" + imageBase64 + "\"}";
    int httpResponseCode = http.POST(jsonPayload);

    if (httpResponseCode > 0) {
      String response = http.getString();
      Serial.println("Currency Denomination: " + response);  // Print the denomination value
    } else {
      Serial.print("Error on sending POST: ");
      Serial.println(httpResponseCode);
    }

    http.end();
  } else {
    Serial.println("WiFi Disconnected");
  }
}

std::vector<uint8_t> captureImage() {
  camera_fb_t *fb = esp_camera_fb_get();
  if (!fb) {
    Serial.println("Camera capture failed");
    return std::vector<uint8_t>();
  }
  std::vector<uint8_t> imageData(fb->buf, fb->buf + fb->len);
  esp_camera_fb_return(fb);
  return imageData;
}

String record_wav() {
    delay(2000); 

    uint32_t record_size = (SAMPLE_RATE * SAMPLE_BITS / 8) * RECORD_TIME;
    uint8_t *rec_buffer = NULL;
    size_t bytes_written = 0;

    Serial.printf("Ready to start recording ...\n");

    File file = SD.open("/" WAV_FILE_NAME ".wav", FILE_WRITE);
    if (!file) {
        Serial.println("Failed to open WAV file for writing!");
        return "";
    }

    // Write the header to the WAV file
    uint8_t wav_header[WAV_HEADER_SIZE];
    generate_wav_header(wav_header, record_size, SAMPLE_RATE);
    file.write(wav_header, WAV_HEADER_SIZE);

    // Allocate buffer for recording
    rec_buffer = (uint8_t *)malloc(BUFFER_SIZE);
    if (rec_buffer == NULL) {
        Serial.printf("malloc failed!\n");
        while (1);
    }

    // Start recording
    for (int i = 0; i < (RECORD_TIME * SAMPLE_RATE * SAMPLE_BITS / 8) / BUFFER_SIZE; ++i) {
        for (int j = 0; j < BUFFER_SIZE / 2; ++j) {
            int sample = I2S.read();
            if (sample == -1) {
                Serial.printf("Read failed\n");
            } else {
                ((int16_t*)rec_buffer)[j] = sample;
            }
        }
        file.write(rec_buffer, BUFFER_SIZE);
        bytes_written += BUFFER_SIZE;
    }

    free(rec_buffer);
    file.close();
    Serial.printf("The recording is over. Total bytes written: %d\n", bytes_written);

    if (WiFi.status() == WL_CONNECTED) {
        String answer = uploadFile();
        return answer;
    }
    return "Device not connected";
}

void generate_wav_header(uint8_t *wav_header, uint32_t wav_size, uint32_t sample_rate) {
    uint32_t file_size = wav_size + WAV_HEADER_SIZE - 8;
    uint32_t byte_rate = SAMPLE_RATE * SAMPLE_BITS / 8;
    const uint8_t set_wav_header[] = {
        'R', 'I', 'F', 'F', // ChunkID
        (uint8_t)file_size, (uint8_t)(file_size >> 8), (uint8_t)(file_size >> 16), (uint8_t)(file_size >> 24), // ChunkSize
        'W', 'A', 'V', 'E', // Format
        'f', 'm', 't', ' ', // Subchunk1ID
        0x10, 0x00, 0x00, 0x00, // Subchunk1Size (16 for PCM)
        0x01, 0x00, // AudioFormat (1 for PCM)
        0x01, 0x00, // NumChannels (1 channel)
        (uint8_t)sample_rate, (uint8_t)(sample_rate >> 8), (uint8_t)(sample_rate >> 16), (uint8_t)(sample_rate >> 24), // SampleRate
        (uint8_t)byte_rate, (uint8_t)(byte_rate >> 8), (uint8_t)(byte_rate >> 16), (uint8_t)(byte_rate >> 24), // ByteRate
        0x02, 0x00, // BlockAlign (NumChannels * BitsPerSample / 8)
        SAMPLE_BITS, 0x00, // BitsPerSample
        'd', 'a', 't', 'a', // Subchunk2ID
        (uint8_t)wav_size, (uint8_t)(wav_size >> 8), (uint8_t)(wav_size >> 16), (uint8_t)(wav_size >> 24) // Subchunk2Size
    };
    memcpy(wav_header, set_wav_header, sizeof(set_wav_header));
}

String uploadFile() {
    File file = SD.open("/" WAV_FILE_NAME ".wav");

    if (!file) {
        Serial.println("Failed to open file for reading");
        return "";
    }

    String result = "Failure";
    if (WiFi.status() == WL_CONNECTED) {
        HTTPClient http;
        http.begin(server);
        http.addHeader("Content-Type", "audio/wav"); // The server's /speechtoText route expects audio/wav

        // Stream the WAV file directly; copying binary audio into a String
        // would truncate at the first zero byte and corrupt the upload.
        int httpResponseCode = http.sendRequest("POST", &file, file.size());

        if (httpResponseCode > 0) {
            result = http.getString();
            Serial.println(result);
        } else {
            Serial.print("Error on sending POST: ");
            Serial.println(httpResponseCode);
        }

        http.end();
    } else {
        Serial.println("WiFi Disconnected");
    }

    file.close();
    return result;
}

void videoStreamTask(void *pvParameters) {
  while (true) {
    captureAndSendVideoFrame();
    delay(100); // Adjust the delay as needed for frame rate
  }
}

void captureAndSendVideoFrame() {
  std::vector<uint8_t> imageData = captureImage();
  String imageBase64 = base64::encode(imageData.data(), imageData.size());

  if (WiFi.status() == WL_CONNECTED) {
    HTTPClient http;
    http.begin(videoStreamServer);
    http.addHeader("Content-Type", "application/json");

    String jsonPayload = "{\"frame\":\"" + imageBase64 + "\"}";
    int httpResponseCode = http.POST(jsonPayload);

    if (httpResponseCode > 0) {
      String response = http.getString();
      Serial.println(httpResponseCode);
      Serial.println(response);
    } else {
      Serial.print("Error on sending POST: ");
      Serial.println(httpResponseCode);
    }

    http.end();
  } else {
    Serial.println("WiFi Disconnected");
  }
}
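
The firmware above posts each JPEG as a Base64 string inside a JSON body. For bench-testing without the XIAO board, a small Python client can reproduce the same payload against the Flask server in the next section; this is a minimal sketch that assumes the server runs locally on port 5000 and that a test image named test.jpg exists.

import base64
import requests  # pip install requests

SERVER = "http://localhost:5000"  # assumed local address of the Flask server below

def post_image(endpoint, path="test.jpg"):
    """Send an image the same way the firmware does: Base64 inside a JSON body."""
    with open(path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    resp = requests.post(SERVER + endpoint, json={"image": image_b64}, timeout=10)
    print(endpoint, resp.status_code, resp.text)

post_image("/upload")                  # mirrors captureAndSendImage()
post_image("/currencyIdentification")  # mirrors captureAndSendToCurrencyModule()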

Flask Server

Python
from flask import Flask, request, jsonify
import firebase_admin
from firebase_admin import credentials, firestore
import logging
import deepspeech
import wave
import numpy as np
import os
import io
import base64
from datetime import datetime
import tensorflow as tf
from PIL import Image

logging.basicConfig(level=logging.DEBUG)

app = Flask(__name__)
image_data = None

cred = credentials.Certificate("your certificate json file")
firebase_admin.initialize_app(cred)
db = firestore.client()

# Use forward slashes (or raw strings) so the backslashes aren't treated as escapes
model_path = 'FlaskServer/deepspeech-0.9.3-models.pbmm'
scorer_path = 'FlaskServer/deepspeech-0.9.3-models.scorer'
stt_model = deepspeech.Model(model_path)  # named stt_model so the Keras model below doesn't overwrite it
stt_model.enableExternalScorer(scorer_path)

@app.route('/notehubWebhook', methods=['POST'])
def notehub_webhook():
    logging.debug('Received POST request')
    raw_data = request.data
    logging.debug(f'Raw data: {raw_data}')
    
    try:
        data = request.get_json()
        logging.debug(f'Received data: {data}')
        if data is None:
            raise ValueError("No JSON data received")
        data['creationTimestamp'] = datetime.utcnow()
        db.collection('notehub-data').add(data)
        return jsonify({"status": "success", "message": "Data received and saved"}), 200
    except Exception as e:
        logging.error(f'Error processing request: {e}')
        return jsonify({"status": "error", "message": str(e)}), 500


@app.route('/latestData', methods=['GET'])
def get_latest_data():
    try:
        docs = db.collection('notehub-data').order_by('creationTimestamp', direction=firestore.Query.DESCENDING).limit(1).stream()
        latest_data = None
        for doc in docs:
            latest_data = doc.to_dict()
        
        if latest_data:
            return jsonify(latest_data), 200
        else:
            return jsonify({"error": "No data found"}), 404
    except Exception as e:
        logging.error(f'Error retrieving data: {e}')
        return jsonify({"error": str(e)}), 500

@app.route('/speechtoText', methods=['POST'])
def transcribe_audio():
    if 'audio/wav' in request.headers.get('Content-Type', ''):
        audio_data = request.data
        with open("received_audio.wav", "wb") as f:
            f.write(audio_data)
        
        with wave.open("received_audio.wav", "rb") as wf:
            audio = np.frombuffer(wf.readframes(wf.getnframes()), np.int16)
            text = stt_model.stt(audio)
        
        os.remove("received_audio.wav")
        return text
    else:
        return "Unsupported media type", 415

@app.route('/upload', methods=['POST', 'GET'])
def upload_image():
    global image_data
    if request.method == 'POST':
        try:
            data = request.get_json()
            if 'image' in data:
                image_data = base64.b64decode(data['image'])
                with open("received_image.jpg", "wb") as f:
                    f.write(image_data)
                return jsonify({"status": "Image received"}), 200
            else:
                return jsonify({"error": "Invalid data"}), 400
        except Exception as e:
            return jsonify({"error": str(e)}), 500
    else:
        return "<h1>Upload endpoint for POST requests only</h1>"
    
currency_model = tf.keras.models.load_model('path_to_your_model.h5')

def recognize_currency(image):
    image = image.convert('RGB')
    image = image.resize((224, 224))   # match the input size the model was trained on
    image = np.array(image)
    image = np.expand_dims(image, axis=0)
    image = image / 255.0

    predictions = currency_model.predict(image)

    denomination = np.argmax(predictions)
    return int(denomination)           # plain int so jsonify can serialize it

@app.route('/currencyIdentification', methods=['POST'])
def currency_identification():
    data = request.json
    if 'image' not in data:
        return jsonify({'error': 'No image found in request'}), 400

    image_base64 = data['image']
    try:
        image_data = base64.b64decode(image_base64)
        image = Image.open(io.BytesIO(image_data))
        denomination = recognize_currency(image)
        return jsonify({'denomination': denomination}), 200

    except Exception as e:
        print(f'Error: {e}')
        return jsonify({'error': 'Error processing image'}), 500

@app.route('/image', methods=['GET'])
def get_image():
    if image_data:
        return base64.b64encode(image_data).decode('utf-8'), 200
    else:
        return jsonify({"error": "No image available"}), 404

@app.route('/')
def home():
    return "Hello, Flask!"

if __name__ == '__main__':
    app.run(debug=True, port=5000)
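
The /speechtoText route expects raw WAV bytes with an audio/wav content type, which is exactly what uploadFile() streams from the SD card. As a quick check of the DeepSpeech pipeline from a laptop, a recording can be posted directly; a minimal sketch, assuming a local server and a 16 kHz, 16-bit mono file named arduino_rec_DI2S.wav:

import requests  # pip install requests

# Post the raw WAV bytes, mirroring the firmware's streamed upload
with open("arduino_rec_DI2S.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:5000/speechtoText",
        data=f.read(),
        headers={"Content-Type": "audio/wav"},
        timeout=30,
    )
print(resp.status_code, resp.text)  # transcription, e.g. "capture"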

Model_Training

Python
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Lambda, Dense, Dropout
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Layer
from collections import defaultdict
from os.path import join
import numpy as np
from PIL import Image
import os

path_train = 'your dataset path'
batch_size = 2
embedding_dim = 512

def l2_normalize(x):
    return x / np.sqrt(np.sum(x**2, axis=-1, keepdims=True))

def read_and_resize(filepath, retries=3):
    try:
        im = Image.open(filepath).convert('RGB')
        im = im.resize((224, 224))
        return np.array(im, dtype="float32")
    except Exception as e:
        if retries > 0:
            print(f"Error reading {filepath}: {e}. Retrying ({retries} attempts left)")
            return read_and_resize(filepath, retries - 1)
        print(f"Error reading {filepath}: {e}. Giving up.")
        return None

def augment(im_array):
    if np.random.uniform(0, 1) > 0.9:
        im_array = np.fliplr(im_array)
    return im_array

class sample_gen(object):
    def __init__(self, file_class_mapping, other_class="new"):
        self.file_class_mapping = file_class_mapping
        self.class_to_list_files = defaultdict(list)
        self.list_other_class = []
        self.list_all_files = list(file_class_mapping.keys())
        self.range_all_files = list(range(len(self.list_all_files)))

        for file, class_ in file_class_mapping.items():
            if class_ == other_class:
                self.list_other_class.append(file)
            else:
                self.class_to_list_files[class_].append(file)

        self.list_classes = list(set(self.file_class_mapping.values()))
        self.range_list_classes = range(len(self.list_classes))
        self.class_weight = np.array([len(self.class_to_list_files[class_]) for class_ in self.list_classes])
        self.class_weight = self.class_weight / np.sum(self.class_weight)

    def get_sample(self):
        class_idx = np.random.choice(self.range_list_classes, 1, p=self.class_weight)[0]
        examples_class_idx = np.random.choice(range(len(self.class_to_list_files[self.list_classes[class_idx]])), 2)
        positive_example_1, positive_example_2 = \
            self.class_to_list_files[self.list_classes[class_idx]][examples_class_idx[0]], \
            self.class_to_list_files[self.list_classes[class_idx]][examples_class_idx[1]]

        negative_example = None
        while negative_example is None or self.file_class_mapping[negative_example] == \
                self.file_class_mapping[positive_example_1]:
            negative_example_idx = np.random.choice(self.range_all_files, 1)[0]
            negative_example = self.list_all_files[negative_example_idx]
        return positive_example_1, negative_example, positive_example_2

def gen(triplet_gen):
    while True:
        list_positive_examples_1 = []
        list_negative_examples = []
        list_positive_examples_2 = []

        for i in range(batch_size):
            positive_example_1, negative_example, positive_example_2 = triplet_gen.get_sample()
            path_pos1 = join(path_train, positive_example_1)
            path_neg = join(path_train, negative_example)
            path_pos2 = join(path_train, positive_example_2)
            
            positive_example_1_img = read_and_resize(path_pos1)
            negative_example_img = read_and_resize(path_neg)
            positive_example_2_img = read_and_resize(path_pos2)

            if positive_example_1_img is None or negative_example_img is None or positive_example_2_img is None:
                print(f"Skipping batch due to None image: {positive_example_1}, {negative_example}, {positive_example_2}")
                continue

            positive_example_1_img = augment(positive_example_1_img)
            negative_example_img = augment(negative_example_img)
            positive_example_2_img = augment(positive_example_2_img)
            
            list_positive_examples_1.append(positive_example_1_img)
            list_negative_examples.append(negative_example_img)
            list_positive_examples_2.append(positive_example_2_img)

        A = preprocess_input(np.array(list_positive_examples_1))
        B = preprocess_input(np.array(list_positive_examples_2))
        C = preprocess_input(np.array(list_negative_examples))
        
        # Length must match the images actually collected (some may have been skipped above)
        label = np.zeros((len(list_positive_examples_1),))
        
        print(f"Yielding batch - A: {A.shape}, B: {B.shape}, C: {C.shape}")
        yield {'anchor_input': A, 'positive_input': B, 'negative_input': C}, label

class MACPooling(Layer):
    def __init__(self, **kwargs):
        super(MACPooling, self).__init__(**kwargs)

    def call(self, inputs):
        return tf.reduce_max(inputs, axis=[1, 2])

class TripletLossLayer(Layer):
    def __init__(self, margin=1.0, **kwargs):
        self.margin = margin
        super(TripletLossLayer, self).__init__(**kwargs)

    def call(self, inputs):
        anchor, positive, negative = inputs
        pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
        loss = tf.maximum(pos_dist - neg_dist + self.margin, 0.0)
        self.add_loss(tf.reduce_mean(loss))
        return loss

def GetModel(image_size, embedding_dim):
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(image_size, image_size, 3))
    base_model_output = base_model.get_layer('block4_pool').output

    x = MACPooling()(base_model_output)
    x = Dropout(0.6)(x)
    x = Dense(embedding_dim)(x)
    x = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x)

    embedding_model = Model(base_model.input, x, name="embedding")
    input_shape = (image_size, image_size, 3)
    anchor_input = Input(input_shape, name='anchor_input')
    positive_input = Input(input_shape, name='positive_input')
    negative_input = Input(input_shape, name='negative_input')
    
    anchor_embedding = embedding_model(anchor_input)
    positive_embedding = embedding_model(positive_input)
    negative_embedding = embedding_model(negative_input)

    triplet_loss_layer = TripletLossLayer(name='triplet_loss_layer')([anchor_embedding, positive_embedding, negative_embedding])

    # Output the loss tensor itself so the dummy loss used at compile time minimises it
    triplet_model = Model(inputs=[anchor_input, positive_input, negative_input], outputs=triplet_loss_layer)

    return embedding_model, triplet_model

# Example mapping from image filename to product class; replace with your dataset's mapping
file_class_mapping = {
    'Image1.jpeg': 'class1',
    'Image2.jpeg': 'class1',
    'Image3.jpg': 'class2',
    'Image4.jpg': 'class2',
}

triplet_gen = sample_gen(file_class_mapping)
train_generator = gen(triplet_gen)

steps_per_epoch = len(file_class_mapping) // batch_size
epochs = 10
def dummy_loss(y_true, y_pred):
    return tf.reduce_mean(y_pred)


embedding_model, triplet_model = GetModel(image_size=224, embedding_dim=embedding_dim)
triplet_model.compile(loss=dummy_loss, optimizer=Adam(learning_rate=0.0001))

# Sanity-check one batch from the generator before training
sample_batch = next(train_generator)
print(f"Sample batch shapes - {[v.shape for v in sample_batch[0].values()]}")

# Train the model; the generator already supplies the (dummy) labels
history = triplet_model.fit(train_generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            verbose=1,
                            shuffle=True,
                            initial_epoch=0)
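
At lookup time it is the embedding_model, not the triplet wrapper, that matters: embed a gallery of known product images once, then rank a new capture by distance. Because the embeddings are L2-normalised, Euclidean distance orders candidates the same way as cosine similarity. A minimal retrieval sketch reusing the helpers above (query_image.jpg is a hypothetical new capture placed in the training folder):

def embed(filename):
    """Embed one image with the trained model."""
    img = read_and_resize(join(path_train, filename))
    batch = preprocess_input(np.expand_dims(img, axis=0))
    return embedding_model.predict(batch)[0]

# Embed the known product gallery once
gallery = {name: embed(name) for name in file_class_mapping}

# Rank a hypothetical new capture against the gallery
query = embed('query_image.jpg')
best = min(gallery, key=lambda name: np.linalg.norm(gallery[name] - query))
print('Closest match:', best, '->', file_class_mapping[best])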

Credits

Aditya Lambat