Dhruv Bhat
Published © CC BY

Exploring RAG on NVIDIA Jetson TX2

In this blog, we explore how the NVIDIA Jetson TX2 can power Retrieval-Augmented Generation (RAG) for LLMs, using CLIP embeddings and a FAISS index for retrieval.

Intermediate · Full instructions provided · 7 hours

Things used in this project

Hardware components

NVIDIA Jetson TX2
×1

Software apps and online services

Microsoft VS Code
PyTorch with CUDA

Story


Code

image_embedding.py

Python
import os
import torch
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
import faiss
import pickle
from dotenv import load_dotenv

load_dotenv()

def get_model_info(model_ID, device):
    # Load the CLIP model, image processor, and text tokenizer
    model = CLIPModel.from_pretrained(model_ID).to(device)
    processor = CLIPProcessor.from_pretrained(model_ID)
    tokenizer = CLIPTokenizer.from_pretrained(model_ID)
    return model, processor, tokenizer

def get_single_text_embedding(text, model, tokenizer, device):
    # Embed a text string and L2-normalize it so that inner-product
    # search in FAISS is equivalent to cosine similarity
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)
    embedding_as_np = text_embeddings.cpu().numpy()
    embedding_as_np = embedding_as_np / np.linalg.norm(embedding_as_np)
    return embedding_as_np

def get_single_image_embedding(image_path, model, processor, device):
    # Embed a single image and L2-normalize it
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        image_embeddings = model.get_image_features(**inputs)
    embedding_as_np = image_embeddings.cpu().numpy()
    embedding_as_np = embedding_as_np / np.linalg.norm(embedding_as_np)
    return embedding_as_np

# Use the TX2's GPU when available
device = "cuda" if torch.cuda.is_available() else "cpu"
model_ID = "openai/clip-vit-base-patch32"
model, processor, tokenizer = get_model_info(model_ID, device)

folder_path = r"Folder path containing images"  # set this to your image directory
image_embeddings = []
image_paths = []

# Embed every JPEG in the folder (case-insensitive extension check)
for filename in os.listdir(folder_path):
    if filename.lower().endswith((".jpg", ".jpeg")):
        image_path = os.path.join(folder_path, filename)
        image_embedding = get_single_image_embedding(image_path, model, processor, device)
        image_embeddings.append(image_embedding)
        image_paths.append(image_path)

# Stack the (1, d) embeddings into an (N, d) matrix for FAISS
image_embeddings = np.vstack(image_embeddings)
dimension = image_embeddings.shape[1]

# Create a FAISS inner-product index; because the embeddings are
# L2-normalized, inner-product scores are cosine similarities
index = faiss.IndexFlatIP(dimension)
index.add(image_embeddings)

# Persist the index and the parallel list of image paths for retrieval.py
faiss.write_index(index, "image_paths_clip.faiss")

print(f"Index type: {type(index)}")
print(f"Number of vectors: {index.ntotal}")
print(f"Dimensionality of vectors: {index.d}")

with open("image_paths_clip.pkl", "wb") as f:
    pickle.dump(image_paths, f)
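
As a quick sanity check, the short sketch below (reusing the names defined above; not part of the original script) queries the freshly built index with one of its own image embeddings. The top hit should be the image itself with a cosine similarity close to 1.0.

# Sanity check: each indexed image should retrieve itself with similarity ~1.0
probe = get_single_image_embedding(image_paths[0], model, processor, device)
D, I = index.search(probe.astype(np.float32), 3)
for score, idx in zip(D[0], I[0]):
    print(f"{image_paths[idx]} -> cosine similarity {score:.3f}")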

retrieval.py

Python
import os
import torch
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
import faiss
import pickle
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Load environment variables if using dotenv
load_dotenv()

# Function to retrieve CLIP model, processor, and tokenizer
def get_model_info(model_ID, device):
    model = CLIPModel.from_pretrained(model_ID).to(device)
    processor = CLIPProcessor.from_pretrained(model_ID)
    tokenizer = CLIPTokenizer.from_pretrained(model_ID)
    return model, processor, tokenizer

# Function to get the L2-normalized text embedding (so inner-product
# search returns cosine similarities)
def get_single_text_embedding(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)
    embedding_as_np = text_embeddings.cpu().numpy()
    embedding_as_np = embedding_as_np / np.linalg.norm(embedding_as_np)
    return embedding_as_np

# Function to query the FAISS index and return the image paths whose
# cosine similarity to the query meets the threshold (higher = better)
def query_index(query_text, model, tokenizer, index, image_paths, device, k=1, threshold=0.2):
    text_embedding = get_single_text_embedding(query_text, model, tokenizer, device)
    D, I = index.search(text_embedding, k)  # D: similarity scores, I: vector indices

    retrieved_indices = I[0]
    retrieved_scores = D[0]

    results = []
    for idx, score in zip(retrieved_indices, retrieved_scores):
        if score >= threshold:
            matching_image_path = image_paths[idx]
            results.append((matching_image_path, score))

    return results

# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model_ID = "openai/clip-vit-base-patch32"

# Load CLIP model, processor, and tokenizer
model, processor, tokenizer = get_model_info(model_ID, device)

# Load the FAISS index and the pickled list of image paths
index = faiss.read_index("image_paths_clip.faiss")
with open("image_paths_clip.pkl", "rb") as f:
    image_paths = pickle.load(f)

# Example text prompt for querying (replace with your own query)
text_prompt = "Prompt here"

# Perform query and retrieve results
results = query_index(text_prompt, model, tokenizer, index, image_paths, device, k=1)

# Display results using Matplotlib
if not results:
    print("No matching image path found.")
else:
    for matching_image_path, score in results:
        print(f"Matching Image Path: {matching_image_path}, Cosine Similarity: {score:.4f}")
        if os.path.exists(matching_image_path):
            matching_image = Image.open(matching_image_path)

            # Display the retrieved image with its similarity score
            plt.imshow(matching_image)
            plt.title(f"Prompt: {text_prompt}\nCosine Similarity: {score:.2f}")
            plt.axis('off')  # Hide axes
            plt.show()
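
Retrieval is only half of a RAG pipeline. The sketch below illustrates the generation step under two assumptions that go beyond the scripts above: each indexed image has a short text caption available (the indexing script does not store captions), and a small Hugging Face text-generation model stands in for whichever LLM you deploy on the TX2. The retrieved captions are injected into the prompt so the model's answer is grounded in the retrieved content, which is the core idea of RAG.

from transformers import pipeline

# Hypothetical caption store: maps each image path to a short description.
# The indexing script above does not create this; supply your own captions.
captions = {path: "a short description of " + os.path.basename(path)
            for path in image_paths}

# A small model keeps memory usage manageable on the TX2; swap in your own LLM
generator = pipeline("text-generation", model="distilgpt2",
                     device=0 if torch.cuda.is_available() else -1)

# Build an augmented prompt from the retrieved results
context = "\n".join(captions[path] for path, score in results)
prompt = f"Context:\n{context}\n\nQuestion: {text_prompt}\nAnswer:"
print(generator(prompt, max_new_tokens=64)[0]["generated_text"])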

Credits

Dhruv Bhat
Software Developer
