I wanted to see whether high-quality gene sequence data could be created through machine learning techniques. The code provided here illustrates my thought process and is only a rudimentary implementation of the idea: due to life events (primarily buying a new home and selling the old one), I did not have as much time as I would have liked to devote to this project.
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load the DNA-BERT tokenizer and model
# (the checkpoint id below is a placeholder; published DNABERT checkpoints are
# encoder-only BERT models, so the seq2seq head here is purely illustrative)
tokenizer = AutoTokenizer.from_pretrained("dsmilkov/dnabert-base")
model = AutoModelForSeq2SeqLM.from_pretrained("dsmilkov/dnabert-base")
# Initialize the RL agent
agent = DRL_Agent(model, tokenizer)
# Number of synthetic CRISPR gRNA encoding data sequences to generate
num_sequences = 100
# Generate the synthetic sequences
synthetic_sequences = []
for _ in range(num_sequences):
    sequence = agent.generate_sequence()
    synthetic_sequences.append(sequence)
# Evaluate the generated sequences with the model
# (padding is needed because the generated sequences may differ in length)
inputs = tokenizer(synthetic_sequences, return_tensors="pt", padding=True)
predictions = model.generate(inputs.input_ids)
predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
# Identify anomalous DNA edit sites
anomalous_sites = []
for i, prediction in enumerate(predictions):
    if "N" in prediction:
        anomalous_sites.append(i)
# Retrain the RL agent on the anomalous sequences
agent.retrain([synthetic_sequences[i] for i in anomalous_sites])
# Generate a new set of synthetic sequences using the retrained agent
new_synthetic_sequences = []
for _ in range(num_sequences):
    sequence = agent.generate_sequence()
    new_synthetic_sequences.append(sequence)
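The `DRL_Agent` class used above is not defined anywhere; I never got as far as implementing it. A minimal stub of the interface the snippet assumes might look like the following (the random sampling and no-op retraining are placeholders, not a real deep RL policy):
import random
class DRL_Agent:
    def __init__(self, model, tokenizer, seq_length=20):
        self.model = model
        self.tokenizer = tokenizer
        self.seq_length = seq_length
    def generate_sequence(self):
        # Placeholder policy: sample a uniformly random sequence
        return ''.join(random.choice('ACGT') for _ in range(self.seq_length))
    def retrain(self, sequences):
        # Placeholder: a real agent would update its policy from these examples
        pass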
import pandas as pd
import random
# Define the parameters for the synthetic data
num_samples = 1000  # Number of DNA samples to generate
seq_length = 1000  # Length of each DNA sequence
num_crispr_sites = 10  # Number of CRISPR-Cas9 binding sites per sequence
site_length = 8  # Length of each binding site
# Generate a list of random DNA sequences
dna_sequences = []
for _ in range(num_samples):
    sequence = ''.join(random.choice('ATCG') for _ in range(seq_length))
    dna_sequences.append(sequence)
# Extract a list of CRISPR-Cas9 binding sites from each sequence
crispr_sites = []
for sequence in dna_sequences:
    sites = []
    for i in range(num_crispr_sites):
        # Select a random start position for the binding site
        start = random.randint(0, seq_length - site_length)
        # Slice the binding site out of the sequence
        sites.append(sequence[start:start + site_length])
    crispr_sites.append(sites)
# Create a pandas DataFrame with the DNA sequences and CRISPR-Cas9 binding sites
df = pd.DataFrame({'dna_sequence': dna_sequences, 'crispr_sites': crispr_sites})
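To feed sequences like these into the convolutional network defined next, they first need a numeric encoding. A minimal sketch of one-hot encoding a DNA string into a (channels, length) tensor; note the CNN below expects 10 input channels, so in practice additional feature channels would have to be stacked on top of the 4 base channels:
import torch
def one_hot_encode(sequence):
    # Map each base to a channel index and build a (4, len) one-hot tensor
    base_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = torch.zeros(4, len(sequence))
    for position, base in enumerate(sequence):
        encoded[base_to_idx[base], position] = 1.0
    return encoded
x = one_hot_encode(df['dna_sequence'][0])  # shape: (4, 1000)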
import torch
import torch.nn as nn
import torch.nn.functional as F
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        # 1D convolutional layers
        self.conv1 = nn.Conv1d(in_channels=10, out_channels=30, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=30, out_channels=100, kernel_size=5, padding=2)
        self.conv3 = nn.Conv1d(in_channels=100, out_channels=200, kernel_size=7, padding=3)
        # Max pooling layers
        self.pool1 = nn.MaxPool1d(kernel_size=5)
        self.pool2 = nn.MaxPool1d(kernel_size=5)
        self.pool3 = nn.MaxPool1d(kernel_size=5)
        # Dense layers (the flattened size assumes an input length of 12,000:
        # three max-poolings of 5 reduce 12,000 -> 2,400 -> 480 -> 96 positions)
        self.fc1 = nn.Linear(in_features=200 * 96, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=10)
        self.fc3 = nn.Linear(in_features=10, out_features=1)
    def forward(self, x):
        # Pass the input through the convolutional and max pooling layers
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        # Flatten the output of the convolutional layers
        x = x.view(x.size(0), -1)
        # Pass the flattened output through the dense layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
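The flattened size of 200 × 96 only works out if the input is 12,000 positions long (three max-poolings of 5 take 12,000 → 2,400 → 480 → 96). A quick shape check with a dummy batch confirms the dimensions line up:
model = CNN()
dummy = torch.randn(2, 10, 12000)  # (batch, channels, sequence length)
print(model(dummy).shape)  # torch.Size([2, 1])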
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
# Define the environment
class CRISPREnv(gym.Env):
    def __init__(self):
        self.action_space = gym.spaces.Discrete(4)  # 4 possible actions: A, C, G, T
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(4,))  # one-hot encoding of the last written base
    def reset(self):
        # Start each episode with a random sequence of 20 nucleotides, to be rewritten base by base
        self.sequence = np.random.choice(['A', 'C', 'G', 'T'], size=20)
        self.position = 0
        return np.zeros(4, dtype=np.float32)
    def step(self, action):
        # Write the chosen nucleotide at the current position
        self.sequence[self.position] = ['A', 'C', 'G', 'T'][action]
        self.position += 1
        # Toy reward: 1 whenever the agent writes a G
        reward = 1.0 if self.sequence[self.position - 1] == 'G' else 0.0
        # Check if the episode is done (all 20 positions written)
        done = (self.position == 20)
        # Observation: a one-hot encoding of the base just written
        observation = np.zeros(4, dtype=np.float32)
        observation[action] = 1.0
        return observation, reward, done, {}
# Define the agent
class CRISPRAgent(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 16)
        self.fc2 = nn.Linear(16, 4)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # logits over the four bases
# Train the agent with a simple REINFORCE-style update
# (the original MSE between action and reward carries no useful gradient)
env = CRISPREnv()
agent = CRISPRAgent()
optimizer = torch.optim.Adam(agent.parameters())
for episode in range(10000):
    done = False
    observation = env.reset()
    while not done:
        # Sample an action from the agent's policy
        logits = agent(torch.from_numpy(observation))
        dist = torch.distributions.Categorical(logits=logits)
        action = dist.sample()
        # Take the action and observe the environment's response
        next_observation, reward, done, _ = env.step(action.item())
        # Push up the log-probability of rewarded actions
        loss = -dist.log_prob(action) * reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update the observation
        observation = next_observation
# Generate synthetic Cas9 CRISPR gRNA encoding data
sequences = []
for _ in range(10000):
    observation = env.reset()
    done = False
    while not done:
        # Act greedily with the trained agent
        with torch.no_grad():
            action = agent(torch.from_numpy(observation)).argmax().item()
        observation, reward, done, _ = env.step(action)
    sequences.append(''.join(env.sequence))
# Save the synthetic data to a file
with open('synthetic_data.txt', 'w') as f:
    for sequence in sequences:
        f.write(sequence + '\n')
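Because the toy reward only pays out for writing G, a quick sanity check on the generated sequences is their overall G-content; a trained agent should push it well above the 25% expected by chance:
g_fraction = sum(seq.count('G') for seq in sequences) / sum(len(seq) for seq in sequences)
print(f"Fraction of G in generated sequences: {g_fraction:.2f}")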
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer
# Define the agent network
# (transformers has no DNABertModel class, so the checkpoint is loaded
# generically; the model id is again a placeholder for a DNABERT-style encoder)
class Agent(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("dsmiles/dnabert-base")
        self.dnabert = AutoModel.from_pretrained("dsmiles/dnabert-base")
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 1)
    def forward(self, x):
        # Tokenize the raw DNA string and score it from the [CLS] embedding
        inputs = self.tokenizer(x, return_tensors="pt")
        hidden = self.dnabert(**inputs).last_hidden_state[:, 0]
        x = self.fc1(hidden)
        x = self.fc2(x)
        return x.squeeze(-1)
# Define the environment; the data is assumed to be a DataFrame whose first
# column is a DNA sequence and whose second column is a 0/1 label
class Environment:
    def __init__(self, data):
        self.data = data
        self.index = 0
    def reset(self):
        self.index = 0
    def step(self, action):
        reward = 1 if action == self.data.iloc[self.index, 1] else 0
        self.index += 1
        done = self.index == len(self.data)
        return reward, done
# Define the training loop
def train(agent, env, num_episodes):
    optimizer = optim.Adam(agent.parameters(), lr=1e-3)
    for episode in range(num_episodes):
        env.reset()
        done = False
        while not done:
            # Get the current state (a raw DNA sequence) and its label
            state = env.data.iloc[env.index, 0]
            label = torch.tensor([float(env.data.iloc[env.index, 1])])
            # Score the sequence and threshold it into a binary action
            score = agent(state)
            action = int(torch.sigmoid(score).item() > 0.5)
            # Get the reward
            reward, done = env.step(action)
            # -reward carries no gradient, so use binary cross-entropy
            # against the label as a differentiable surrogate loss
            loss = F.binary_cross_entropy_with_logits(score, label)
            # Update the agent's network
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
# Generate synthetic data
def generate_data(agent, num_samples):
    data = []
    for i in range(num_samples):
        # Get a random DNA sequence
        dna = "ACGT" * np.random.randint(1, 100)
        # Get the agent's prediction
        with torch.no_grad():
            action = int(torch.sigmoid(agent(dna)).item() > 0.5)
        # Add the data to the list
        data.append((dna, action))
    return data
# Main function
if __name__ == "__main__":
    # Load the data
    data = pd.read_csv("data.csv")
    # Create the environment
    env = Environment(data)
    # Create the agent
    agent = Agent()
    # Train the agent
    train(agent, env, 1000)
    # Generate synthetic data
    data = generate_data(agent, 10000)
    # Save the data
    pd.DataFrame(data, columns=["dna_sequence", "prediction"]).to_csv("synthetic_data.csv", index=False)
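As a quick usage check, the trained agent can score a single sequence directly; the 0.5 threshold mirrors the action rule used during training:
with torch.no_grad():
    score = agent("ACGTACGTACGTACGTACGT")
print("Predicted label:", int(torch.sigmoid(score).item() > 0.5))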
import torch
class DNA_Generator(torch.nn.Module):
    def __init__(self, num_actions):
        super().__init__()
        self.num_actions = num_actions
        # Define the LSTM network
        self.lstm = torch.nn.LSTM(input_size=1, hidden_size=64, num_layers=1, batch_first=True)
        # Define the policy head that outputs per-position logits over the bases
        self.policy = torch.nn.Linear(64, num_actions)
    def forward(self, x):
        # Pass the input through the LSTM network
        lstm_out, _ = self.lstm(x)
        # Pass the LSTM output through the policy head
        logits = self.policy(lstm_out)
        # Return the per-position logits (CrossEntropyLoss applies the softmax)
        return logits
# Initialize the DNA generator network
dna_generator = DNA_Generator(num_actions=4) # 4 actions: A, C, G, T
# Define the loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(dna_generator.parameters(), lr=0.01)
# Training loop
for epoch in range(1000):
    # Generate the input data (a random noise sequence of length 20)
    input_data = torch.rand((1, 20, 1))
    # Toy target sequence encoded as base indices (0=A, 1=C, 2=G, 3=T);
    # a real target would be a known Cas9 CRISPR gRNA encoding
    target_data = torch.tensor([1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1])
    # Forward pass through the DNA generator network
    logits = dna_generator(input_data)
    # Compute the loss
    loss = loss_fn(logits.squeeze(0), target_data)
    # Backpropagate and update the network parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Generate a synthetic DNA sequence using the trained DNA generator network
with torch.no_grad():
    input_data = torch.rand((1, 20, 1))
    logits = dna_generator(input_data)
    # argmax over the base dimension gives one index per position
    synthetic_dna = ''.join(['A', 'C', 'G', 'T'][i] for i in logits.argmax(dim=-1).squeeze(0).tolist())
print("Synthetic DNA sequence:", synthetic_dna)
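Argmax decoding collapses to the single most likely base at every position; to get varied synthetic sequences from the same trained network, one could instead sample each position from the softmax distribution. A minimal sketch:
with torch.no_grad():
    probs = torch.softmax(dna_generator(torch.rand((1, 20, 1))), dim=-1).squeeze(0)
    # Draw one base index per position from its categorical distribution
    sampled = torch.multinomial(probs, num_samples=1).squeeze(-1)
    print(''.join(['A', 'C', 'G', 'T'][i] for i in sampled.tolist()))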