Blake Gage
Published

Generative AI Techniques For High Quality DNA Sequence Synthesis

Generative AI Techniques For High Quality DNA Sequence Synthesis for CRISPR Edit Detection. That said, I have no idea if these scripts work.

Advanced · Work in progress · 10 hours · 26

Things used in this project

Hardware components

Minisforum Venus UM790 Pro with AMD Ryzen™ 9
Minisforum Venus UM790 Pro with AMD Ryzen™ 9
×1

Software apps and online services

Windows 10
Microsoft Windows 10

Story

Read more

Code

evaluate.py

Python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Fetch the pretrained DNA-BERT tokenizer and its seq2seq model weights.
tokenizer = AutoTokenizer.from_pretrained("dsmilkov/dnabert-base")
model = AutoModelForSeq2SeqLM.from_pretrained("dsmilkov/dnabert-base")

# Wrap the model and tokenizer in the RL agent.
# NOTE(review): DRL_Agent is neither defined nor imported in this script —
# confirm where it is supposed to come from.
agent = DRL_Agent(model, tokenizer)

# How many synthetic CRISPR gRNA encoding sequences to produce per round.
num_sequences = 100

# Round one: ask the agent for `num_sequences` synthetic sequences.
synthetic_sequences = [agent.generate_sequence() for _ in range(num_sequences)]

# Feed the generated sequences back through DNABert and decode its output.
# NOTE(review): the tokenizer is called without padding, so this presumably
# requires all generated sequences to tokenize to the same length — confirm.
encoded = tokenizer(synthetic_sequences, return_tensors="pt")
predictions = tokenizer.batch_decode(
    model.generate(encoded.input_ids),
    skip_special_tokens=True,
)

# A decoded prediction containing "N" marks that sequence as an anomalous
# DNA edit site; only its index is recorded.
anomalous_sites = [
    index for index, prediction in enumerate(predictions) if "N" in prediction
]

# Retrain the RL agent. It receives the list of indices gathered above, not
# the sequences themselves.
agent.retrain(anomalous_sites)

# Round two: generate a fresh batch with the retrained agent.
new_synthetic_sequences = [
    agent.generate_sequence() for _ in range(num_sequences)
]

generate.py

Python
import numpy as np
import pandas as pd
import random

# Define the parameters for the synthetic data
num_samples = 1000  # Number of DNA samples to generate
seq_length = 1000  # Length of each DNA sequence
num_crispr_sites = 10  # Number of CRISPR-Cas9 binding sites per sequence

# Candidate CRISPR-Cas9 binding-site patterns. Only the LENGTH of the chosen
# pattern is used below (each 'N' stands for "any nucleotide"), so a pattern
# of eight N's means "extract an 8-nt window".
# This must be a list of whole patterns: list('NNNNNNNN') would split the
# pattern into eight 1-character strings and every extracted site would be a
# single nucleotide.
possible_sites = ['NNNNNNNN']

# Generate the random DNA sequences. random.choices draws all `seq_length`
# nucleotides in one call instead of one random.choice call per base.
dna_sequences = [
    ''.join(random.choices('ATCG', k=seq_length)) for _ in range(num_samples)
]

# For each sequence, extract `num_crispr_sites` windows at random positions.
crispr_sites = []
for sequence in dna_sequences:
    sites = []
    for _ in range(num_crispr_sites):
        # Pick the pattern first so the start bound can use its real length.
        site = random.choice(possible_sites)

        # randint is inclusive on both ends, so the last valid start is the
        # one whose window ends exactly at the end of the sequence.
        start = random.randint(0, seq_length - len(site))

        # Record the actual nucleotides found in that window.
        sites.append(sequence[start:start + len(site)])

    crispr_sites.append(sites)

# Create a pandas DataFrame with the DNA sequences and CRISPR-Cas9 binding sites
df = pd.DataFrame({'dna_sequence': dna_sequences, 'crispr_sites': crispr_sites})

cnn.py

Python
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNN(nn.Module):
    """1D convolutional network producing one sigmoid score per input.

    Three conv + max-pool stages are followed by three dense layers. Every
    convolution uses "same" padding, so only the pooling layers shorten the
    sequence: each divides the length by 5 (rounding down).

    Args:
        in_channels: Number of channels in the input tensor. Defaults to 10,
            the value that was previously hard-coded.
        seq_length: Length of the input along its last axis. Defaults to
            12000, which yields the previously hard-coded flattened size of
            ``200 * 96``.

    Raises:
        ValueError: If ``seq_length`` is too short to survive the three
            pooling stages (fewer than 125 positions).
    """

    def __init__(self, in_channels: int = 10, seq_length: int = 12000):
        super().__init__()

        # Three MaxPool1d(5) stages => floor-divide the length by 5 each time.
        pooled_length = seq_length // 5 // 5 // 5
        if pooled_length < 1:
            raise ValueError(
                f"seq_length must be at least 125, got {seq_length}"
            )

        # 1D convolutional layers (padding keeps the length unchanged)
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=30, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=30, out_channels=100, kernel_size=5, padding=2)
        self.conv3 = nn.Conv1d(in_channels=100, out_channels=200, kernel_size=7, padding=3)

        # Max pooling layers
        self.pool1 = nn.MaxPool1d(kernel_size=5)
        self.pool2 = nn.MaxPool1d(kernel_size=5)
        self.pool3 = nn.MaxPool1d(kernel_size=5)

        # Dense layers; fc1 consumes the flattened (200 channels x pooled length) map
        self.fc1 = nn.Linear(in_features=200 * pooled_length, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=10)
        self.fc3 = nn.Linear(in_features=10, out_features=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network on a batch.

        Args:
            x: Tensor of shape ``(batch, in_channels, seq_length)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        # Pass the input through the convolutional and max pooling layers
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))

        # Flatten everything except the batch axis
        x = x.view(x.size(0), -1)

        # Pass the flattened output through the dense layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))

dna_drl.py

Python
import torch
import numpy as np
import gym

# Define the environment
class CRISPREnv(gym.Env):
    """Toy environment that builds a 20-nucleotide sequence one base at a time.

    Each step overwrites the nucleotide at the current position with the one
    chosen by the action and pays a reward of 1 when that nucleotide is 'G'.
    An episode ends once all 20 positions have been written.

    NOTE(review): ``observation_space`` is declared as a Box of shape (4,),
    but ``reset``/``step`` return the whole 20-element array of nucleotide
    letters — confirm which one callers are meant to rely on.
    """

    def __init__(self):
        self.action_space = gym.spaces.Discrete(4)  # 4 possible actions: A, C, G, T
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(4,))  # 4 features: A, C, G, T
        # Write cursor into the sequence; reset() rewinds it for each episode.
        self.position = 0

    def reset(self):
        """Start a new episode.

        Returns:
            The freshly drawn random sequence (numpy array of 20 letters).
        """
        # Initialize the environment with a random sequence of 20 nucleotides
        self.sequence = np.random.choice(['A', 'C', 'G', 'T'], size=20)
        # Rewind the cursor; without this step() has no position to write to
        # and a second episode would start past the end of the sequence.
        self.position = 0
        return self.sequence

    def step(self, action):
        """Write one nucleotide and advance the cursor.

        Args:
            action: Integer index into ['A', 'C', 'G', 'T'].

        Returns:
            Tuple ``(sequence, reward, done, info)``; ``sequence`` is the same
            array object that is mutated in place on every step.
        """
        # Take an action and update the environment
        self.sequence[self.position] = ['A', 'C', 'G', 'T'][action]
        self.position += 1

        # Reward 1 when the nucleotide just written is 'G', otherwise 0
        reward = 0
        if self.sequence[self.position - 1] == 'G':
            reward = 1

        # The episode is done once all 20 positions have been written
        done = (self.position == 20)

        # Return the observation, reward, done, and info
        return self.sequence, reward, done, {}

# Define the agent
class CRISPRAgent(torch.nn.Module):
    """Two-layer network mapping a 4-feature input to 4 output values.

    The layers are referenced through the fully qualified ``torch.nn``
    namespace because this script only imports ``torch`` itself; the short
    aliases ``nn`` and ``F`` are not defined here.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(4, 16)
        self.fc2 = torch.nn.Linear(16, 4)

    def forward(self, x):
        """Return the raw (un-normalised) outputs for ``x``.

        Args:
            x: Float tensor whose last dimension has size 4.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 4.
        """
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Train the agent
NUCLEOTIDES = ['A', 'C', 'G', 'T']


def encode_observation(sequence, index):
    """One-hot encode the nucleotide at ``index`` as a float tensor of size 4.

    The policy network takes 4 input features, while the environment hands
    back the full array of nucleotide letters, so the letters cannot be fed
    to the network directly.

    NOTE(review): encoding only the nucleotide at the current position is an
    assumption chosen to match the 4-feature input — confirm the intended
    observation.
    """
    one_hot = torch.zeros(len(NUCLEOTIDES))
    one_hot[NUCLEOTIDES.index(sequence[index])] = 1.0
    return one_hot


env = CRISPREnv()
agent = CRISPRAgent()
optimizer = torch.optim.Adam(agent.parameters())

for episode in range(10000):
    done = False
    observation = env.reset()
    step_index = 0  # position the next action will write to

    while not done:
        # Turn the network's 4 outputs into a distribution over nucleotides
        # and sample an integer action the environment can index with.
        logits = agent(encode_observation(observation, step_index))
        policy = torch.distributions.Categorical(logits=logits)
        action = policy.sample()

        # Take the action and observe the environment's response
        next_observation, reward, done, _ = env.step(action.item())

        # Policy-gradient update: raise the log-probability of rewarded
        # actions. (The reward is a plain int, so it cannot be a loss target
        # by itself; it only scales the differentiable log-probability.)
        loss = -policy.log_prob(action) * reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update the observation
        observation = next_observation
        step_index += 1

# Generate synthetic pcas-9 CRISPR gRNA encoding data
sequences = []
for _ in range(10000):
    observation = env.reset()
    done = False
    step_index = 0

    while not done:
        # No gradients are needed while sampling from the trained policy
        with torch.no_grad():
            logits = agent(encode_observation(observation, step_index))
        action = torch.distributions.Categorical(logits=logits).sample()

        # Take the action and keep the updated sequence as the observation
        observation, reward, done, _ = env.step(action.item())
        step_index += 1

    # Record each finished sequence once, after its episode completes
    sequences.append(''.join(observation))

# Save the synthetic data to a file, one sequence per line
with open('synthetic_data.txt', 'w') as f:
    f.writelines(sequence + '\n' for sequence in sequences)


import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DNABertModel

# Define the agent network
class Agent(nn.Module):
    """DNABert encoder followed by two linear layers producing one value.

    NOTE(review): there is no activation between ``fc1`` and ``fc2``, so the
    two layers compose into a single affine map — confirm this is intended.
    """

    def __init__(self):
        super().__init__()
        self.dnabert = DNABertModel.from_pretrained("dsmiles/dnabert-base")
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Encode ``x`` with DNABert and project it to a single output.

        Only the entry at index 0 along the second axis of the encoder's
        ``last_hidden_state`` is used (presumably the leading special token —
        confirm against the tokenizer).
        """
        encoder_output = self.dnabert(x)
        first_position = encoder_output.last_hidden_state[:, 0]
        return self.fc2(self.fc1(first_position))

# Define the environment
class Environment:
    """Walks through ``data`` one entry at a time, rewarding matching actions.

    ``data`` is indexed as ``data[i][1]`` to obtain the value an action is
    compared against, so it must support integer indexing and ``len()``.
    """

    def __init__(self, data):
        self.data = data
        self.reset()

    def reset(self):
        """Rewind the cursor to the first entry."""
        self.index = 0

    def step(self, action):
        """Score ``action`` against the current entry and advance the cursor.

        Returns:
            Tuple ``(reward, done)``: ``reward`` is 1 when ``action`` equals
            the second element of the current entry and 0 otherwise;
            ``done`` is True once the cursor has moved past the last entry.
        """
        expected = self.data[self.index][1]
        reward = 1 if action == expected else 0
        self.index += 1
        return reward, self.index == len(self.data)

# Define the training loop
def train(agent, env, num_episodes):
    """Train ``agent`` on ``env`` with a REINFORCE-style policy gradient.

    The reward returned by ``env.step`` is a plain Python int and carries no
    gradient, so it cannot be back-propagated directly. Instead the agent's
    output is treated as the logit of a Bernoulli policy: a binary action is
    sampled, scored by the environment, and the log-probability of that
    action (which IS differentiable) is weighted by the reward.

    NOTE(review): this assumes the agent returns a single logit per state and
    that the labels stored in ``env.data`` are 0/1 — confirm against the data.

    Args:
        agent: ``torch.nn.Module`` mapping a state to a single-element tensor.
        env: Object exposing ``data``, ``index``, ``reset()`` and
            ``step(action) -> (reward, done)``.
        num_episodes: Number of full passes over the environment.
    """
    optimizer = optim.Adam(agent.parameters(), lr=1e-3)
    for episode in range(num_episodes):
        env.reset()
        done = False
        while not done:
            # Get the current state
            state = env.data[env.index][0]

            # Sample a binary action from the agent's Bernoulli policy
            logit = agent(state).squeeze()
            policy = torch.distributions.Bernoulli(logits=logit)
            action = policy.sample()

            # The environment compares against stored labels, so hand it a
            # plain int rather than a tensor
            reward, done = env.step(int(action.item()))

            # Update the agent's network: raise the log-probability of
            # actions that earned a reward
            optimizer.zero_grad()
            loss = -policy.log_prob(action) * reward
            loss.backward()
            optimizer.step()

# Generate synthetic data
def generate_data(agent, num_samples):
    """Pair random DNA sequences with the agent's output for each of them.

    Each sequence is drawn uniformly from the letters A, C, G, T. Its length
    is a multiple of 4 between 4 and 396, the same lengths the previous
    ``"ACGT" * k`` construction produced — but the content is now actually
    random instead of always being the repeated motif "ACGT".

    Args:
        agent: Callable taking a DNA string and returning a prediction.
        num_samples: Number of (sequence, prediction) pairs to produce.

    Returns:
        List of ``(dna, action)`` tuples, in generation order.
    """
    # Local import: this script does not import a random-number module at the
    # top of the file.
    import random

    data = []
    for _ in range(num_samples):
        # Get a random DNA sequence (randint is inclusive: 1..99 repeats of 4)
        length = 4 * random.randint(1, 99)
        dna = "".join(random.choices("ACGT", k=length))

        # Get the agent's prediction and store it next to its input
        data.append((dna, agent(dna)))

    return data

# Main function
if __name__ == "__main__":
    # Local import: this script does not import pandas at the top of the file.
    import pandas as pd

    # Load the data as a list of plain row tuples. Environment and train()
    # index the data as data[i][0] / data[i][1]; on a DataFrame, data[i] is a
    # COLUMN lookup and would raise KeyError.
    # NOTE(review): assumes the first CSV column is the state and the second
    # the expected action — confirm against data.csv.
    data = list(pd.read_csv("data.csv").itertuples(index=False, name=None))

    # Create the environment
    env = Environment(data)

    # Create the agent
    agent = Agent()

    # Train the agent
    train(agent, env, 1000)

    # Generate synthetic data
    data = generate_data(agent, 10000)

    # Save the data
    pd.DataFrame(data).to_csv("synthetic_data.csv")

dna_gen.py

Python
import torch
import numpy as np

class DNA_Generator(torch.nn.Module):
    """Single-layer LSTM with a linear head scoring each action per time step.

    Args:
        num_actions: Size of the output's last dimension (one score per
            possible action).
    """

    def __init__(self, num_actions):
        super().__init__()
        self.num_actions = num_actions

        # Recurrent body: one scalar feature per time step, batch-first layout
        self.lstm = torch.nn.LSTM(input_size=1, hidden_size=64, num_layers=1, batch_first=True)

        # Policy head: maps each 64-dim LSTM output to `num_actions` scores
        self.policy = torch.nn.Linear(64, num_actions)

    def forward(self, x):
        """Return per-time-step action scores for ``x``.

        The scores come straight from the linear head; no softmax is applied
        inside this module.
        """
        # The LSTM returns (outputs, state); only the outputs are used
        per_step_outputs = self.lstm(x)[0]
        return self.policy(per_step_outputs)

# Initialize the DNA generator network
dna_generator = DNA_Generator(num_actions=4)  # 4 actions: A, C, G, T

# Define the loss function and optimizer. CrossEntropyLoss expects raw scores,
# which is what the generator's linear head returns.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(dna_generator.parameters(), lr=0.01)

# Target data (correct PCA-9 CRISPR gRNA sequence), one class index per time
# step. It never changes, so it is built once outside the training loop.
target_data = torch.tensor([1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1])

# Training loop
for epoch in range(1000):
    # Generate the input data: uniform random floats in [0, 1), shaped
    # (batch=1, steps=20, features=1)
    input_data = torch.rand((1, 20, 1))

    # Forward pass through the DNA generator network -> (1, 20, 4) scores
    action_probs = dna_generator(input_data)

    # Compute the loss; drop the batch axis so scores are (20, 4) against the
    # 20 target indices
    loss = loss_fn(action_probs.squeeze(0), target_data)

    # Backpropagate and update the network parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Generate a synthetic DNA sequence using the trained DNA generator network
input_data = torch.rand((1, 20, 1))
with torch.no_grad():  # inference only
    action_probs = dna_generator(input_data)

# Pick the best-scoring action at every time step. The argmax must run over
# the LAST axis (the 4 actions); reducing over axis 1 would instead return
# time-step indices up to 19, which are not valid nucleotide indices.
best_actions = action_probs.argmax(dim=-1).squeeze(0).tolist()
synthetic_dna = ''.join(['A', 'C', 'G', 'T'][i] for i in best_actions)

print("Synthetic DNA sequence:", synthetic_dna)

Credits

Blake Gage
1 project • 0 followers

Comments