File size: 4,400 Bytes
086ca7b
dd2b907
a713c85
 
086ca7b
dd2b907
 
35eb5e1
 
 
 
 
 
 
 
dd2b907
35eb5e1
 
 
dd2b907
35eb5e1
 
086ca7b
35eb5e1
 
 
 
086ca7b
e3148e0
35eb5e1
 
086ca7b
35eb5e1
086ca7b
35eb5e1
086ca7b
 
 
 
 
 
 
35eb5e1
 
e3148e0
 
 
 
 
 
 
35eb5e1
 
e3148e0
 
00539a5
e3148e0
 
 
00539a5
e3148e0
 
 
 
 
717cff1
 
 
 
 
 
 
 
 
 
 
 
 
 
e3148e0
 
 
 
 
 
 
 
00539a5
568fe9a
ca699c2
717cff1
00539a5
e3148e0
 
 
717cff1
 
 
 
e3148e0
 
 
 
a713c85
e3148e0
 
 
 
 
 
 
00539a5
e3148e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717cff1
e3148e0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from typing import List, Dict
import numpy as np
import gradio as gr

import data_utils



def smiles2monomers(smiles: str) -> list[str]:
    """
    Split a molecule given as a SMILES string into its monomers (each also a SMILES string).

    Placeholder: the decomposition is not implemented yet, so the argument is
    ignored and a new empty list is returned on every call.
    """
    monomers: list[str] = []
    return monomers


def generate_monomers(num_monomers: int, monomers_vocab: List[str]) -> list[str]:
    """
    Draw a random sequence of monomers from the given vocabulary.

    Monomers are sampled uniformly and with replacement (the defaults of
    ``np.random.choice``), so the same monomer may appear more than once.

    Args:
        num_monomers: Number of monomers to draw.
        monomers_vocab: Monomer names to sample from.

    Returns:
        A plain Python list with ``num_monomers`` entries taken from ``monomers_vocab``.

    Raises:
        ValueError: Raised by NumPy when ``monomers_vocab`` is empty and ``num_monomers`` > 0.
    """
    # np.random.choice returns an ndarray of NumPy scalars; convert it so the
    # result matches the declared ``list[str]`` and callers get ordinary ``str`` items.
    return np.random.choice(monomers_vocab, num_monomers).tolist()


def monomer2domains(monomer: str, is_start=False, is_final=False) -> list[str]:
    """
    Look up the NRPS module (a list of domains) suggested for a single monomer.

    The work is delegated to ``data_utils.module_generator.suggest_module``;
    its result is returned unchanged.

    Args:
        monomer: The monomer to build a module for.
        is_start: Forwarded to ``suggest_module``; set for the first monomer of the peptide.
        is_final: Forwarded to ``suggest_module``; set for the last monomer of the peptide.

    NOTE(review): the ``list[str]`` return annotation looks stale. Other code in
    this file reads ``'name'`` and ``'sequence'`` keys from each domain, so the
    elements are presumably dicts. Confirm against ``data_utils``.
    """
    return data_utils.module_generator.suggest_module(
        monomer,
        is_start=is_start,
        is_final=is_final,
    )


def monomers2modules(monomer_list: list[str], is_cyclic: bool = False) -> List[List[Dict[str, str]]]:
    """
    Build one NRPS module (a list of domains) for every monomer in the sequence.

    The first monomer is converted with ``is_start=True`` and the last one with
    ``is_final=True``. For a single-monomer sequence both flags are set.

    Args:
        monomer_list: Monomers in peptide order.
        is_cyclic: Currently ignored; it is not yet clear how cyclicity is
            encoded in the NRPS domain sequence.

    Returns:
        One entry per monomer, in input order, each being the result of
        ``monomer2domains`` for that monomer. Empty input gives an empty list.
    """
    # Indices are 0-based, so the final monomer sits at len - 1. Comparing
    # against len itself would never match and no module would be marked final.
    last_index = len(monomer_list) - 1
    return [
        monomer2domains(
            monomer,
            is_start=(index == 0),
            is_final=(index == last_index),
        )
        for index, monomer in enumerate(monomer_list)
    ]


# def find_bacteria(monomers: list[str]) -> list[str]:
#     """
#     Finds bacteria which might produce the target peptide.
#     Input: sequence of possible domains. Each domain is represented as a protein sequence.
#     Output: possible hits from blastp search.
#     """
#     return []


# def letter_counter(word, letter):
#     """Count the occurrences of a specific letter in a word.
    
#     Args:
#         word: The word or phrase to analyze
#         letter: The letter to count occurrences of
        
#     Returns:
#         The number of times the letter appears in the word
#     """
#     return word.lower().count(letter.lower())

def convert_to_fasta(modules_list):
    """
    Serialize modules and their domains into FASTA-formatted text.

    Each domain becomes two lines: a header
    ``>module_<MM>_domain_<DD>_<name>`` (zero-padded, 0-based indices) followed
    by the domain's sequence.

    Args:
        modules_list: Iterable of modules; each module is an iterable of
            mappings with ``'name'`` and ``'sequence'`` keys.

    Returns:
        All records joined with newlines, without a trailing newline.
        An empty string if there are no domains.
    """
    records = []
    for module_index, domains in enumerate(modules_list):
        for domain_index, domain in enumerate(domains):
            domain_name = domain['name']
            domain_sequence = domain['sequence']
            records.append(f">module_{module_index:02d}_domain_{domain_index:02d}_{domain_name}")
            records.append(domain_sequence)
    return '\n'.join(records)


def generate_peptide_monomers(num_monomers: int):
    """Produces the peptide constructed from specified number of monomer fragments.
    Currently the sequence is picked randomly from predefined collection of monomers (aminoacids and their D- isomers).
    
    Args:
        num_monomers: The number of monomer fragments in the resulting 'peptide'
    
    Returns:
        The string which is constructed from specified number of monomer fragments separated by commas, 
        and the data for the corresponding domain sequences (for future searches with blastp, in .fasta format).
        
    """
    # NOTE(review): the app is launched with mcp_server=True, and Gradio's MCP
    # integration is documented to build the tool description from this
    # docstring. Its wording and Args/Returns layout are therefore kept stable.

    # Pipeline: load the vocabulary, sample monomers, then map them to modules.
    vocabulary = data_utils.load_monomers()
    chosen_monomers = generate_monomers(num_monomers, vocabulary)
    modules = monomers2modules(chosen_monomers)

    peptide = ",".join(chosen_monomers)
    return peptide, convert_to_fasta(modules)



if __name__ == "__main__":
    # Build the web UI. Components are created in display order inside the
    # Blocks context: title banner, pointer to the README, the monomer-count
    # slider, and the interface that runs the generator.
    with gr.Blocks(title="NRPS domains 'generator'") as demo:
        gr.Markdown("""# BioCynthia
```
There are bacteria in soil and sea
They have what is called a B-G-C
These genes produce some complex peptides
And they might save our lives!
```
                    """)

        gr.Markdown("For more details on project goals and motivation, please refer to the README.md")

        # Single input: how many monomers the generated peptide should contain.
        monomer_count_slider = gr.Slider(
            label="Number of monomers in the target peptide",
            minimum=2,
            maximum=10,
            step=1,
            value=3,
        )

        # Two text outputs: the comma-separated monomers and the FASTA text.
        gr.Interface(
            fn=generate_peptide_monomers,
            inputs=[monomer_count_slider],
            outputs=["text", "text"],
        )

    # Serve the app; mcp_server=True additionally exposes it as an MCP server.
    demo.launch(mcp_server=True)