Spaces:
Sleeping
Sleeping
Fixes
Browse files
- .gitignore +3 -1
- app.py +20 -7
- config.py +13 -1
- scaler.pkl +0 -0
.gitignore
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
.venv
|
| 2 |
-
flagged
|
|
|
|
|
|
|
|
|
| 1 |
.venv
|
| 2 |
+
flagged
|
| 3 |
+
*.tif
|
| 4 |
+
*.tiff
|
app.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
-
from io import BytesIO
|
| 2 |
import os
|
|
|
|
| 3 |
import re
|
| 4 |
import PIL.Image
|
| 5 |
import pandas as pd
|
| 6 |
import numpy as np
|
| 7 |
import gradio as gr
|
| 8 |
from datasets import load_dataset
|
| 9 |
-
import infer
|
| 10 |
import matplotlib.pyplot as plt
|
| 11 |
from sklearn.manifold import TSNE
|
| 12 |
from sklearn.preprocessing import LabelEncoder
|
|
@@ -15,8 +14,10 @@ from torch import nn
|
|
| 15 |
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
| 16 |
from huggingface_hub import PyTorchModelHubMixin
|
| 17 |
from pinecone import Pinecone
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP
|
| 20 |
|
| 21 |
# We need this for the eco layers because they are too big
|
| 22 |
PIL.Image.MAX_IMAGE_PIXELS = None
|
|
@@ -52,15 +53,15 @@ classification_model = DNASeqClassifier.from_pretrained(
|
|
| 52 |
BertConfig(vocab_size=259, output_hidden_states=True),
|
| 53 |
),
|
| 54 |
)
|
|
|
|
|
|
|
| 55 |
|
| 56 |
embeddings_model.eval()
|
| 57 |
classification_model.eval()
|
| 58 |
|
| 59 |
# Load datasets
|
| 60 |
-
ecolayers_ds = load_dataset(DATASETS["ecolayers"])
|
| 61 |
amazon_ds = load_dataset(DATASETS["amazon"])
|
| 62 |
|
| 63 |
-
|
| 64 |
def set_default_inputs():
|
| 65 |
return (DEFAULT_INPUTS["dna_sequence"],
|
| 66 |
DEFAULT_INPUTS["latitude"],
|
|
@@ -99,7 +100,6 @@ def tokenize(dna_sequence: str) -> dict[str, torch.Tensor]:
|
|
| 99 |
return tokenizer(dna_seq_preprocessed, return_tensors="pt")
|
| 100 |
|
| 101 |
|
| 102 |
-
|
| 103 |
def get_embedding(dna_sequence: str) -> torch.Tensor:
|
| 104 |
dna_embedding: torch.Tensor = embeddings_model(
|
| 105 |
**tokenize(dna_sequence)
|
|
@@ -126,7 +126,20 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
|
|
| 126 |
|
| 127 |
if method == "fine_tuned_model":
|
| 128 |
bert_inputs = tokenize(dna_sequence)
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
temperature = 0.2
|
| 131 |
probs = torch.softmax(logits / temperature, dim=1).squeeze()
|
| 132 |
top_k = torch.topk(probs, 10)
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import pickle
|
| 3 |
import re
|
| 4 |
import PIL.Image
|
| 5 |
import pandas as pd
|
| 6 |
import numpy as np
|
| 7 |
import gradio as gr
|
| 8 |
from datasets import load_dataset
|
|
|
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
from sklearn.manifold import TSNE
|
| 11 |
from sklearn.preprocessing import LabelEncoder
|
|
|
|
| 14 |
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
| 15 |
from huggingface_hub import PyTorchModelHubMixin
|
| 16 |
from pinecone import Pinecone
|
| 17 |
+
import rasterio
|
| 18 |
+
from rasterio.sample import sample_gen
|
| 19 |
|
| 20 |
+
from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP, LAYER_NAMES
|
| 21 |
|
| 22 |
# We need this for the eco layers because they are too big
|
| 23 |
PIL.Image.MAX_IMAGE_PIXELS = None
|
|
|
|
| 53 |
BertConfig(vocab_size=259, output_hidden_states=True),
|
| 54 |
),
|
| 55 |
)
|
| 56 |
+
with open("scaler.pkl", "rb") as f:
|
| 57 |
+
scaler = pickle.load(f)
|
| 58 |
|
| 59 |
embeddings_model.eval()
|
| 60 |
classification_model.eval()
|
| 61 |
|
| 62 |
# Load datasets
|
|
|
|
| 63 |
amazon_ds = load_dataset(DATASETS["amazon"])
|
| 64 |
|
|
|
|
| 65 |
def set_default_inputs():
|
| 66 |
return (DEFAULT_INPUTS["dna_sequence"],
|
| 67 |
DEFAULT_INPUTS["latitude"],
|
|
|
|
| 100 |
return tokenizer(dna_seq_preprocessed, return_tensors="pt")
|
| 101 |
|
| 102 |
|
|
|
|
| 103 |
def get_embedding(dna_sequence: str) -> torch.Tensor:
|
| 104 |
dna_embedding: torch.Tensor = embeddings_model(
|
| 105 |
**tokenize(dna_sequence)
|
|
|
|
| 126 |
|
| 127 |
if method == "fine_tuned_model":
|
| 128 |
bert_inputs = tokenize(dna_sequence)
|
| 129 |
+
|
| 130 |
+
env_data = []
|
| 131 |
+
for layer in LAYER_NAMES:
|
| 132 |
+
with rasterio.open(layer) as dataset:
|
| 133 |
+
# Get the corresponding ecological values for the samples
|
| 134 |
+
results = sample_gen(dataset, [coords])
|
| 135 |
+
results = [r for r in results]
|
| 136 |
+
layer_data = np.mean(results[0])
|
| 137 |
+
env_data.append(layer_data)
|
| 138 |
+
|
| 139 |
+
env_data = scaler.transform([env_data])
|
| 140 |
+
env_data = torch.from_numpy(env_data).to(torch.float32)
|
| 141 |
+
|
| 142 |
+
logits = classification_model(bert_inputs, env_data)
|
| 143 |
temperature = 0.2
|
| 144 |
probs = torch.softmax(logits / temperature, dim=1).squeeze()
|
| 145 |
top_k = torch.topk(probs, 10)
|
config.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
|
|
@@ -24,6 +25,17 @@ MODELS = {
|
|
| 24 |
}
|
| 25 |
|
| 26 |
DATASETS = {
|
| 27 |
-
"ecolayers": "LofiAmazon/Global-Ecolayers",
|
| 28 |
"amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
|
| 29 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
import json
|
| 3 |
|
| 4 |
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
DATASETS = {
|
|
|
|
| 28 |
"amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
|
| 29 |
}
|
| 30 |
+
|
| 31 |
+
HUGGINGFACE_DW_URL = "https://huggingface.co/datasets/LofiAmazon/Global-Ecolayers/resolve/main/{filename}?download=true"
|
| 32 |
+
|
| 33 |
+
LAYER_NAMES = [
|
| 34 |
+
"median_elevation_1km.tiff",
|
| 35 |
+
"human_footprint.tiff",
|
| 36 |
+
"population_density_1km.tif",
|
| 37 |
+
"annual_precipitation.tif",
|
| 38 |
+
"precipitation_seasonality.tif",
|
| 39 |
+
"annual_mean_air_temp.tif",
|
| 40 |
+
"temp_seasonality.tif",
|
| 41 |
+
]
|
scaler.pkl
ADDED
|
Binary file (863 Bytes). View file
|
|
|