Spaces:
Running
Running
Bachmann Roman Christian
committed on
Commit
·
758da21
1
Parent(s):
100478a
Changed number of tokens to percentages of tokens
Browse files
app.py
CHANGED
|
@@ -292,6 +292,11 @@ def plot_predictions(input_dict, preds, masks, image_size=224):
|
|
| 292 |
|
| 293 |
|
| 294 |
def inference(img, num_tokens, manual_mode, num_rgb, num_depth, num_semseg, seed):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
im = Image.open(img)
|
| 296 |
|
| 297 |
# Center crop and resize RGB
|
|
@@ -367,7 +372,7 @@ title = "MultiMAE"
|
|
| 367 |
description = "Gradio demo for MultiMAE: Multi-modal Multi-task Masked Autoencoders. \
|
| 368 |
Upload your own images or try one of the examples below to explore the multi-modal masked reconstruction of a pre-trained MultiMAE model. \
|
| 369 |
Uploaded images are pseudo labeled using a DPT trained on Omnidata depth, and a Mask2Former trained on COCO. \
|
| 370 |
-
Choose the
|
| 371 |
|
| 372 |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.01678' \
|
| 373 |
target='_blank'>MultiMAE: Multi-modal Multi-task Masked Autoencoders</a> | \
|
|
@@ -381,20 +386,20 @@ os.system("wget https://i.imgur.com/KTKgYKi.jpg")
|
|
| 381 |
os.system("wget https://i.imgur.com/lWYuRI7.jpg")
|
| 382 |
|
| 383 |
examples = [
|
| 384 |
-
['c9ObJdK.jpg',
|
| 385 |
-
['KTKgYKi.jpg',
|
| 386 |
-
['lWYuRI7.jpg',
|
| 387 |
]
|
| 388 |
|
| 389 |
gr.Interface(
|
| 390 |
fn=inference,
|
| 391 |
inputs=[
|
| 392 |
gr.inputs.Image(label='RGB input image', type='filepath'),
|
| 393 |
-
gr.inputs.Slider(label='
|
| 394 |
gr.inputs.Checkbox(label='Manual mode: Check this to manually set the number of input tokens per modality using the sliders below', default=False),
|
| 395 |
-
gr.inputs.Slider(label='
|
| 396 |
-
gr.inputs.Slider(label='
|
| 397 |
-
gr.inputs.Slider(label='
|
| 398 |
gr.inputs.Number(label='Random seed: Change this to sample different masks (for manual mode only)', default=0),
|
| 399 |
],
|
| 400 |
outputs=[
|
|
|
|
| 292 |
|
| 293 |
|
| 294 |
def inference(img, num_tokens, manual_mode, num_rgb, num_depth, num_semseg, seed):
|
| 295 |
+
num_tokens = int(588 * num_tokens / 100.0)
|
| 296 |
+
num_rgb = int(196 * num_rgb / 100.0)
|
| 297 |
+
num_depth = int(196 * num_depth / 100.0)
|
| 298 |
+
num_semseg = int(196 * num_semseg / 100.0)
|
| 299 |
+
|
| 300 |
im = Image.open(img)
|
| 301 |
|
| 302 |
# Center crop and resize RGB
|
|
|
|
| 372 |
description = "Gradio demo for MultiMAE: Multi-modal Multi-task Masked Autoencoders. \
|
| 373 |
Upload your own images or try one of the examples below to explore the multi-modal masked reconstruction of a pre-trained MultiMAE model. \
|
| 374 |
Uploaded images are pseudo labeled using a DPT trained on Omnidata depth, and a Mask2Former trained on COCO. \
|
| 375 |
+
Choose the percentage of visible tokens using the sliders below and see how MultiMAE reconstructs the modalities!"
|
| 376 |
|
| 377 |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.01678' \
|
| 378 |
target='_blank'>MultiMAE: Multi-modal Multi-task Masked Autoencoders</a> | \
|
|
|
|
| 386 |
os.system("wget https://i.imgur.com/lWYuRI7.jpg")
|
| 387 |
|
| 388 |
examples = [
|
| 389 |
+
['c9ObJdK.jpg', 15, False, 15, 15, 15, 0],
|
| 390 |
+
['KTKgYKi.jpg', 15, False, 15, 15, 15, 0],
|
| 391 |
+
['lWYuRI7.jpg', 15, False, 15, 15, 15, 0],
|
| 392 |
]
|
| 393 |
|
| 394 |
gr.Interface(
|
| 395 |
fn=inference,
|
| 396 |
inputs=[
|
| 397 |
gr.inputs.Image(label='RGB input image', type='filepath'),
|
| 398 |
+
gr.inputs.Slider(label='Percentage of input tokens', default=15, step=0.1, minimum=0, maximum=100),
|
| 399 |
gr.inputs.Checkbox(label='Manual mode: Check this to manually set the number of input tokens per modality using the sliders below', default=False),
|
| 400 |
+
gr.inputs.Slider(label='Percentage of RGB input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
|
| 401 |
+
gr.inputs.Slider(label='Percentage of depth input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
|
| 402 |
+
gr.inputs.Slider(label='Percentage of semantic input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
|
| 403 |
gr.inputs.Number(label='Random seed: Change this to sample different masks (for manual mode only)', default=0),
|
| 404 |
],
|
| 405 |
outputs=[
|