Spaces:
Running
Running
Bachmann Roman Christian
committed on
Commit
·
758da21
1
Parent(s):
100478a
Changed number of tokens to percentages of tokens
Browse files
app.py
CHANGED
|
@@ -292,6 +292,11 @@ def plot_predictions(input_dict, preds, masks, image_size=224):
|
|
| 292 |
|
| 293 |
|
| 294 |
def inference(img, num_tokens, manual_mode, num_rgb, num_depth, num_semseg, seed):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
im = Image.open(img)
|
| 296 |
|
| 297 |
# Center crop and resize RGB
|
|
@@ -367,7 +372,7 @@ title = "MultiMAE"
|
|
| 367 |
description = "Gradio demo for MultiMAE: Multi-modal Multi-task Masked Autoencoders. \
|
| 368 |
Upload your own images or try one of the examples below to explore the multi-modal masked reconstruction of a pre-trained MultiMAE model. \
|
| 369 |
Uploaded images are pseudo labeled using a DPT trained on Omnidata depth, and a Mask2Former trained on COCO. \
|
| 370 |
-
Choose the
|
| 371 |
|
| 372 |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.01678' \
|
| 373 |
target='_blank'>MultiMAE: Multi-modal Multi-task Masked Autoencoders</a> | \
|
|
@@ -381,20 +386,20 @@ os.system("wget https://i.imgur.com/KTKgYKi.jpg")
|
|
| 381 |
os.system("wget https://i.imgur.com/lWYuRI7.jpg")
|
| 382 |
|
| 383 |
examples = [
|
| 384 |
-
['c9ObJdK.jpg',
|
| 385 |
-
['KTKgYKi.jpg',
|
| 386 |
-
['lWYuRI7.jpg',
|
| 387 |
]
|
| 388 |
|
| 389 |
gr.Interface(
|
| 390 |
fn=inference,
|
| 391 |
inputs=[
|
| 392 |
gr.inputs.Image(label='RGB input image', type='filepath'),
|
| 393 |
-
gr.inputs.Slider(label='
|
| 394 |
gr.inputs.Checkbox(label='Manual mode: Check this to manually set the number of input tokens per modality using the sliders below', default=False),
|
| 395 |
-
gr.inputs.Slider(label='
|
| 396 |
-
gr.inputs.Slider(label='
|
| 397 |
-
gr.inputs.Slider(label='
|
| 398 |
gr.inputs.Number(label='Random seed: Change this to sample different masks (for manual mode only)', default=0),
|
| 399 |
],
|
| 400 |
outputs=[
|
|
|
|
| 292 |
|
| 293 |
|
| 294 |
def inference(img, num_tokens, manual_mode, num_rgb, num_depth, num_semseg, seed):
|
| 295 |
+
num_tokens = int(588 * num_tokens / 100.0)
|
| 296 |
+
num_rgb = int(196 * num_rgb / 100.0)
|
| 297 |
+
num_depth = int(196 * num_depth / 100.0)
|
| 298 |
+
num_semseg = int(196 * num_semseg / 100.0)
|
| 299 |
+
|
| 300 |
im = Image.open(img)
|
| 301 |
|
| 302 |
# Center crop and resize RGB
|
|
|
|
| 372 |
description = "Gradio demo for MultiMAE: Multi-modal Multi-task Masked Autoencoders. \
|
| 373 |
Upload your own images or try one of the examples below to explore the multi-modal masked reconstruction of a pre-trained MultiMAE model. \
|
| 374 |
Uploaded images are pseudo labeled using a DPT trained on Omnidata depth, and a Mask2Former trained on COCO. \
|
| 375 |
+
Choose the percentage of visible tokens using the sliders below and see how MultiMAE reconstructs the modalities!"
|
| 376 |
|
| 377 |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.01678' \
|
| 378 |
target='_blank'>MultiMAE: Multi-modal Multi-task Masked Autoencoders</a> | \
|
|
|
|
| 386 |
os.system("wget https://i.imgur.com/lWYuRI7.jpg")
|
| 387 |
|
| 388 |
examples = [
|
| 389 |
+
['c9ObJdK.jpg', 15, False, 15, 15, 15, 0],
|
| 390 |
+
['KTKgYKi.jpg', 15, False, 15, 15, 15, 0],
|
| 391 |
+
['lWYuRI7.jpg', 15, False, 15, 15, 15, 0],
|
| 392 |
]
|
| 393 |
|
| 394 |
gr.Interface(
|
| 395 |
fn=inference,
|
| 396 |
inputs=[
|
| 397 |
gr.inputs.Image(label='RGB input image', type='filepath'),
|
| 398 |
+
gr.inputs.Slider(label='Percentage of input tokens', default=15, step=0.1, minimum=0, maximum=100),
|
| 399 |
gr.inputs.Checkbox(label='Manual mode: Check this to manually set the number of input tokens per modality using the sliders below', default=False),
|
| 400 |
+
gr.inputs.Slider(label='Percentage of RGB input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
|
| 401 |
+
gr.inputs.Slider(label='Percentage of depth input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
|
| 402 |
+
gr.inputs.Slider(label='Percentage of semantic input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
|
| 403 |
gr.inputs.Number(label='Random seed: Change this to sample different masks (for manual mode only)', default=0),
|
| 404 |
],
|
| 405 |
outputs=[
|