#!/bin/sh
export CUDA_VISIBLE_DEVICES=0  # Set the GPU ID. Comment out this line to use all GPUs with DDP.

# Train the commonly used VGG19-based encoder-decoder model on NWPU-Crowd.
# Change `--dataset` to `sha`, `shb`, or `qnrf` to train on ShanghaiTech A, ShanghaiTech B, or UCF-QNRF instead.
python trainer.py \
    --model vgg19_ae --input_size 448 --reduction 8 --truncation 4 --anchor_points average \
    --dataset nwpu \
    --count_loss dmcount &&

# Train the CLIP-EBC (ResNet50) model on ShanghaiTech A. Use `--dataset shb` to train on ShanghaiTech B.
# To enable sliding-window prediction with a window size and stride of 448, insert the following line
# before `--count_loss`:
#     --sliding_window --window_size 448 --stride 448 \
python trainer.py \
    --model clip_resnet50 --input_size 448 --reduction 8 --truncation 4 --anchor_points average --prompt_type word \
    --dataset sha \
    --count_loss dmcount &&

# Train the CLIP-EBC (ViT-B/16) model on UCF-QNRF, using VPT during training and sliding-window prediction during testing.
# By default, VPT uses 32 tokens per layer; set `--num_vpt` to change the number of tokens.
# Deep visual prompt tuning is used by default; pass `--shallow_vpt` to use shallow visual prompt tuning instead.
# `--amp` enables automatic mixed-precision training.
python trainer.py \
    --model clip_vit_b_16 --input_size 224 --reduction 8 --truncation 4 \
    --dataset qnrf --batch_size 16 --amp \
    --num_crops 2 --sliding_window --window_size 224 --stride 224 --warmup_lr 1e-3 \
    --count_loss dmcount

# Generate results on the NWPU-Crowd test set.
# python test_nwpu.py \
#     --model clip_vit_b_16 --input_size 224 --reduction 8 --truncation 4 --anchor_points average --prompt_type word \
#     --num_vpt 32 --vpt_drop 0.0 --sliding_window --stride 224 \
#     --weight_path ./checkpoints/nwpu/clip_vit_b_16_word_224_8_4_fine_1.0_dmcount/best_rmse_1.pth
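
# Optional sketch: the ViT-B/16 run above, but with shallow VPT and a smaller prompt length.
# The flag names (`--num_vpt`, `--shallow_vpt`, `--vpt_drop`) match those used in the commands above;
# the specific values (8 tokens, 0.1 drop rate) are only illustrative and may need tuning.
# python trainer.py \
#     --model clip_vit_b_16 --input_size 224 --reduction 8 --truncation 4 \
#     --dataset qnrf --batch_size 16 --amp \
#     --num_vpt 8 --shallow_vpt --vpt_drop 0.1 \
#     --num_crops 2 --sliding_window --window_size 224 --stride 224 --warmup_lr 1e-3 \
#     --count_loss dmcount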