@@ -697,6 +697,13 @@ def _cfg(url='', **kwargs):
 
 default_cfgs = generate_default_cfgs({
 
+    # re-finetuned augreg 21k FT on in1k weights
+    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
+    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+
     # How to train your ViT (augreg) weights, pretrained on 21k FT on in1k
     'vit_tiny_patch16_224.augreg_in21k_ft_in1k': _cfg(
         url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
@@ -751,13 +758,6 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
 
-    # re-finetuned augreg 21k FT on in1k weights
-    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
-    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-
     # patch models (weights from official Google JAX impl) pretrained on in21k FT on in1k
     'vit_base_patch16_224.orig_in21k_ft_in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
@@ -802,7 +802,6 @@ def _cfg(url='', **kwargs):
     'vit_giant_patch14_224.untrained': _cfg(url=''),
     'vit_gigantic_patch14_224.untrained': _cfg(url=''),
 
-
     # patch models, imagenet21k (weights from official Google JAX impl)
     'vit_large_patch32_224.orig_in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
@@ -869,7 +868,6 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
 
-
     # ViT ImageNet-21K-P pretraining by MILL
     'vit_base_patch16_224_miil.in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth',
@@ -880,7 +878,7 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'),
 
-    # custom timm variants
+    # Custom timm variants
     'vit_base_patch16_rpn_224.in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth',
         hf_hub_id='timm/'),
@@ -896,52 +894,6 @@ def _cfg(url='', **kwargs):
     'vit_base_patch16_gap_224': _cfg(),
 
     # CLIP pretrained image tower and related fine-tuned weights
-    'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.laion2b': _cfg(
-        #hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
-    'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
-    'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-    'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-
-    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-
     'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
@@ -973,28 +925,52 @@ def _cfg(url='', **kwargs):
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
+    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
-    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
+    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.openai': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.openai': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_large_patch14_clip_224.openai': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
     'vit_base_patch32_clip_224.openai_ft_in1k': _cfg(
         hf_hub_id='timm/',
@@ -1010,30 +986,21 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
 
-    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
-    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
+        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
+    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
     'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
@@ -1042,6 +1009,37 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
+    'vit_base_patch32_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion2b': _cfg(
+        # hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_giant_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+
+    'vit_base_patch32_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_large_patch14_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     # experimental (may be removed)
     'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95),
     'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95),
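Note (not part of the diff): the reorganized entries above are regular pretrained-tag configs, so they are consumed through the usual timm API. A minimal sketch, assuming one of the tag-qualified names added in this diff (e.g. 'vit_base_patch16_clip_224.laion2b_ft_in1k') and a timm version that includes these weights:

    import timm
    from timm.data import resolve_data_config, create_transform

    # Pretrained weights and the matching default_cfg are resolved from the tag.
    model = timm.create_model('vit_base_patch16_clip_224.laion2b_ft_in1k', pretrained=True)
    model.eval()

    # mean/std/crop_pct/input_size come from the _cfg entry defined above.
    data_cfg = resolve_data_config({}, model=model)
    transform = create_transform(**data_cfg)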