@@ -769,11 +769,14 @@ def forward_intermediates(
769769 # split prefix (e.g. class, distill) and spatial feature tokens
770770 prefix_tokens = [y [:, 0 :self .num_prefix_tokens ] for y in intermediates ]
771771 intermediates = [y [:, self .num_prefix_tokens :] for y in intermediates ]
772+ else :
773+ prefix_tokens = None
774+
772775 if reshape :
773776 # reshape to BCHW output format
774777 H , W = self .patch_embed .dynamic_feat_size ((height , width ))
775778 intermediates = [y .reshape (B , H , W , - 1 ).permute (0 , 3 , 1 , 2 ).contiguous () for y in intermediates ]
776- if not torch .jit .is_scripting () and return_prefix_tokens :
779+ if not torch .jit .is_scripting () and return_prefix_tokens and prefix_tokens is not None :
777780 # return_prefix not support in torchscript due to poor type handling
778781 intermediates = list (zip (intermediates , prefix_tokens ))
779782
@@ -1889,17 +1892,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
18891892 mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD , num_classes = 0 ),
18901893
18911894 'vit_base_patch32_siglip_256.v2_webli' : _cfg (
1892- # hf_hub_id='timm/',
1895+ hf_hub_id = 'timm/' ,
18931896 input_size = (3 , 256 , 256 ),
18941897 num_classes = 0 ),
18951898 'vit_base_patch16_siglip_224.v2_webli' : _cfg (
1896- # hf_hub_id='timm/',
1899+ hf_hub_id = 'timm/' ,
18971900 num_classes = 0 ),
18981901 'vit_base_patch16_siglip_224.webli' : _cfg (
18991902 hf_hub_id = 'timm/' ,
19001903 num_classes = 0 ),
19011904 'vit_base_patch16_siglip_256.v2_webli' : _cfg (
1902- # hf_hub_id='timm/',
1905+ hf_hub_id = 'timm/' ,
19031906 input_size = (3 , 256 , 256 ),
19041907 num_classes = 0 ),
19051908 'vit_base_patch16_siglip_256.webli' : _cfg (
@@ -1911,49 +1914,49 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
19111914 input_size = (3 , 256 , 256 ),
19121915 num_classes = 0 ),
19131916 'vit_base_patch16_siglip_384.v2_webli' : _cfg (
1914- # hf_hub_id='timm/',
1917+ hf_hub_id = 'timm/' ,
19151918 input_size = (3 , 384 , 384 ),
19161919 num_classes = 0 ),
19171920 'vit_base_patch16_siglip_384.webli' : _cfg (
19181921 hf_hub_id = 'timm/' ,
19191922 input_size = (3 , 384 , 384 ),
19201923 num_classes = 0 ),
19211924 'vit_base_patch16_siglip_512.v2_webli' : _cfg (
1922- # hf_hub_id='timm/',
1925+ hf_hub_id = 'timm/' ,
19231926 input_size = (3 , 512 , 512 ),
19241927 num_classes = 0 ),
19251928 'vit_base_patch16_siglip_512.webli' : _cfg (
19261929 hf_hub_id = 'timm/' ,
19271930 input_size = (3 , 512 , 512 ),
19281931 num_classes = 0 ),
19291932 'vit_large_patch16_siglip_256.v2_webli' : _cfg (
1930- # hf_hub_id='timm/',
1933+ hf_hub_id = 'timm/' ,
19311934 input_size = (3 , 256 , 256 ),
19321935 num_classes = 0 ),
19331936 'vit_large_patch16_siglip_256.webli' : _cfg (
19341937 hf_hub_id = 'timm/' ,
19351938 input_size = (3 , 256 , 256 ),
19361939 num_classes = 0 ),
19371940 'vit_large_patch16_siglip_384.v2_webli' : _cfg (
1938- # hf_hub_id='timm/',
1941+ hf_hub_id = 'timm/' ,
19391942 input_size = (3 , 384 , 384 ),
19401943 num_classes = 0 ),
19411944 'vit_large_patch16_siglip_384.webli' : _cfg (
19421945 hf_hub_id = 'timm/' ,
19431946 input_size = (3 , 384 , 384 ),
19441947 num_classes = 0 ),
19451948 'vit_large_patch16_siglip_512.v2_webli' : _cfg (
1946- # hf_hub_id='timm/',
1949+ hf_hub_id = 'timm/' ,
19471950 input_size = (3 , 512 , 512 ),
19481951 num_classes = 0 ),
19491952 'vit_so400m_patch14_siglip_224.v2_webli' : _cfg (
1950- # hf_hub_id='timm/',
1953+ hf_hub_id = 'timm/' ,
19511954 num_classes = 0 ),
19521955 'vit_so400m_patch14_siglip_224.webli' : _cfg (
19531956 hf_hub_id = 'timm/' ,
19541957 num_classes = 0 ),
19551958 'vit_so400m_patch14_siglip_378.v2_webli' : _cfg (
1956- # hf_hub_id='timm/',
1959+ hf_hub_id = 'timm/' ,
19571960 input_size = (3 , 378 , 378 ),
19581961 num_classes = 0 ),
19591962 'vit_so400m_patch14_siglip_378.webli' : _cfg (
@@ -1965,42 +1968,42 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
19651968 input_size = (3 , 384 , 384 ),
19661969 num_classes = 0 ),
19671970 'vit_so400m_patch16_siglip_256.v2_webli' : _cfg (
1968- # hf_hub_id='timm/',
1971+ hf_hub_id = 'timm/' ,
19691972 input_size = (3 , 256 , 256 ),
19701973 num_classes = 0 ),
19711974 'vit_so400m_patch16_siglip_256.webli_i18n' : _cfg (
19721975 hf_hub_id = 'timm/' ,
19731976 input_size = (3 , 256 , 256 ),
19741977 num_classes = 0 ),
19751978 'vit_so400m_patch16_siglip_384.v2_webli' : _cfg (
1976- # hf_hub_id='timm/',
1979+ hf_hub_id = 'timm/' ,
19771980 input_size = (3 , 384 , 384 ),
19781981 num_classes = 0 ),
19791982 'vit_so400m_patch16_siglip_512.v2_webli' : _cfg (
1980- # hf_hub_id='timm/',
1983+ hf_hub_id = 'timm/' ,
19811984 input_size = (3 , 512 , 512 ),
19821985 num_classes = 0 ),
19831986 'vit_giantopt_patch16_siglip_256.v2_webli' : _cfg (
1984- # hf_hub_id='timm/',
1987+ hf_hub_id = 'timm/' ,
19851988 input_size = (3 , 256 , 256 ),
19861989 num_classes = 0 ),
19871990 'vit_giantopt_patch16_siglip_384.v2_webli' : _cfg (
1988- # hf_hub_id='timm/',
1991+ hf_hub_id = 'timm/' ,
19891992 input_size = (3 , 384 , 384 ),
19901993 num_classes = 0 ),
19911994
19921995 'vit_base_patch32_siglip_gap_256.v2_webli' : _cfg (
1993- # hf_hub_id='timm/',
1996+ hf_hub_id = 'timm/' ,
19941997 input_size = (3 , 256 , 256 ),
19951998 num_classes = 0 ),
19961999 'vit_base_patch16_siglip_gap_224.v2_webli' : _cfg (
1997- # hf_hub_id='timm/',
2000+ hf_hub_id = 'timm/' ,
19982001 num_classes = 0 ),
19992002 'vit_base_patch16_siglip_gap_224.webli' : _cfg (
20002003 hf_hub_id = 'timm/' ,
20012004 num_classes = 0 ),
20022005 'vit_base_patch16_siglip_gap_256.v2_webli' : _cfg (
2003- # hf_hub_id='timm/',
2006+ hf_hub_id = 'timm/' ,
20042007 input_size = (3 , 256 , 256 ),
20052008 num_classes = 0 ),
20062009 'vit_base_patch16_siglip_gap_256.webli' : _cfg (
@@ -2012,43 +2015,43 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
20122015 input_size = (3 , 256 , 256 ),
20132016 num_classes = 0 ),
20142017 'vit_base_patch16_siglip_gap_384.v2_webli' : _cfg (
2015- # hf_hub_id='timm/',
2018+ hf_hub_id = 'timm/' ,
20162019 input_size = (3 , 384 , 384 ),
20172020 num_classes = 0 ),
20182021 'vit_base_patch16_siglip_gap_384.webli' : _cfg (
20192022 hf_hub_id = 'timm/' ,
20202023 input_size = (3 , 384 , 384 ),
20212024 num_classes = 0 ),
20222025 'vit_base_patch16_siglip_gap_512.v2_webli' : _cfg (
2023- # hf_hub_id='timm/',
2026+ hf_hub_id = 'timm/' ,
20242027 input_size = (3 , 512 , 512 ),
20252028 num_classes = 0 ),
20262029 'vit_base_patch16_siglip_gap_512.webli' : _cfg (
20272030 hf_hub_id = 'timm/' ,
20282031 input_size = (3 , 512 , 512 ),
20292032 num_classes = 0 ),
20302033 'vit_large_patch16_siglip_gap_256.v2_webli' : _cfg (
2031- # hf_hub_id='timm/',
2034+ hf_hub_id = 'timm/' ,
20322035 input_size = (3 , 256 , 256 ),
20332036 num_classes = 0 ),
20342037 'vit_large_patch16_siglip_gap_256.webli' : _cfg (
20352038 hf_hub_id = 'timm/' ,
20362039 input_size = (3 , 256 , 256 ),
20372040 num_classes = 0 ),
20382041 'vit_large_patch16_siglip_gap_384.v2_webli' : _cfg (
2039- # hf_hub_id='timm/',
2042+ hf_hub_id = 'timm/' ,
20402043 input_size = (3 , 384 , 384 ),
20412044 num_classes = 0 ),
20422045 'vit_large_patch16_siglip_gap_384.webli' : _cfg (
20432046 hf_hub_id = 'timm/' ,
20442047 input_size = (3 , 384 , 384 ),
20452048 num_classes = 0 ),
20462049 'vit_large_patch16_siglip_gap_512.v2_webli' : _cfg (
2047- # hf_hub_id='timm/',
2050+ hf_hub_id = 'timm/' ,
20482051 input_size = (3 , 512 , 512 ),
20492052 num_classes = 0 ),
20502053 'vit_so400m_patch14_siglip_gap_224.v2_webli' : _cfg (
2051- # hf_hub_id='timm/',
2054+ hf_hub_id = 'timm/' ,
20522055 num_classes = 0 ),
20532056 'vit_so400m_patch14_siglip_gap_224.webli' : _cfg (
20542057 hf_hub_id = 'timm/' ,
@@ -2071,7 +2074,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
20712074 # custom_load='hf',
20722075 # num_classes=0),
20732076 'vit_so400m_patch14_siglip_gap_378.v2_webli' : _cfg (
2074- # hf_hub_id='timm/',
2077+ hf_hub_id = 'timm/' ,
20752078 input_size = (3 , 378 , 378 ),
20762079 num_classes = 0 ),
20772080 'vit_so400m_patch14_siglip_gap_378.webli' : _cfg (
@@ -2147,27 +2150,27 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
21472150 # input_size=(3, 896, 896), crop_pct=1.0,
21482151 # num_classes=0),
21492152 'vit_so400m_patch16_siglip_gap_256.v2_webli' : _cfg (
2150- # hf_hub_id='timm/',
2153+ hf_hub_id = 'timm/' ,
21512154 input_size = (3 , 256 , 256 ),
21522155 num_classes = 0 ),
21532156 'vit_so400m_patch16_siglip_gap_256.webli_i18n' : _cfg (
21542157 hf_hub_id = 'timm/' ,
21552158 input_size = (3 , 256 , 256 ),
21562159 num_classes = 0 ),
21572160 'vit_so400m_patch16_siglip_gap_384.v2_webli' : _cfg (
2158- # hf_hub_id='timm/',
2161+ hf_hub_id = 'timm/' ,
21592162 input_size = (3 , 384 , 384 ),
21602163 num_classes = 0 ),
21612164 'vit_so400m_patch16_siglip_gap_512.v2_webli' : _cfg (
2162- # hf_hub_id='timm/',
2165+ hf_hub_id = 'timm/' ,
21632166 input_size = (3 , 512 , 512 ),
21642167 num_classes = 0 ),
21652168 'vit_giantopt_patch16_siglip_gap_256.v2_webli' : _cfg (
2166- # hf_hub_id='timm/',
2169+ hf_hub_id = 'timm/' ,
21672170 input_size = (3 , 256 , 256 ),
21682171 num_classes = 0 ),
21692172 'vit_giantopt_patch16_siglip_gap_384.v2_webli' : _cfg (
2170- # hf_hub_id='timm/',
2173+ hf_hub_id = 'timm/' ,
21712174 input_size = (3 , 384 , 384 ),
21722175 num_classes = 0 ),
21732176
0 commit comments