@@ -624,12 +624,7 @@ def __init__(
624624 if rope_type == 'mixed' :
625625 rope_kwargs .update (dict (depth = depth ))
626626 self .rope_mixed = True
627- elif rope_type == 'dinov3' :
628- rope_kwargs .update (dict (
629- grid_offset = rope_grid_offset ,
630- ref_feat_shape = ref_feat_shape ,
631- ))
632- else : # 'cat' or 'base'
627+ elif rope_type == 'cat' :
633628 rope_kwargs .update (dict (
634629 in_pixels = False ,
635630 grid_offset = rope_grid_offset ,
@@ -1558,160 +1553,148 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
15581553 # RoPE-ViT models from Naver
15591554 'vit_small_patch16_rope_224.naver_in1k' : _cfg (
15601555 hf_hub_id = 'timm/' ,
1561- mean = IMAGENET_DEFAULT_MEAN ,
1562- std = IMAGENET_DEFAULT_STD ,
1556+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15631557 license = 'apache-2.0' ,
15641558 ),
15651559 'vit_base_patch16_rope_224.naver_in1k' : _cfg (
15661560 hf_hub_id = 'timm/' ,
1567- mean = IMAGENET_DEFAULT_MEAN ,
1568- std = IMAGENET_DEFAULT_STD ,
1561+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15691562 license = 'apache-2.0' ,
15701563 ),
15711564 'vit_large_patch16_rope_224.naver_in1k' : _cfg (
15721565 hf_hub_id = 'timm/' ,
1573- mean = IMAGENET_DEFAULT_MEAN ,
1574- std = IMAGENET_DEFAULT_STD ,
1566+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15751567 license = 'apache-2.0' ,
15761568 ),
15771569 'vit_small_patch16_rope_mixed_224.naver_in1k' : _cfg (
15781570 hf_hub_id = 'timm/' ,
1579- mean = IMAGENET_DEFAULT_MEAN ,
1580- std = IMAGENET_DEFAULT_STD ,
1571+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15811572 license = 'apache-2.0' ,
15821573 ),
15831574 'vit_base_patch16_rope_mixed_224.naver_in1k' : _cfg (
15841575 hf_hub_id = 'timm/' ,
1585- mean = IMAGENET_DEFAULT_MEAN ,
1586- std = IMAGENET_DEFAULT_STD ,
1576+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15871577 license = 'apache-2.0' ,
15881578 ),
15891579 'vit_large_patch16_rope_mixed_224.naver_in1k' : _cfg (
15901580 hf_hub_id = 'timm/' ,
1591- mean = IMAGENET_DEFAULT_MEAN ,
1592- std = IMAGENET_DEFAULT_STD ,
1581+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15931582 license = 'apache-2.0' ,
15941583 ),
15951584 'vit_small_patch16_rope_ape_224.naver_in1k' : _cfg (
15961585 hf_hub_id = 'timm/' ,
1597- mean = IMAGENET_DEFAULT_MEAN ,
1598- std = IMAGENET_DEFAULT_STD ,
1586+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
15991587 license = 'apache-2.0' ,
16001588 ),
16011589 'vit_base_patch16_rope_ape_224.naver_in1k' : _cfg (
16021590 hf_hub_id = 'timm/' ,
1603- mean = IMAGENET_DEFAULT_MEAN ,
1604- std = IMAGENET_DEFAULT_STD ,
1591+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
16051592 license = 'apache-2.0' ,
16061593 ),
16071594 'vit_large_patch16_rope_ape_224.naver_in1k' : _cfg (
16081595 hf_hub_id = 'timm/' ,
1609- mean = IMAGENET_DEFAULT_MEAN ,
1610- std = IMAGENET_DEFAULT_STD ,
1596+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
16111597 license = 'apache-2.0' ,
16121598 ),
16131599 'vit_small_patch16_rope_mixed_ape_224.naver_in1k' : _cfg (
16141600 hf_hub_id = 'timm/' ,
1615- mean = IMAGENET_DEFAULT_MEAN ,
1616- std = IMAGENET_DEFAULT_STD ,
1601+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
16171602 license = 'apache-2.0' ,
16181603 ),
16191604 'vit_base_patch16_rope_mixed_ape_224.naver_in1k' : _cfg (
16201605 hf_hub_id = 'timm/' ,
1621- mean = IMAGENET_DEFAULT_MEAN ,
1622- std = IMAGENET_DEFAULT_STD ,
1606+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
16231607 license = 'apache-2.0' ,
16241608 ),
16251609 'vit_large_patch16_rope_mixed_ape_224.naver_in1k' : _cfg (
16261610 hf_hub_id = 'timm/' ,
1627- mean = IMAGENET_DEFAULT_MEAN ,
1628- std = IMAGENET_DEFAULT_STD ,
1611+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
16291612 license = 'apache-2.0' ,
16301613 ),
16311614
16321615 # DINOv3 weights are under a specific license with redistribution terms, please see
16331616 # https://github.com/facebookresearch/dinov3/blob/main/LICENSE.md
16341617 'vit_small_patch16_dinov3_224.lvdm_1689m' : _cfg (
16351618 # hf_hub_id='timm/',
1636- mean = IMAGENET_DEFAULT_MEAN ,
1637- std = IMAGENET_DEFAULT_STD ,
1619+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1620+ crop_pct = 1.0 ,
16381621 num_classes = 0 ,
16391622 license = 'dinov3' ,
16401623 ),
16411624 'vit_small_patch16_dinov3_qkvb_224.lvdm_1689m' : _cfg (
16421625 # hf_hub_id='timm/',
1643- mean = IMAGENET_DEFAULT_MEAN ,
1644- std = IMAGENET_DEFAULT_STD ,
1626+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1627+ crop_pct = 1.0 ,
16451628 num_classes = 0 ,
16461629 license = 'dinov3' ,
16471630 ),
16481631 'vit_small_plus_patch16_dinov3_224.lvdm_1689m' : _cfg (
16491632 # hf_hub_id='timm/',
1650- mean = IMAGENET_DEFAULT_MEAN ,
1651- std = IMAGENET_DEFAULT_STD ,
1633+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1634+ crop_pct = 1.0 ,
16521635 num_classes = 0 ,
16531636 license = 'dinov3' ,
16541637 ),
16551638 'vit_small_plus_patch16_dinov3_qkvb_224.lvdm_1689m' : _cfg (
16561639 # hf_hub_id='timm/',
1657- mean = IMAGENET_DEFAULT_MEAN ,
1658- std = IMAGENET_DEFAULT_STD ,
1640+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1641+ crop_pct = 1.0 ,
16591642 num_classes = 0 ,
16601643 license = 'dinov3' ,
16611644 ),
16621645 'vit_base_patch16_dinov3_224.lvdm_1689m' : _cfg (
16631646 #hf_hub_id='timm/',
1664- mean = IMAGENET_DEFAULT_MEAN ,
1665- std = IMAGENET_DEFAULT_STD ,
1647+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1648+ crop_pct = 1.0 ,
16661649 num_classes = 0 ,
16671650 license = 'dinov3' ,
16681651 ),
16691652 'vit_base_patch16_dinov3_qkvb_224.lvdm_1689m' : _cfg (
16701653 #hf_hub_id='timm/',
1671- mean = IMAGENET_DEFAULT_MEAN ,
1672- std = IMAGENET_DEFAULT_STD ,
1654+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1655+ crop_pct = 1.0 ,
16731656 num_classes = 0 ,
16741657 license = 'dinov3' ,
16751658 ),
16761659 'vit_large_patch16_dinov3_224.lvdm_1689m' : _cfg (
16771660 # hf_hub_id='timm/',
1678- mean = IMAGENET_DEFAULT_MEAN ,
1679- std = IMAGENET_DEFAULT_STD ,
1661+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1662+ crop_pct = 1.0 ,
16801663 num_classes = 0 ,
16811664 license = 'dinov3' ,
16821665 ),
16831666 'vit_large_patch16_dinov3_qkvb_224.lvdm_1689m' : _cfg (
16841667 # hf_hub_id='timm/',
1685- mean = IMAGENET_DEFAULT_MEAN ,
1686- std = IMAGENET_DEFAULT_STD ,
1668+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1669+ crop_pct = 1.0 ,
16871670 num_classes = 0 ,
16881671 license = 'dinov3' ,
16891672 ),
16901673 'vit_large_patch16_dinov3_224.sat_493m' : _cfg (
16911674 # hf_hub_id='timm/',
1692- mean = (0.430 , 0.411 , 0.296 ),
1693- std = ( 0.213 , 0.156 , 0.143 ) ,
1675+ mean = (0.430 , 0.411 , 0.296 ), std = ( 0.213 , 0.156 , 0.143 ),
1676+ crop_pct = 1.0 ,
16941677 num_classes = 0 ,
16951678 license = 'dinov3' ,
16961679 ),
16971680 'vit_huge_plus_patch16_dinov3_224.lvdm_1689m' : _cfg (
16981681 # hf_hub_id='timm/',
1699- mean = IMAGENET_DEFAULT_MEAN ,
1700- std = IMAGENET_DEFAULT_STD ,
1682+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1683+ crop_pct = 1.0 ,
17011684 num_classes = 0 ,
17021685 license = 'dinov3' ,
17031686 ),
17041687 'vit_7b_patch16_dinov3_224.lvdm_1689m' : _cfg (
17051688 # hf_hub_id='timm/',
1706- mean = IMAGENET_DEFAULT_MEAN ,
1707- std = IMAGENET_DEFAULT_STD ,
1689+ mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD ,
1690+ crop_pct = 1.0 ,
17081691 num_classes = 0 ,
17091692 license = 'dinov3' ,
17101693 ),
17111694 'vit_7b_patch16_dinov3_224.sat_493m' : _cfg (
17121695 # hf_hub_id='timm/',
1713- mean = (0.430 , 0.411 , 0.296 ),
1714- std = ( 0.213 , 0.156 , 0.143 ) ,
1696+ mean = (0.430 , 0.411 , 0.296 ), std = ( 0.213 , 0.156 , 0.143 ),
1697+ crop_pct = 1.0 ,
17151698 num_classes = 0 ,
17161699 license = 'dinov3' ,
17171700 ),
0 commit comments