o
     ݱii!                     @   s   d dl mZ d dlZd dlZd dlmZ d dlZd dlmZ d dl	m
Z
 dddZdejd	eejgef d
eejgejf dejfddZdddZdddZdd ZG dd dejZG dd dejZG dd dejZdS )    )CallableN)	rearrange	NormalizeTc                 C   s   t g dg d| dS )z8
    Construct an ImageNet normalization transform.
    )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?)meanstdinplacer   )r    r	   @/data/cameron/vidgen/unified-world-model/models/common/vision.pyget_imagenet_norm   s   r   root_module	predicatefuncreturnc           	         s    | r|| S  fdd| j ddD }|D ]>^ }}| }t|dkr,| d|}t|tjr9|t| }nt||}||}t|tjrO||t|< qt	||| q fdd| j ddD }t|dkskJ | S )a  
    Recursively replace submodules that satisfy a given predicate.

    Args:
        root_module (nn.Module): The root module to process.
        predicate (Callable[[nn.Module], bool]): A function that takes a module as input and
            returns True if the module should be replaced.
        func (Callable[[nn.Module], nn.Module]): A function that takes a module as input and
            returns a new module to replace it.
        **kwargs: Additional keyword arguments to be passed to the ResNet model constructor.
    c                    "   g | ]\}} |r| d qS .split.0kmr   r	   r
   
<listcomp>(       z&replace_submodules.<locals>.<listcomp>T)remove_duplicater   r   c                    r   r   r   r   r   r	   r
   r   <   r   )
named_moduleslenget_submodulejoin
isinstancenn
Sequentialintgetattrsetattr)	r   r   r   module_listparentr   parent_moduleZ
src_moduleZ
tgt_moduler	   r   r
   replace_submodules   s*   




r*   c                 K   sN   t tj| }|dd|i|}t|jj||_|r%t|dd dd d}|S )a*  
    Construct a ResNet model with a custom output embedding dimension and optional batch norm replacement.

    Args:
        name (str): The name of the ResNet architecture to use (e.g., "resnet18", "resnet34", "resnet50").
        embed_dim (int): The dimension of the output embedding.
        weights (Optional[str]): Pre-trained weights to load (e.g., "IMAGENET1K_V1"). If None, no pre-trained weights are used.
        replace_batch_norm (bool, optional): If True, replaces `nn.BatchNorm2d` layers with `nn.GroupNorm` layers. Default is True.
    weightsc                 S   s   t | tjS N)r!   r"   BatchNorm2dxr	   r	   r
   <lambda>U   s    zget_resnet.<locals>.<lambda>c                 S   s   t j| jd | jdS )N   )
num_groupsnum_channels)r"   	GroupNormnum_featuresr.   r	   r	   r
   r0   V   s    )r   r   r   Nr	   )r%   torchvisionmodelsr"   Linearfcin_featuresr*   )name	embed_dimr+   Zreplace_batch_normkwargsr   resnetr	   r	   r
   
get_resnetE   s   
r?   c                 K   s0   t tj| }|dd|i|}td||_|S )a  
    Construct a Vision Transformer (ViT) model with a custom output embedding dimension.

    Args:
        name (str): The name of the ViT architecture to use (e.g., "vit_b_16", "vit_b_32", "vit_l_16", "vit_l_32", "vit_h_14").
        embed_dim (int): The dimension of the output embedding.
        weights (Optional[str]): Pre-trained weights to load (e.g., "IMAGENET1K_V1"). If None, no pre-trained weights are used.
        **kwargs: Additional keyword arguments to be passed to the ViT model constructor.
    r+      Nr	   )r%   r6   r7   r"   r8   heads)r;   r<   r+   r=   r   vitr	   r	   r
   get_vit^   s   
rC   c                 K   s(   t j	dddi|}td| |_|S )a  
    Construct a pretrained CLIP encoder with a custom output embedding dimension.

    Args:
        embed_dim (int): The dimension of the output embedding.
        **kwargs: Additional keyword arguments to be passed to the timm model creation function.
    ,hf_hub:timm/vit_base_patch32_clip_224.openai
pretrainedTr@   N)rD   )timmcreate_modelr"   r8   head)r<   r=   clipr	   r	   r
   get_clipn   s   rJ   c                       :   e Zd ZdZdedef fddZdejfddZ  Z	S )	ResNetImageEncodera  
    Multi-view image encoder using a ResNet backbone.

    The input is expected to be a tensor with shape (B, V, C, T, H, W), where:
      - B is the batch size,
      - V is the number of views,
      - C is the number of channels,
      - T is the number of frames,
      - H and W are the image height and width, respectively.
    The encoder reshapes the input to treat each view and frame as an individual image,
    extracts features using the ResNet model, and then concatenates the features across
    all views and frames.

    Args:
        num_views (int): Number of camera views in the input.
        embed_dim (int): Dimension of the output embedding features.
    	num_viewsr<   c                    ,   t    || _t | _td|dd| _d S )Nresnet18IMAGENET1K_V1r+   )super__init__rM   r   normr?   modelselfrM   r<   	__class__r	   r
   rS         
zResNetImageEncoder.__init__imgsc                 C   s@   |j d d \}}t|d}| | |}t|d||d}|S )N   b v c t h w -> (b v t) c h w(b v t) c -> b (v t c)bv)shaper   rU   rT   )rW   r[   BVfeatsr	   r	   r
   forward   s
   
zResNetImageEncoder.forward
__name__
__module____qualname____doc__r$   rS   torchTensorrf   __classcell__r	   r	   rX   r
   rL   }   s    rL   c                       rK   )	ViTImageEncoderz
    Multi-view image encoder using a Vision Transformer (ViT) backbone.

    Args:
        num_views (int): Number of camera views in the input.
        embed_dim (int): Dimension of the output embedding features.
    rM   r<   c                    rN   )Nvit_b_32rP   rQ   )rR   rS   rM   r   rT   rC   rU   rV   rX   r	   r
   rS      rZ   zViTImageEncoder.__init__r[   c                 C   s   |j d d \}}t|d}| |}| j|}| jj|j d dd}tj||gdd}| j	|}| j
|d d df }t|d||d}|S )	Nr\   r]   r      dimr^   r_   )rb   r   rT   rU   _process_inputclass_tokenexpandrl   catencoderrA   rW   r[   rc   rd   r/   Zbatch_cls_tokenre   r	   r	   r
   rf      s   

zViTImageEncoder.forwardrg   r	   r	   rX   r
   ro      s    ro   c                       s>   e Zd ZdZdededef fddZdejfdd	Z  Z	S )
ViTImagePatchEncodera;  
    Multi-view image patch encoder using a Vision Transformer (ViT) backbone with learnable positional embeddings.

    Args:
        num_views (int): Number of camera views in the input.
        num_frames (int): Number of frames per view.
        embed_dim (int): Dimension of the output embedding features.
    rM   
num_framesr<   c                    sl   t    || _t | _td|dd| _tjt	
d|| d|dd| _tjt	
d|| d|dd| _d S )Nrp   rP   rQ   rr   T)requires_grad)rR   rS   rM   r   rT   rC   rU   r"   	Parameterrl   zeros	pos_shift	pos_scale)rW   rM   r|   r<   rX   r	   r
   rS      s   
zViTImagePatchEncoder.__init__r[   c                 C   s   |j d d \}}t|d}| |}| j|}| jj|j d dd}tj||gdd}| j	|}| j
|}t|d||d}|d| j  | j }|ddS )	Nr\   r]   r   rq   rr   rs   z(b v t) n c -> b (v t) n cr_   )rb   r   rT   rU   ru   rv   rw   rl   rx   ry   rA   r   r   flattenrz   r	   r	   r
   rf      s   

zViTImagePatchEncoder.forwardrg   r	   r	   rX   r
   r{      s    	r{   )T)NTr,   )typingr   rF   rl   torch.nnr"   r6   einopsr   Ztorchvision.transformsr   r   Moduleboolr*   r?   rC   rJ   rL   ro   r{   r	   r	   r	   r
   <module>   s.    
	

1
!"