o
    Wi%                     @   s   d Z ddlZddlZddlmZ ddlZddlZddlm	Z	 ddl
m	  mZ ddlmZ dZeejvr;ejde ddlmZ ddlmZ dZd	Zd	Zd	Zd
ZdZejedZdZdZdZ dZ!e"g d#ddddZ$e"g d#ddddZ%G dd de	j&Z'G dd de	j&Z(dS )a0  SVD + PARA wrapper for eval_multistage.py.

Wraps the SVD UNet (with LoRA) + ParaHeadsOnUNet into the interface expected
by eval_multistage.py's PARA code path:
    volume_logits, _, _, feats = model(img_tensor, start_kp)
    gripper_logits, rotation_logits = model.predict_at_pixels(feats, pred_pixels)
    N)Path)Imagez0/data/cameron/vidgen/svd_motion_lora/Motion-LoRA)StableVideoDiffusionPipeline) UNetSpatioTemporalConditionModel@             z1checkpoints/stable-video-diffusion-img2vid-xt-1-1i@  i@        )g
ףp=
?gv/?gCl?   )gZd;O?gy&1?g?c                       s2   e Zd ZdZdeef fdd	ZdddZ  ZS )	ParaHeadsOnUNetz?PARA heads that attach to SVD UNet's up_block_1 and up_block_2.r	   c                    s   t    || _td|d| _td|d| _|d }ttj||dddt tj||dddt tj||dddt | _	t||d| _
t|td| _t|dt d| _d S )Ni   r	   i     r   )padding)super__init__n_height_binsnnConv2dproj_block1proj_block2
SequentialGELUfeature_convsvolume_headN_GRIPPER_BINSgripper_head
N_ROT_BINSrotation_head)selfn_windowr   Zproj_dimD	__class__ =/data/cameron/para_normalized_losses/libero/model_svd_para.pyr   5   s   
zParaHeadsOnUNet.__init__Nc                 C   s2  t }| |}tj|||fddd}| |}tj|||fddd}tj||gdd}| |}| |}d  }	}
|d ur|j	d }|d d df 
 d|d }|d d df 
 d|d }tj||jd}| | }||d d ||f }	| | }||d d ||f }||dt}
||	|
|fS )	NbilinearF)sizemodealign_cornersr	   )dimr   devicer   )PARA_OUT_SIZEr   Finterpolater   torchcatr   r   shapelongclamparanger,   r   detachr   viewr   )r   Zfeat_block1Zfeat_block2query_pixelsPf1f2featsvolgripper_logitsrotation_logitsZBTpxpyidxgrip_maprot_map	rot_at_pxr$   r$   r%   forwardG   s(   




  zParaHeadsOnUNet.forward)N)	__name__
__module____qualname____doc__N_HEIGHT_BINSPROJ_DIMr   rF   __classcell__r$   r$   r"   r%   r   2   s    r   c                       sT   e Zd ZdZdeedf fdd	Zdd Zdd	 Zd
d Z	dddZ
dd Z  ZS )SVDParaPredictorz3Wraps SVD UNet + PARA heads for eval_multistage.py.i  Nc           
         sj  t    | _| _t _|d u rttj	 rdnd}| _
t|}|d }td|  tjt|tjd _ j   jd |d }td|  tjt|dd	}ttd
| _ j|d   j  |di  _i  _ fdd}	 jjd |	d  jjd |	d td|  tj| jtjdd _ j| td d S )Ncudacpuunetz[SVDPara] Loading UNet from )torch_dtypeFzpara_checkpoint.ptz"[SVDPara] Loading PARA heads from )map_location)r    
para_headsstatsc                    s    fdd}|S )Nc                    s(   t |tr	|d n|  j < d S )Nr   )
isinstancetupler6   floatcaptured)moduleinpout)namer   r$   r%   hook_fn   s   (z=SVDParaPredictor.__init__.<locals>.make_hook.<locals>.hook_fnr$   )r]   r^   r   )r]   r%   	make_hook   s   z,SVDParaPredictor.__init__.<locals>.make_hookr	   
up_block_1r   
up_block_2z%[SVDPara] Building SVD pipeline from Zfp16)rQ   rR   variantz[SVDPara] Ready.) r   r   target_size	pred_sizeN_WINDOWr    r0   r,   rO   is_available_devicer   printr   from_pretrainedstrfloat16rQ   evalrequires_grad_loadr   torT   load_state_dictgetrU   rY   Z	up_blocksregister_forward_hookr   pipe)
r   checkpoint_dirrd   re   Zsvd_baser,   Zunet_dirZpara_ckpt_pathZ	para_ckptr`   r"   r_   r%   r   l   sB   



zSVDParaPredictor.__init__c                 C   sr   t |j}t|j}|| | }|dd}|d ddd  }|d t	j
}t|ttf}|S )zBConvert ImageNet-normalized (1,3,H,W) tensor to PIL for SVD input.r   r	   r      )IMAGENET_MEANrp   r,   IMAGENET_STDr4   permuterP   numpyastypenpuint8r   	fromarrayresize	SVD_WIDTH
SVD_HEIGHT)r   
img_tensormeanstdimgpilr$   r$   r%   _denorm_to_pil   s   zSVDParaPredictor._denorm_to_pilc              	   K   s   | j   | |}t  | j|tttdt	d}W d   n1 s$w   Y  | j d }| j d }t
|jd t}t  | |d| |d| \}	}}}
W d   n1 s\w   Y  |	d}|
dd }|dd|fS )a  Run SVD pipeline to capture UNet features, then run PARA heads.

        Args:
            img_tensor:        (1, 3, H, W) ImageNet-normalized
            start_keypoint_2d: (2,) or (1, 2) current EEF pixel (unused for SVD feature extraction)

        Returns:
            volume_logits: (1, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            None
            None
            feats:         (1, 256, pred_size, pred_size)
           )heightwidthZ
num_framesZdecode_chunk_sizeZnum_inference_stepsNra   rb   r   r	   )rY   clearr   r0   inference_modert   r   r   SVD_NUM_FRAMESSVD_NUM_INFERENCE_STEPSminr2   rf   no_gradrT   	unsqueeze)r   r   start_keypoint_2dkwargsZpil_img_Zfeat1Zfeat2Zn_tr=   Z	feats_allvolume_logitsr<   r$   r$   r%   rF      s(   






&
zSVDParaPredictor.forwardc                 C   s   |j d }|j d }| j}|d  d|d }|d  d|d }tj||jd|d||}| j	
| }	|	|dd||f }
| j	| }||dd||f }|||dt}|
|fS )aj  Index gripper/rotation heads at specified pixels.

        Args:
            feats:        (1, 256, pred_size, pred_size) from forward()
            pred_pixels:  (1, N_WINDOW, 2) pixel coords [x, y] in pred_size space

        Returns:
            gripper_logits:  (1, N_WINDOW, N_GRIPPER_BINS)
            rotation_logits: (1, N_WINDOW, 3, N_ROT_BINS)
        r   r	   ).r   ).r	   r+   Nr   )r2   re   r3   r4   r0   r5   r,   r7   expandrT   r   r6   r   r   )r   r<   pred_pixelsBNr9   r@   rA   	batch_idxrC   r>   rD   rE   r?   r$   r$   r%   predict_at_pixels   s   

 z"SVDParaPredictor.predict_at_pixelsTc                 C   s   dS )z@No-op: weights are loaded in __init__ from checkpoint directory.Nr$   )r   
state_dictstrictr$   r$   r%   rq      s   z SVDParaPredictor.load_state_dictc                 C   s   | j || _ || _| S )z?Move PARA heads to device. UNet/pipeline are already on device.)rT   rp   rh   )r   r,   r$   r$   r%   rp      s   zSVDParaPredictor.to)T)rG   rH   rI   rJ   r-   SVD_BASE_MODELr   r   rF   r   rq   rp   rM   r$   r$   r"   r%   rN   i   s    7,
rN   ))rJ   sysospathlibr   rz   r|   r0   torch.nnr   torch.nn.functional
functionalr.   PILr   ZSVD_ROOTpathinsertZsvd.pipelinesr   Z
svd.modelsr   r-   rK   r   r   rL   rf   joinr   r   r   r   r   tensorr7   rw   rx   Moduler   rN   r$   r$   r$   r%   <module>   s:    
7