o
    _i"                     @   s  d Z ddlZddlZddlmZ ddlZddlZddlm	Z	 ddl
m	  mZ ddlmZ dZeejvr;ejde ddlmZ ddlmZ dZd	Zd
ZejedZdZdZdZdZeg d d
dd
d
Z!eg d d
dd
d
Z"G dd de	j#Z$G dd de	j#Z%dS )a?  SVD + Global Action wrapper for eval_multistage.py.

Wraps the SVD UNet (with LoRA) + GlobalActionHead into the ACT-style interface
expected by eval_multistage.py:
    pos_pred, rot_pred, gripper_pred = model(img_tensor, start_kp,
                                              current_eef_pos=..., current_gripper=...)
    N)Path)Imagez0/data/cameron/vidgen/svd_motion_lora/Motion-LoRA)StableVideoDiffusionPipeline) UNetSpatioTemporalConditionModel          z1checkpoints/stable-video-diffusion-img2vid-xt-1-1i@  i@        )g
ףp=
?gv/?gCl?   )gZd;O?gy&1?g?c                       s.   e Zd ZdZedf fdd	Zdd Z  ZS )GlobalActionHeadz,Global regression head on SVD UNet features.   c                    s   t    td|d| _td|d| _|d }ttj||dddt tj||dddt tj||dddt | _t	d| _
tt||t t||t t|d| _tt||t t|t| _d S )Ni   r   i     r   )padding)super__init__nnConv2dproj_block1proj_block2
SequentialGELUfeature_convsAdaptiveAvgPool2dpoolLinearposition_mlpN_GRIPPER_BINSgripper_mlp)selfZproj_dim
hidden_dimD	__class__ ?/data/cameron/para_normalized_losses/libero/model_svd_global.pyr   2   s&   



zGlobalActionHead.__init__c           
      C   s   d}|  |}tj|||fddd}| |}tj|||fddd}tj||gdd}| |}| |dd}| 	|}| 
|}	||	fS )N@   bilinearF)sizemodealign_cornersr   dim)r   Finterpolater   torchcatr   r   squeezer   r   )
r   Zfeat_block1Zfeat_block2Pf1f2featsZpooledpos_predgrip_logitsr$   r$   r%   forwardI   s   




zGlobalActionHead.forward)__name__
__module____qualname____doc__PROJ_DIMr   r9   __classcell__r$   r$   r"   r%   r   /   s    r   c                       sP   e Zd ZdZdedf fdd	Zdd Z		ddd	ZdddZdd Z	  Z
S )SVDGlobalPredictorzMWraps SVD UNet + GlobalActionHead for eval_multistage.py ACT-style interface.i  Nc           	         s`  t    | _t _|d u rttj rdnd}| _	t
|}|d }td|  tjt|tjd _ j   jd |d }td|  tjt|dd	}t | _ j|d
   j  |di  _i  _ fdd} jjd |d  jjd |d td|  tj| jtjdd _ j| td d S )Ncudacpuunetz[SVDGlobal] Loading UNet from )torch_dtypeFzaction_checkpoint.ptz%[SVDGlobal] Loading action head from )map_locationaction_headstatsc                    s    fdd}|S )Nc                    s(   t |tr	|d n|  j < d S )Nr   )
isinstancetupledetachfloatcaptured)moduleinpout)namer   r$   r%   hook_fn   s   (z?SVDGlobalPredictor.__init__.<locals>.make_hook.<locals>.hook_fnr$   )rP   rQ   r   )rP   r%   	make_hook   s   z.SVDGlobalPredictor.__init__.<locals>.make_hookr   
up_block_1r   
up_block_2z'[SVDGlobal] Building SVD pipeline from Zfp16)rC   rD   variantz[SVDGlobal] Ready.)r   r   target_sizeN_WINDOWn_windowr0   devicerA   is_available_devicer   printr   from_pretrainedstrfloat16rC   evalrequires_grad_loadr   torF   load_state_dictgetrG   rL   Z	up_blocksregister_forward_hookr   pipe)	r   checkpoint_dirrW   Zsvd_baserZ   Zunet_dirZaction_ckpt_pathZaction_ckptrS   r"   rR   r%   r   c   s@   



zSVDGlobalPredictor.__init__c                 C   sr   t |j}t|j}|| | }|dd}|d ddd  }|d t	j
}t|ttf}|S )zBConvert ImageNet-normalized (1,3,H,W) tensor to PIL for SVD input.r   r   r      )IMAGENET_MEANrd   rZ   IMAGENET_STDclamppermuterB   numpyastypenpuint8r   	fromarrayresize	SVD_WIDTH
SVD_HEIGHT)r   
img_tensormeanstdimgpilr$   r$   r%   _denorm_to_pil   s   z!SVDGlobalPredictor._denorm_to_pilc              	   K   s4  | j   | |}t  | j|tttdt	d}W d   n1 s$w   Y  | j d }| j d }	|dd }|	dd }	t
  | ||	\}
}W d   n1 sVw   Y  t|
}|d}tjd| jd|jd	}td
 }|dd|df jdd|ddd|f jdd }|d}|||fS )a  Run SVD pipeline to capture UNet features, then run global action head.

        Args:
            img_tensor:        (1, 3, H, W) ImageNet-normalized
            start_keypoint_2d: (2,) or (1, 2) current EEF pixel (unused for SVD)
            current_eef_pos:   (1, 3) normalized [0,1] EEF position (unused by this head)
            current_gripper:   (1, 1) normalized [0,1] gripper state (unused by this head)

        Returns:
            pos_pred:      (1, N_WINDOW, 3) normalized [0, 1] position
            rot_pred:      (1, N_WINDOW, 3) zeros (rotation not predicted)
            gripper_pred:  (1, N_WINDOW) gripper logits (argmax of binned prediction)
           )heightwidthZ
num_framesZdecode_chunk_sizeZnum_inference_stepsNrT   rU   r   r   r   )rZ   r   r-   r+   )rL   clearr|   r0   inference_moderh   rv   ru   SVD_NUM_FRAMESSVD_NUM_INFERENCE_STEPSno_gradrF   sigmoid	unsqueezezerosrY   rZ   r   sum)r   rw   start_keypoint_2dcurrent_eef_poscurrent_gripperkwargsZpil_img_Zfeat1Zfeat2Zpos_rawr8   r7   rot_predhalfgripper_predr$   r$   r%   r9      s2   








8

zSVDGlobalPredictor.forwardTc                 C   s   dS )z@No-op: weights are loaded in __init__ from checkpoint directory.Nr$   )r   
state_dictstrictr$   r$   r%   re      s   z"SVDGlobalPredictor.load_state_dictc                 C   s   | j || _ || _| S )z@Move action head to device. UNet/pipeline are already on device.)rF   rd   r\   )r   rZ   r$   r$   r%   rd      s   zSVDGlobalPredictor.to)NN)T)r:   r;   r<   r=   SVD_BASE_MODELr   r|   r9   re   rd   r?   r$   r$   r"   r%   r@   `   s    6

9r@   )&r=   sysospathlibr   ro   rq   r0   torch.nnr   torch.nn.functional
functionalr.   PILr   ZSVD_ROOTpathinsertZsvd.pipelinesr   Z
svd.modelsr   r>   r   rX   joinr   rv   ru   r   r   tensorviewrk   rl   Moduler   r@   r$   r$   r$   r%   <module>   s4    
1