o
    @Zi0                     @   s.   d Z ddlZddlmZ G dd dejZdS )zPARA head: map MAR decoder tokens to per-pixel ray (height-bin) logits.

Takes (B, T, S, C) decoder tokens, reshapes to (B, T, H_lat, W_lat, C),
bilinearly upsamples with convs to (B, T, n_bins, H_out, W_out).
    Nc                	       s>   e Zd Z			ddedededef fdd	Zd
d Z  ZS )ParaHead       @   decoder_embed_dimn_binsin_grid_sizeout_sizec                    s   t    || _|| _|| _|}ttj||dddt tj	ddddtj||dddt tj	ddddtj||dddt t||d	| _
d S )N      )padding   bilinearF)scale_factormodealign_corners)super__init__in_gridr	   r   nn
SequentialConv2dGELUUpsample	conv_head)selfr   r   r   r	   mid	__class__ A/data/cameron/vidgen/unified_video_action/simple_uva/para_head.pyr      s    

zParaHead.__init__c                 C   sz   |j \}}}}| j }}||||||ddddd}||| |||}| |}|j \}	}
}}||||
||}|S )z
        dec_tokens: (B, T, S, C) where S = H_lat*W_lat (e.g. 256)
        Returns volume_logits: (B, T, n_bins, H_out, W_out)
        r   r      r   r
   )shaper   viewpermutereshaper   )r   Z
dec_tokensBTSCZH_latZW_latx_r   ZH_outZW_outr   r   r    forward$   s   
 
zParaHead.forward)r   r   r   )__name__
__module____qualname__intr   r,   __classcell__r   r   r   r    r   
   s    r   )__doc__torchtorch.nnr   Moduler   r   r   r   r    <module>   s    