o
    i61                     @   s   d Z ddlZddlmZ ddlZddlZddlm  m	Z
 ddlmZ zddlZejd W n ey:   dZY nw dZdd Zd	efd
dZdefddZG dd deZG dd deZdd ZdefddZG dd deZG dd deZdS )zNDROID video clips for simple_uva: 4-frame, 256x256, [-1,1]. Cache or raw MP4s.    N)Path)Datasettorch   c                 C   sz   t dt|| }|d | d }| |k r$tjd| d |dt}|S td| | }tj||| d |dt}|S )N   r   )num)maxroundnplinspaceastypeintrandomrandint)total_frames	video_fps
num_frames
sample_fpsintervalneedindicesstart r   ?/data/cameron/vidgen/unified_video_action/simple_uva/dataset.py_sample_frames   s   r   rootc                 C      t | } t| dS )Nz*.mp4)r   listrglob)r   r   r   r   	find_mp4s      r   	cache_dirc                 C   r   )Nz*.pt)r   sortedglob)r!   r   r   r   find_cached_clips$   r    r$   c                   @   sD   e Zd ZdZedfdededefddZdd	 Zd
d Z	dd Z
dS )CachedClipDatasetzJPre-extracted clips from cache_dir. Uses first NUM_FRAMES frames per clip.   r!   r   max_load_retriesc                 C   s0   t || _|| _|| _| jstd| dd S )NzNo .pt clips in z. Run precache first.)r$   clipsr   r'   FileNotFoundError)selfr!   r   r'   r   r   r   __init__,   s   
zCachedClipDataset.__init__c                 C   
   t | jS N)lenr(   r*   r   r   r   __len__3      
zCachedClipDataset.__len__c                 C   s   z
t j|ddd}W n ttfy(   z
t j|ddd}W n ty%    w Y nw | dkr3|d }|d d d | jf }|dS )NcpuT)map_locationweights_onlyFr&   r   )r   load	TypeErrorEOFError	Exceptiondimr   	unsqueeze)r*   pathoutr   r   r   	_load_one6   s   
zCachedClipDataset._load_onec                 C   s   d }t | jD ]3}|dkr|| t| j n|}z| | j| W   S  tttfy: } z|}W Y d }~qd }~ww td| j d| d| )Nr   zFailed to load any of z clips (last idx=z): )ranger'   r.   r(   r=   r7   OSErrorRuntimeError)r*   idxlast_errattemptier   r   r   __getitem__C   s   zCachedClipDataset.__getitem__N)__name__
__module____qualname____doc__
NUM_FRAMESstrr   r+   r0   r=   rF   r   r   r   r   r%   )   s    r%   c                	   @   s>   e Zd ZeddfdedededefddZd	d
 Zdd Z	dS )DroidVideoDatasetg      @   r   r   r   sizec                 C   sB   t || _|| _|| _|| _t| j| _| jstd| j d S )NzNo .mp4 under )r   r   r   r   rO   r   videosr)   )r*   r   r   r   rO   r   r   r   r+   P   s   
zDroidVideoDataset.__init__c                 C   r,   r-   )r.   rP   r/   r   r   r   r0   _   r1   zDroidVideoDataset.__len__c                 C   s   | j | }td u rtdtjt|dd}t|}| }t||| j| j	}|
|}| d }|dddd}tjjj|| j| jfdd	d
}|dddd}|d d }|dS )Nzdecord not installed   )num_threads     o@r      r   bilinearFrO   modealign_corners       @      ?)rP   decordr@   VideoReaderrL   r.   Zget_avg_fpsr   r   r   Z	get_batchfloatpermuter   nn
functionalinterpolaterO   r:   )r*   rA   r;   vrtotalfpsr   framesr   r   r   rF   b   s    


zDroidVideoDataset.__getitem__N)
rG   rH   rI   rK   rL   r   r]   r+   r0   rF   r   r   r   r   rM   O   s    
rM   c                 C   s   t j| ddS )Nr   r9   )r   cat)batchr   r   r   collate_batchu   s   ri   pc                 C   s0   | j }zdt|fW S  ty   d|f Y S w )z'Sort by stem numerically when possible.r   r   )stemr   
ValueError)rj   sr   r   r   _natural_sort_keyy   s   rn   c                	   @   sB   e Zd ZdZeddfdedededefdd	Zd
d Zdd Z	dS )SelfCollectedDataseta  Episodes under root: root/task/episode/00000.png, 00001.png, ...
    Each subdir of root is a task; each subdir of a task is an episode with ordered PNGs.
    Samples random start-frame consecutive 8-frame sequences; returns first 4 frames for the 4-frame model.
    rN      r   r   rO   sequence_lengthc                 C   s   t | | _|| _|| _|| _g | _t| j D ]/}|	 s!qt| D ]!}|	 s.q'tt
|dtd}t|| jkrH| j||f q'q| jsZtd| j d| j dd S Nz*.png)keyzNo episodes with >=z PNGs under zK. Expected structure: root/task_name/episode_name/00000.png, 00001.png, ...)r   resolver   r   rO   rq   episodesr"   iterdiris_dirr   r#   rn   r.   appendr)   )r*   r   r   rO   rq   task_direpisode_dirpngsr   r   r   r+      s0   	zSelfCollectedDataset.__init__c                 C   r,   r-   r.   ru   r/   r   r   r   r0      r1   zSelfCollectedDataset.__len__c                 C   s  dd l }ddlm} | j| \}}t|}|d|| j }|||| j  }g }	|d | j D ]D}
|t|
}|j	d dkrG|
ddd}n|j	d dkrT|d d }| d }tjjj|d| j| jfddd	d}|	| q0tj|	dd
}|d d }|dS )Nr   
read_imager   rT   r   rS   rU   FrV   rf   rY   rZ   )r   torchvision.ior~   ru   r.   r   rq   r   rL   shaperepeatr]   r   r_   r`   ra   r:   rO   squeezerx   stack)r*   rA   r   r~   rz   Z	png_pathsnr   pathsre   rj   imgr<   r   r   r   rF      s4   

z SelfCollectedDataset.__getitem__N)
rG   rH   rI   rJ   rK   rL   r   r+   r0   rF   r   r   r   r   ro      s
    "ro   c                   @   sh   e Zd ZdZeddddfdedededed	edB d
edB fddZdd Z	de
defddZdd ZdS )KeygripVideoDataseta   Same as SelfCollectedDataset (real N-frame video from task/episode/*.png) plus optional
    trajectory per episode for keygrip/PARA supervision. Returns real video (no repeated frame)
    and trajectory_2d/trajectory_3d when episode has trajectory.json or trajectory_map is provided.
    rN   rp   Nr   r   rO   rq   trajectory_roottrajectory_mapc           
      C   s   t | | _|rt | n| j| _|| _|| _|| _|| _g | _t	| j
 D ]/}| s0q)t	|
 D ]!}| s=q6t	t|dtd}	t|	| jkrW| j||	f q6q)| jsitd| j d| j dd S rr   )r   rt   r   r   r   r   rO   rq   ru   r"   rv   rw   r   r#   rn   r.   rx   r)   )
r*   r   r   rO   rq   r   r   ry   rz   r{   r   r   r   r+      s4   		zKeygripVideoDataset.__init__c                 C   r,   r-   r|   r/   r   r   r   r0      r1   zKeygripVideoDataset.__len__rz   r   c                 C   sP  | j durd}|t|fD ]}|| j v r|} nq|du r9t|dr9z| }|| j v r.|}W n	 ty8   Y nw |dur| j | \}}t|tjrT| | }}ntj	|tj
d}tj	|tj
d}|| j }|jd |kr|jd |kr||| }||| }nEt| j|jd |jd }	|d|	 }|d|	 }|jd | jk rtj|ddd| j|jd  fdd}tj|ddd| j|jd  fdd}|d| j |d| j dfS ddl}
|| j}| j| d }| stj| jd	tj
dtj| jd
tj
ddfS t|}|
|}W d   n	1 sw   Y  tj	|d tj
d}tj	|d tj
d}|| j }|jd |krS|jd |krS||| }||| }nFt| j|jd |jd }	|d|	 }|d|	 }|jd | jk rtj|ddd| j|jd  fdd}tj|ddd| j|jd  fdd}|d| j |d| j dfS )a3  Load trajectory_2d (N,2) and trajectory_3d (N,3) from trajectory_map, trajectory.json, or return zeros.
        Waypoints in pixel coords (same resolution as self.size). Uses waypoints [start:start+num_frames] to align
        with sampled frames. Returns (t2d, t3d, True) or (zeros, zeros, False).
        Nrt   )dtyper   g        )valueTztrajectory.jsonrQ   rT   Ftrajectory_2dtrajectory_3d)r   rL   hasattrrt   r8   
isinstancer   Tensorclonetensorfloat32r   r   minFpadjsonrelative_tor   r   existszerosopenr5   )r*   rz   r   rs   	candidaterest2dt3dendr   r   relZ	traj_pathfdatar   r   r   _load_trajectory   sr   



$$


 $$z$KeygripVideoDataset._load_trajectoryc                 C   s.  dd l }ddlm} | j| \}}t|}|d|| j }|||| j  }g }	|d | j D ]D}
|t|
}|j	d dkrG|
ddd}n|j	d dkrT|d d }| d }tjjj|d| j| jfddd	d}|	| q0tj|	dd
}|d d }|d}| ||\}}}||||dS )Nr   r}   r   rT   r   rS   rU   FrV   rf   rY   rZ   )videor   r   has_trajectory)r   r   r~   ru   r.   r   rq   r   rL   r   r   r]   r   r_   r`   ra   r:   rO   r   rx   r   r   )r*   rA   r   r~   rz   r{   r   r   r   re   rj   r   r<   r   r   r   r   r   r   r   rF   *  s@   

zKeygripVideoDataset.__getitem__)rG   rH   rI   rJ   rK   rL   r   dictr+   r0   r   r   rF   r   r   r   r   r      s.    
"Ar   )rJ   r   pathlibr   numpyr
   r   torch.nn.functionalr_   r`   r   torch.utils.datar   r[   bridgeZ
set_bridger8   rK   r   rL   r   r$   r%   rM   ri   rn   ro   r   r   r   r   r   <module>   s.    &&	<