
    *i                     h   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZ ddlmZ ddlmZmZmZmZmZ ddlZddlZddlmZ ej         G d d                      Zd	ej        d
ej        fdZdej        dej        d
ej        fdZdej        d
ej        fdZ  G d d          Z!ded
eee"ef                  fdZ#dZ$dede"de%deee%e%f                  d
ee&e'e"f                  f
dZ(dede"de%d
ee'         fdZ)de%de%d
e%fdZ*dZ+deee"ef                  d e%d!ed"ee"         d
ee"ej        f         f
d#Z,d$ee"e"f         d%e"d
e"fd&Z-d'eej                 d e%d(e.d
e/fd)Z0 G d* d+          Z1d,e"d-e"d
e%fd.Z2d,e"d-e"d
e%fd/Z3d,e"d-e"d
e"fd0Z4d,e"d-e"d
e"fd1Z5d2ed3e"d4e"d
dfd5Z6d6ed!ed7e"d8ee         d
df
d9Z7d!ed8ee         d
ee"         fd:Z8dAd<e"d
eeeee         f                  fd=Z9d>e&d
ee"ef         fd?Z:	 	 dBd8ee         d!ed3ee"         d4ee"         d
df
d@Z;dS )Cu  Export converted Raiden episodes to WebDataset sharded .tar format.

Each sample in a shard contains:
  {uuid}.{cam}_t{idx}.jpg           — camera images at specified time indices (JPEG quality 95)
  {uuid}.lowdim.npz                 — windowed lowdim arrays  (T × D each key)
  {uuid}.metadata.json              — episode / sample metadata
  {uuid}.language_instructions.json — language annotations

Alongside the shards the following files are written:
  preprocessing_config.yaml — full config snapshot
  manifest.jsonl            — one JSON line per shard: {"shard": ..., "num_sequences": N}
  stats.json                — per-key statistics (mean/std/min/max + percentiles)
  processing_metadata.json
    N)ThreadPoolExecutor)Path)AnyDictListOptionalTuple)tqdmc                      e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<    e
j        d	 
          Zee         ed<   dZeed<   dZeed<   dZeed<   dZeee                  ed<    e
j        e
          Zeeef         ed<   dZeeeef                  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZ eed<   d Z!eed!<   d"Z"eed#<   d$Z#eed%<   d&Z$eed'<   dS )(ShardifyConfigz2Parameters controlling the shardification process.
output_dir   past_lowdim_steps   future_lowdim_steps   stridec                  
    ddgS )Nr    r       -/home/robot-lab/raiden_cmu/raiden/shardify.py<lambda>zShardifyConfig.<lambda><   s
    "a r   )default_factoryimage_indicesmax_padding_left   max_padding_rightcopypadding_strategyNcamera_namescamera_name_mapresize_images_size_   jpeg_qualityF	use_depthfilter_still_samplesg?still_thresholdTfail_on_nanr   max_episodes_to_processd   samples_per_shard   num_workersP  stats_reservoir_size
   stats_stride)%__name__
__module____qualname____doc__r   __annotations__r   intr   r   dataclassesfieldr   r   r   r   r    strr!   r   dictr"   r   r#   r	   r%   r&   boolr'   r(   floatr)   r*   r,   r.   r0   r2   r   r   r   r   r   )   s        <<  s!!!! FCOOO  1{0QQQM49QQQcs"c""" )-L(49%,,,&7k&7&M&M&MOT#s(^MMM 59sCx1888L#It "'$&&&!OU!!!K#%S%%% !s   K !'#&&& L#r   r   vreturnc                    | dd                              t          j                  | dd                              t          j                  }}|t          j                            |          z  }|t          j        ||          |z  z
  }|t          j                            |          z  }t          j        ||          }t          j        |||gd          S )u(  Convert a (6,) rot6d vector to a (3, 3) rotation matrix via Gram-Schmidt.

    Inverse of ``_rot9_to_rot6d``: reconstructs the full rotation matrix from
    its first two rows.

    Args:
        v: (6,) float array — [R[0,:], R[1,:]].

    Returns:
        (3, 3) float64 rotation matrix.
    Nr      r   axis)astypenpfloat64linalgnormdotcrossstack)r?   a1a2b1b2b3s         r   _rot6d_to_matrR   d   s     rrU\\"*%%q1v}}RZ'@'@B	binnR  	 B	bfRnnr!	!B	binnR  	 B	"b		B8RRLq))))r   xyzrot6dc                     t          j        dt           j                  }t          |          |ddddf<   |                     t           j                  |dddf<   |S )uF   Build a 4×4 rigid-body transform from a (3,) position and (6,) rot6d.   dtypeNr   )rF   eyerG   rR   rE   )rS   rT   Ts      r   _build_transformr[   x   sY    
q
###Ae$$Abqb"1"fIzz"*%%Abqb!eHHr   rot9c                     |                      | j        dd         dz             }|dddddf                              | j        dd         dz             S )ul  Convert a row-major flattened 3×3 rotation matrix to the 6D representation.

    Uses the first two rows of R: [R[0,:], R[1,:]] = [R00,R01,R02, R10,R11,R12] → (6,).
    Matches vla_foundry's ``matrix_to_rot_6d``: ``R[:2, :].flatten()``.

    Args:
        rot9: (..., 9) float array — row-major flattened 3×3.

    Returns:
        (..., 6) float array.
    Nr   )r   r   .   )rB   )reshapeshape)r\   mats     r   _rot9_to_rot6drb      sY     ,,tz#2#/
0
0CsBQBz?""4:crc?T#9:::r   c                   ^    e Zd ZdZddededefdZdej        dd	fd
Zde	e
ef         fdZd	S )_StatsAccumulatorzyIncrementally tracks mean/std/min/max and a reservoir for percentiles.

    Handles data of shape (T, D) per sample.
    r/   rZ   Dreservoir_sizec                    || _         || _        d| _        t          j        ||ft          j                  | _        t          j        ||ft          j                  | _        t          j        ||ft          j	        t          j                  | _
        t          j        ||ft          j	         t          j                  | _        t          j        |||ft          j                  | _        d| _        || _        d S )Nr   rW   )rZ   re   nrF   zerosrG   _mean_M2fullinf_min_maxfloat32_res_res_n	_res_size)selfrZ   re   rf   s       r   __init__z_StatsAccumulator.__init__   s    Xq!fBJ777
8QF"*555GQFBF"*===	GQFRVG2:>>>	Hna32:FFF	'r   sampler@   Nc                    |                     t          j                  }| xj        dz  c_        || j        z
  }| xj        || j        z  z  c_        | xj        ||| j        z
  z  z  c_        t          j        | j        |          | _        t          j        | j	        |          | _	        | j
        | j        k     r>|                     t          j                  | j        | j
        <   | xj
        dz  c_
        dS t          t          j                            d| j                            }|| j        k     r)|                     t          j                  | j        |<   dS dS )zAdd one (T, D) sample.r   r   N)rE   rF   rG   rh   rj   rk   minimumrn   maximumro   rr   rs   rp   rq   r8   randomrandint)rt   rv   xdeltajs        r   updatez_StatsAccumulator.update   s   MM"*%%!DJ

edfn$

EQ^,,Jty!,,	Jty!,,	;''%+]]2:%>%>DIdk"KK1KKKKBI%%a0011A4>!!%}}RZ88	! "!r   c                    | j         dk    ri S t          j        | j        t	          | j         dz
  d          z            }| j                            d          }| j        |z
  }| j        | j         |dz  z  z                       d          }t          j        |t	          | j         | j        z  dz
  d          z            }| j	        
                    d          }| j                            d          }| j        d| j                 }|                    d| j                  }	t          j        |	g dd          }
t          j        |g dd          }dt          j        d	t$          ffd
i d |          d |          d |          d |          d | j                  d |          d | j	                  d | j                  d |
d                   d |
d                   d |
d                   d |
d                   d |
d                   d |
d                   d |d                   d |d                   d |d                    |d                    |d                    |d                   | j         t'          | j                  dS ) zDReturn a stats dict compatible with the reference stats.json format.r   r   rC   r^   Nr   )r   r^      r$   b   c   arrr@   c                 ^    | j         dk    r|                                 S fd| D             S )Nr   c                 &    g | ]} |          S r   r   ).0row_to_lists     r   
<listcomp>z@_StatsAccumulator.finalize.<locals>._to_list.<locals>.<listcomp>   s!    111cHHSMM111r   )ndimtolist)r   r   s    r   r   z,_StatsAccumulator.finalize.<locals>._to_list   s6    x1}}zz||#1111S1111r   meanstdminmaxmean_per_timestepstd_per_timestepmin_per_timestepmax_per_timesteppercentile_1percentile_2percentile_5percentile_95r   percentile_98rV   percentile_99r   percentile_1_per_timesteppercentile_2_per_timesteppercentile_5_per_timestep)percentile_95_per_timesteppercentile_98_per_timesteppercentile_99_per_timestepcountpercentile_sample_count)rh   rF   sqrtrk   r   rj   r   sumrZ   rn   r   ro   rq   rr   r_   re   
percentilendarrayr   r8   )rt   
std_per_tsglobal_meanr}   M2_combined
global_std
global_min
global_maxresflatpcts_globalpcts_per_tsr   s               @r   finalizez_StatsAccumulator.finalize   sf   6Q;;IWTXDFQJ(:(::;;
 joo1o-- 
[(x$&5!8"3388a8@@W[3tv/BA+F+FFGG
Y]]]**
Y]]]**
i$+&{{2tv&&mD*?*?*?aHHHmC)>)>)>QGGG	2"* 	2 	2 	2 	2 	2 	2 	2

HH[))
88J''
 88J''
 88J''	

  $*!5!5
  4 4
  3 3
  3 3
 HH[^44
 HH[^44
 HH[^44
 XXk!n55
 XXk!n55
 XXk!n55
 (+a.)A)A
  (+a.)A)A!
" (+a.)A)A#
$ +3(;q>*B*B*2(;q>*B*B*2(;q>*B*BV'*4;'7'7-
 
 
 	
r   )r/   )r3   r4   r5   r6   r8   ru   rF   r   r   r   r;   r   r   r   r   r   rd   rd      s         
( (# (# (s ( ( ( (9RZ 9D 9 9 9 9$4
$sCx. 4
 4
 4
 4
 4
 4
r   rd   ep_dirc                 p   | dz  }t          |                    d                    }|s"t          |                    d                    }|st          d|           g }|D ]Q}t          |d          5 }|                    t          j        |                     ddd           n# 1 swxY w Y   R|S )z>Load all per-frame lowdim pkl files from an episode directory.lowdim??????????.pklz?????????.pklzNo lowdim .pkl files in rbN)sortedglobFileNotFoundErroropenappendpickleload)r   
lowdim_dir	pkl_filesframespfs         r   _load_episode_framesr      s    ("Jz'78899I =:???;;<<	 I G: G GHHHF * *!T]] 	*aMM&+a..)))	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	*Ms   6(B**B.	1B.	r$   camera_name	frame_idxresizec                 `   | dz  |z  |ddz  }|                                 sdS t          j        t          |                    }|dS |(|\  }}t          j        |||ft          j                  }t          j        d|t          j        t          g          \  }}	t          |	          dfS )u~  Return (image_bytes, "jpg") for a frame, optionally resizing.

    The converter saves frames with ``cv2.imwrite`` in BGR order.  JPEG
    encoding via cv2 converts BGR→YCbCr internally, so PIL (used by WebDataset
    dataloaders) decodes the JPEG back as correct RGB — no manual channel flip
    needed.

    Returns:
        (bytes, "jpg"), or None if the file is absent.
    rgb010d.pngN)interpolationz.jpgjpg)
existscv2imreadr;   r   INTER_LANCZOS4imencodeIMWRITE_JPEG_QUALITY_JPEG_QUALITYbytes)
r   r   r   r   pathimg_bgrh_outw_out_bufs
             r   	_load_rgbr   	  s      E>K'Y*A*A*A*AAD;;== tjT##Gtu*WuenCDVWWW\&'C,Dm+TUUFAs::ur   c                     | dz  |z  |ddz  }|                                 sdS t          j        |          d         }t          j        d|          \  }}t          |          S )zReturn 16-bit PNG bytes for a depth frame, or None if absent.

    Loads the uint16 mm depth stored by the converter as a ``.npz`` and
    re-encodes it as a 16-bit greyscale PNG (lossless, widely supported).
    depthr   z.npzNr   )r   rF   r   r   r   r   )r   r   r   r   depth_mmr   r   s          r   _load_depth_pngr   &  sl     Gk)y,C,C,C,CCD;;== twt}}W%H\&(++FAs::r   idxn_framesc                 D    t          dt          | |dz
                      S )Nr   r   )r   r   )r   r   s     r   _clamp_framer   9  s     q#c8a<(()))r   yamr   
anchor_idxconfigoutput_cam_namesc           	         *+, |j         dz   |j        z   }t                     *t          }|j        +*+fdt          |j          |j        dz             D             ,dt          dt          dt          dt          t          j
                 f ,fd}i } |dd	d
          }||ddd	df         |d| d<   t          |ddddf                   |d| d<   |ddddf         |d| d<   |j        d         d
k    rL|ddddf         |d| d<   t          |ddddf                   |d| d<   |dddd
f         |d| d<    |dd	d
          }	|	|	ddd	df         |d| d<   t          |	ddddf                   |d| d<   |	ddddf         |d| d<   |	j        d         d
k    rL|	ddddf         |d| d<   t          |	ddddf                   |d| d<   |	dddd
f         |d| d<    |dd	d          }
|
9|
ddd	df         |d | <   |
j        d         dk    r|
ddddf         |d!| <    |d"d	d          }|9|ddd	df         |d#| <   |j        d         dk    r|ddddf         |d$| <             }*fd%|j        D             }|D ]9}t          |j        |          }|                    d&i                               |          }|Qt          j        |t          j        '          }t          j        |d         t          |          ddf          |d(| <   g }|D ]} |                             d)i                               |          }| t          j        d*t          j        '          }|                    t          j        |t          j        '                     t          j        |          |d+| <   ;|j         }d,D ]}d-| d.| d}d-| d.| d}||vr||         |         }||         |         }t          j                            t5          ||                    }d/D ]C}|dk    r%d0| d.| d}d0| d.| d}d0| d.| d1}d0| d.| d2} n$d-| d.| d}d-| d.| d}d-| d.| d1}d-| d.| d2} ||vrW||         }!||         }"t          j        |!          }#t          j        |"          }$t          |!j        d	                   D ]}%t5          |!|%         |"|%                   }&||&z  }'|'dddf                             t          j                  |#|%<   |'dd3ddf                                                             t          j                  |$|%<   |#||<   |$|| <   Et          j        |t>          '          }(d4|(d|j         <   t          j        |t>          '          })d4|)|j         dz   d<   |(|d5<   |)|d6<   |S )7z2Build windowed (T, D) arrays for one anchor frame.r   c                 <    g | ]}t          |z  z             S r   r   )r   offsetr   rh   ss     r   r   z(_build_window_arrays.<locals>.<listcomp>L  s<        	Z&1*,a00  r   r:   startlengthr@   c           	          g }D ]`}|                              |           }| d S |                    t          j        ||||z            t          j                             at          j        |          S )NrW   )getr   rF   asarrayrp   rL   )r:   r   r   rowsfivalr   
window_idxs         r   _collectz&_build_window_arrays.<locals>._collectQ  s|     	S 	SB*..''C{ttKK
3uuv~'=#>bjQQQRRRRx~~r   actionr      Nr   zrobot__action__poses__left::__xyz   __rot_6d   zrobot__action__grippers__left::_hand   zrobot__action__poses__right::   z robot__action__grippers__right::actual_poseszrobot__actual__poses__left::zrobot__actual__grippers__left::zrobot__actual__poses__right::z robot__actual__grippers__right::joints      z%robot__actual__joint_position__left::z&robot__actual__joint_position__right::action_jointsz&robot__desired__joint_position__left::z'robot__desired__joint_position__right::c                 6    g | ]}t          |z             S r   r   )r   ir   rh   s     r   r   z(_build_window_arrays.<locals>.<listcomp>  s'    WWWQj1na88WWWr   
intrinsicsrW   intrinsics.
extrinsicsrV   extrinsics.)leftrightrobot__actual__poses__z::)r   actualrobot__action__poses____xyz_relative__rot_6d_relativer^   T	past_maskfuture_mask) r   r   len_ROBOTr   ranger;   r8   r   rF   r   rb   r`   r   _reverse_mapr"   r   r   rp   tilerY   r   rG   rL   rH   invr[   
empty_likerE   flattenri   r=   )-r   r   r   r   rZ   Rr   out
action_seqactual_poses_seq
joints_seqact_joints_seqanchor_frameimg_frame_indicescam_namesrc_camKK_arrext_rowsr   extanchor_isideanc_xyz_keyanc_rot_keyanc_xyz	anc_rot6d	T_anc_invsrcxyz_keyrot_keyout_xyzout_rotxyz_seq	rot6d_seqrel_xyz	rel_rot6dr  T_tT_relr  r  rh   r   r   s-   ``                                        @@@r   _build_window_arraysr<  @  so    	 1$v'AAAFAAA     V55v7QTU7UVV  J
 C  "*9M        "$C (Ar**J7A!!!QqS&7I313334:Hqqq!B$w;
 ;
616667 ;EQQQ2X:N6a6667A"$$<Fqqq"R%x<PC88889?M111be8$@ @C;;;;< @J!!!RPRU(?SC;1;;;<  x266#7G1Q37O313334:HQQQ"W%;
 ;
616667 ;K111bQSe8:T6a6667!!$**<LQQQPRSUPUX<VC88889?M BrE*@ @C;;;;< @P2b5@C;1;;;<
 (Ar**J;Eaaa1f;M7A778A"$$@J111aPRd7@SC<<<= Xoq"55N!<J111aPQc6<R8Q889"b((AOPQPQPQSTUWSWPWAXC=!==> *%LWWWWW&BVWWW$ ; ;v5x@@\2..227;;=Jq
333E,.Gdc"344a;- -C(h(() # 	? 	?B*..r2266w??C{fQbj111OOBJs"*===>>>>(*(:(:$($$%% 'H! %% %%?t??q???BtBBqBBBc!!k"8,$X.	IMM"27I"F"FGG	' 	% 	%ChC4CC1CCCF4FF1FFFL4LL1LLLO4OO1OOOC4CC1CCCF4FF1FFFL4LL1LLLO4OO1OOOc!!'lGGImG,,Gi00I7=+,, J J&wqz9Q<@@!C"2A2q5\00<<
$RaR!V}4466==bjII	!"CL$CLL7	%< $'''I,0I((()(1D)))K26K(1,../ C$CJr   r"   out_namec                 N    |                                  D ]\  }}||k    r|c S |S )z=Return the source camera name for a given output camera name.)items)r"   r=  r1  dsts       r   r  r    s=    #))++  S(??JJJ Or   r  	thresholdc                    | dS | |dz   d         }t          |          dk    rdS | |ddf         }| |ddf         }t          t          t          j                            |ddddf         |z
  d                                                    t          t          j                            |ddddf         |z
  d                                                              }||k     S )	zGReturn True if the future EE trajectory barely moves (below threshold).NFr   r   r   r   r   rC   )r  r   r>   rF   rH   rI   )r  r   rA  futureanchor_xyz_lanchor_xyz_rmax_moves          r   	_is_stillrG    s     u
Q(()F
6{{auj!A#o.Lj"R%/0LbinnVAAAqsF^l:nCCGGIIJJbinnVAAAr"uH-<1nEEIIKKLL H ir   c                   j    e Zd ZdZdedefdZdeee	f         ddfdZ
dd	Zdd
Zdee         fdZdS )_ShardWriterz*Writes samples into sequential tar shards.	shard_dirr,   c                 L    || _         || _        d| _        g | _        g | _        d S )Nr   )
_shard_dir_sps
_shard_idx_buf_shard_counts)rt   rJ  r,   s      r   ru   z_ShardWriter.__init__  s,    #%	,.	(*r   filesr@   Nc                     | j                             |           t          | j                   | j        k    r|                                  d S d S N)rO  r   r  rM  _flush)rt   rQ  s     r   addz_ShardWriter.add  sF    	ty>>TY&&KKMMMMM '&r   c                 @    | j         r|                                  d S d S rS  )rO  rT  rt   s    r   closez_ShardWriter.close  s'    9 	KKMMMMM	 	r   c           	         d| j         dd}| j        |z  }t          j        |d          5 }| j        D ]m}|                                D ]V\  }}t          j        |          }t          |          |_        |	                    |t          j        |                     Wn	 d d d            n# 1 swxY w Y   | j                            t          | j                             | xj         dz  c_         g | _        d S )Nshard_06dz.tarw)namer   )rN  rL  tarfiler   rO  r?  TarInfor  sizeaddfileioBytesIOrP  r   )rt   r]  r   tfrv   fnamedatainfos           r   rT  z_ShardWriter._flush  s9   11111%\$$$ 	7) 7 7#)<<>> 7 7KE4"?666D #D		DIJJtRZ%5%5666677	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	!!#di..1111			s   A6B//B36B3c                 >    d t          | j                  D             S )Nc                 J    g | ] \  }}t          j        d |d|d          !S )rZ  r[  )shardnum_sequences)jsondumps)r   r  rh   s      r   r   z/_ShardWriter.manifest_lines.<locals>.<listcomp>  sJ     
 
 
1 J!1!!1!1!1AFFGG
 
 
r   )	enumeraterP  rW  s    r   manifest_linesz_ShardWriter.manifest_lines  s.    
 
!$"455
 
 
 	
r   )r@   N)r3   r4   r5   r6   r   r8   ru   r   r;   r   rU  rX  rT  r   ro  r   r   r   rI  rI    s        44+$ +3 + + + +c5j) d    
      
S	 
 
 
 
 
 
r   rI  bucketprefixc                     |                     d          dz   }|                     ||d          }|                    dd          S )zEReturn the number of objects under s3://bucket/prefix/, or 0 if none./r   )BucketPrefixMaxKeysKeyCountr   )rstriplist_objects_v2r   )	s3_clientrp  rq  prefix_with_slashresps        r   _s3_prefix_existsr}  %  sI    c**S0$$F;LVW$XXD88J"""r   c                 2   |                     d          dz   }d}|                     d          }|                    ||          D ]O}d |                    dg           D             }|r+|                     |d|i           |t          |          z  }P|S )	zDDelete all objects under s3://bucket/prefix/. Returns count deleted.rs  r   ry  rt  ru  c                 "    g | ]}d |d          iS )Keyr   )r   os     r   r   z%_s3_delete_prefix.<locals>.<listcomp>2  s!    GGGE1U8$GGGr   ContentsObjects)rt  Delete)rx  get_paginatorpaginater   delete_objectsr  )rz  rp  rq  r{  deleted	paginatorpageobjectss           r   _s3_delete_prefixr  ,  s    c**S0G''(9::I""&9J"KK $ $GGdhhz2.F.FGGG 	$$$FIw;O$PPPs7||#GNr   c           	      4   ddl }|j                             |j        j                                      d          }|                    d           d| }|                    d          dz   }|dz   }|                     d          }d}	|                    ||          D ]]}
|
                    dg           D ]D}|d	         }||t          |          d         z   }| 
                    |||d
|           |	dz  }	E^t          d|	 d| d| d           |S )zZCopy all objects under prefix/ to prefix_backup_<timestamp>/ and return the backup prefix.r   Nz%Y%m%d_%H%M%Srs  _backup_ry  r  r  r  )rt  r  )rt  
CopySourcer  r   z  Backed up z object(s) to s3://)datetimenowtimezoneutcstrftimerx  r  r  r   r  copy_objectprint)rz  rp  rq  r  tsbackup_prefixr{  backup_with_slashr  copiedr  objsrc_keydst_keys                 r   _s3_backup_prefixr  9  sh   OOO				x04	5	5	>	>	O	OB}}S))77277Mc**S0%+''(9::IF""&9J"KK 	 	88J++ 	 	C%jG''#6G2H2H2J2J*KKG!!&,W== "   
 aKFF	 

M
M
MF
M
M]
M
M
MNNNr   c                    t          d           t          d|  d| d           t                       t          d           t          d           t          d           t                       	 t          d                                                                          }|d	v rd
ddd	|         S t          d           S)zPrompt the user what to do when the S3 destination already has data.

    Returns one of: ``"delete"``, ``"backup"``, ``"cancel"``.
    z&
S3 destination already contains data:z  s3://rs  z*  [d] Delete existing shards and overwritez,  [b] Back up existing shards then overwritez  [c] CancelTzChoice [d/b/c]: )dbcdeletebackupcancelz  Please enter d, b, or c.)r  inputstriplower)rp  rq  choices      r   _prompt_s3_overwriter  Q  s    
 

3444	
&F
&
&V
&
&
&'''	GGG	
6777	
8999	.	GGG,)**002288::_$$!x@@HH*+++	,r   	local_dir	s3_bucket	s3_prefixc           	         ddl }|                    d          }t          |                     d                    }d |D             }t	          dt          |           d| d| d           |D ]R}| d|                    |            }|                    t          |          ||           t	          d	|            St	          d
           dS )z;Upload all files in local_dir to s3://s3_bucket/s3_prefix/.r   Ns3*c                 :    g | ]}|                                 |S r   )is_file)r   r   s     r   r   z upload_to_s3.<locals>.<listcomp>j  s%    ---1-Q---r   z
Uploading z files to s3://rs    zUpload complete.)	boto3clientr   rglobr  r  relative_toupload_filer;   )r  r  r  r  r  rQ  r   keys           r   upload_to_s3r  d  s    LLL	d		B9??3''((E-----E	
Js5zz
J
J)
J
Ji
J
J
JKKK  77Q]]95577
s1vvy#...j3jj	
r   rJ  output_dir_strepisode_dirsc                    ddl }t          ||          }i d|ddd|j        d|j        d|j        d	t          |j                  d
dd|j        d|j        d|j	        d|j
        d|d|j        d|j        d|j        rt          |j                  ndd|j        d|j        |j        dd}t#          | dz  d          5 }|                    ||dd           ddd           dS # 1 swxY w Y   dS )z5Write preprocessing_config.yaml alongside the shards.r   Nr!   compute_statisticsTr)   r'   r   r   image_formatr   r*   r   r   r.   r   r    r   r#   r,   r(   )r   validation_episodes_pathzpreprocessing_config.yamlr\  F)default_flow_style	sort_keys)yaml_resolve_output_cam_namesr)   r'   r   listr   r*   r   r   r.   r    r   r#   r,   r(   r   r   dump)rJ  r   r  r  r  r   cfgr   s           r   _write_preprocessing_configr  x  s    KKK0FF(d 	v) 	 ;	
 	v9 	f233 	 	"6#A 	F3 	V5 	v) 	n 	F3 	V5 	$d6#<===#$ 	V5%& 	61'( -$(+  C. 
i55s	;	; Dq		#qUd	CCCD D D D D D D D D D D D D D D D D Ds   C22C69C6c                      j         t           j                   S |D ]}t          |dz                      d                    }|s*t	          |d         d          5 }t          j        |          }ddd           n# 1 swxY w Y   t          |                    di                                                     } fd|D             c S g S )z/Return the ordered list of output camera names.Nr   r   r   r   r	  c                 F    g | ]}j                             ||          S r   )r"   r   )r   r  r   s     r   r   z-_resolve_output_cam_names.<locals>.<listcomp>  s,    CCCQ&**1a00CCCr   )	r!   r  r   r   r   r   r   r   keys)r   r  r   r   r   framesrc_camss   `      r   r  r    s    &F'((( D DFX-334DEEFF	 	)A,%% 	#KNNE	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	#		,3388::;;CCCC(CCCCCCIs   B  B	B	rf  data_dirc                     ddl }ddlm} t          |           dz  }t	          d |                                D             d d          }|s"t          d	|             |j        d
           d}d |D             |gt                    z   } ||ddd          }||v r|nfd|D             }g }	|D ]D}
t	          d |
                                D                       }|		                    |
|f           E|	S )zUse fzf to select one or more converted tasks.

    Returns a list of ``(task_dir, episode_dirs)`` tuples for each selected task.
    r   N)
fzf_select	processedc              3      K   | ]E}|                                 t          d  |                                D                       A|V  FdS )c              3   l   K   | ]/}|                                 |d z                                  V  0dS metadata.jsonNis_dirr   r   eps     r   	<genexpr>z2select_processed_task.<locals>.<genexpr>.<genexpr>  sT        46o%--//     r   N)r  anyiterdirr   r  s     r   r  z(select_processed_task.<locals>.<genexpr>  s|       	
 	
xxzz	
   :;))++    		
	
 	
 	
 	
 	
 	
r   c                 4    |                                  j        S rS  )statst_mtime)r  s    r   r   z'select_processed_task.<locals>.<lambda>  s    affhh' r   T)r  reversezNo converted tasks found in r   z*** ALL TASKS ***c           	      |    i | ]9}|j          d t          d |                                D                        d|:S )  (c              3   p   K   | ]1}|                                 |d z                                  -dV  2dS )r  r   Nr  r  s     r   r  z3select_processed_task.<locals>.<dictcomp>.<genexpr>  sE      eeRYY[[eb?FZEbEbEdEde!eeeeeer   z episode(s)))r]  r   r  r  s     r   
<dictcomp>z)select_processed_task.<locals>.<dictcomp>  s]        6ssceeaiikkeeeeesssuv  r   zShardify task(s)> z[Tab: toggle select  |  Enter: confirm  |  Select '*** ALL TASKS ***' to shardify everything)promptmultiheaderc                      g | ]
}|         S r   r   )r   r   labelss     r   r   z)select_processed_task.<locals>.<listcomp>  s    ;X;X;X!F1I;X;X;Xr   c              3   p   K   | ]1}|                                 |d z                                  -|V  2dS r  r  r  s     r   r  z(select_processed_task.<locals>.<genexpr>  s_       
 
yy{{
 !#_ 4<<>>

 
 
 
 
 
r   )
sysraiden.utilsr  r   r   r  r  exitr  r   )r  r  r  base	task_dirs
_ALL_LABELchoicesselectedchosen_dirsresulttask_dirr  r  s               @r   select_processed_taskr    s   
 JJJ''''''>>K'D	
 	
\\^^	
 	
 	
 ('  I  3T33444$J   F lT&\\)Gz#l	  H  *X55));X;X;X;Xx;X;X;XKF 0 0 
 
&&((
 
 
 
 

 	x.////Mr   argsc                   "#$ | \  }$}}|d         "|d         #|d         }|d         }|d         }|d         }|d         }|d         }	|j         }
|j        |
z  }|j        |
z  }t          d	t          d	|$z
             |
z             }t          d	t          d	|#d
z
  $z
  z
             |
z             }||j        k    s||j        k    rddiS |j        r]t          j        "#$fdt          |j         |j        d
z             D                       }t          ||j        |j                  rddiS t          "$||          }|j        r|                                D ]m\  }}t          |t          j                  rN|j        j        dk    r>t          j        |                                          rt+          d| d| d$           nn/t)          d |                                D                       rddiS i }t/          t1          j                              }|j        D ]}t7          $|z   #          }d| }t9          ||          D ][\  }}t;          |	|||j                  }||\  }}||| d| d| d| <   |j        r!tA          |	||          }|||| d| d| d<   \tC          j"                    }t          j#        |fi | |$                                || d<   #$fd|j        D             }|| d| d$d$|j        |t          d	$|j        z
            tK          #d
z
  $|j        z             tM          |          tM          |          |#|d	k    p|d	k    |d} tO          j(        |           )                                || d<   d |p|gi}!tO          j(        |!          )                                || d!<   d|||d"S )#u  Build one sample for the shard writer (runs in a worker thread).

    Args:
        args: (ctx, t, config, output_cam_names) — packed as a tuple so
            ThreadPoolExecutor.map can call this as a plain function.

    Returns:
        A dict with key ``"filtered"`` set to ``"padding"``, ``"still"``, or
        ``"nan"`` when the sample is dropped, or ``None`` when it should be
        written.  Kept samples also carry ``"sample_files"``, ``"lowdim"``,
        ``"episode_id"``, and ``"lang"`` keys.
    r   r   src_cam_names
episode_idlanguage_tasklanguage_promptcontrolr   r   r   filteredpaddingc           
          g | ]_}t          j        t          |z                                          d t          j        d                    t           j                  `S )r   r   rW   )rF   r   r   r   ri   rp   )r   r  r   r   ts     r   r   z!_build_sample.<locals>.<listcomp>  sp       
 	 
<Ax889==hQSUU*    r   stillr   zNaN in key 'z' at episode z frame c              3      K   | ]T}t          |t          j                  o5|j        j        d k    o%t          j        |                                          V  UdS )r   N)
isinstancerF   r   rX   kindisnanr  )r   r   s     r   r  z _build_sample.<locals>.<genexpr>+  sq       
 
  sBJ'' $	#%$!!##
 
 
 
 
 
r   nanr  N.r   z
.depth.pngz.lowdim.npzc                 6    g | ]}t          |z             S r   r   )r   r  r   r  s     r   r   z!_build_sample.<locals>.<listcomp>J  s'    JJJl1q5(++JJJr   _t04d)r  	sample_idanchor_timestepanchor_relative_idximage_timestepslowdim_start_timesteplowdim_end_timesteppast_paddingfuture_paddingr!   original_episode_length	is_paddedr  z.metadata.jsonoriginalz.language_instructions.json)r   sample_filesr   r  )*r   r   r   r   r   r   r'   rF   rL   r  rG  r(   r<  r)   r?  r  r   rX   r  r  r  
ValueErrorvaluesr;   uuiduuid4r   r   zipr   r#   r&   r   rb  rc  savez_compressedgetvaluer   r8   rl  rm  encode)%r  ctxr   r   r  r  r  r  r  r   r   left_frames_neededright_frames_neededleft_pad	right_padwindow_actionsr   r  r   r  sample_uuidimg_idx	abs_framesuffixr%  out_camr  r   img_extr   r   img_tssample_metalangr   r   r  s%                                     @@@r   _build_sampler1    s    (,$CF$]F:H(M\"J(M+,O)nG]FA  1A5 4q81Q 2Q 67771<=>>HA#a!41q8H!IJJJaOPQQI&)))Y9Q-Q-QI&& " )     
 --v/IA/M   

 

 ^V%=v?UVV 	)(( "&!V5EFFF  ' 	 	HC#rz** sy~/D/D8C==$$&& $OsOOOOAOO  	  
 
 }}	
 
 
 
 
 	' && &(Ldjll##K' X X Wh77	W #M3C D D 	X 	XGWvw	6;TUUF!%WNQJJgJJJJJJK X'CC$RWLK!N!N'!N!NF!N!N!NO	X *,,C&&v&&&03LK,,,- KJJJJV5IJJJF #;;j;;A;;;%7!!$QF,D(D!E!E"8a<V5O1OPPHi..(#+\2Y] K 48:k3J3J3Q3Q3S3SLK///09M:;D@D
A Afhh K<<<=
 $ 	  r   c                   @ t          j                     }j        dz  }|                    dd           |r|rddl}|                    d          }t          |||          rt          ||          }|dk    rt          d           dS |d	k    rAt          d
           t          |||           t          d           t          |||           n9|dk    r3t          d           t          |||          }	t          d|	 d           t          |           }
t          j        |
           j        dk    r|
dj                 }
t          |
          @|r
|rd| d| nt          j                  }t!          |||
           j        dz   j        z   }t'          |j                  }i }d}d}d}d}d}t          dt+          |
           d           g }i }d}d}t-          |
d          D ]/\  }}|dz                                  s1t          d| dt+          |
           d|j         d           |dz  }N	 t3          |          }nI# t4          $ r<}t          d| dt+          |
           d|j         d|            |dz  }Y d}~d}~ww xY w|d         }t7          |dz            5 }t9          j        |          }ddd           n# 1 swxY w Y   |                    d          pg }|                    d          pg } |                    d          }!|                    ||t+          |          fd@D             t          |                    dd                    t          |                    dd                    |j        |                    d d!          || |!d"           |s| s|!r|| d#}"|!r|!|"d<   |"||j        <   |t+          |          z  }t          d| dt+          |
           d$|j         d%t+          |           d&	           1|rd'| d(nd}#t          d)t+          |           d*|# d'| d+           d, |D             }$t          j        |$           t          d-t+          |$           d.t+          |           d/j          d0           i }%@fd1|$D             }&tC          j"        2          5 }'tG          |'$                    tJ          |&          t+          |&          d3d4          }(|(D ]})|)d5         d6k    r|dz  }|)d5         d7k    r|dz  }'|)d5         d8k    r|dz  }9|&                    |)d9                    |dz  }|)d:         }*|%                    |*d          dz   |%|*<   |dz  }|('                    ||z   |z   |j(        ;           |j)        z  dk    r|)d<         }+|+*                                D ]\  },}-tW          |-tX          j-                  s |-j.        t^          k    s|-j0        d=k     r<|,1                    d>          s|,1                    d?          rg|-2                    |d@          3                    tX          j4                  }.|.j5        d         }/|,|vrtm          ||/j7                  ||,<   ||,         8                    |.           	 ddd           n# 1 swxY w Y   t          dA           ts          |%*                                          D ]\  }0}1t          dB|0 d|1            |:                                 |dCz  ;                    dD<                    |=                                          dDz              |rd}2t}          dE |?                                D                       }3|dFz  }4dG |D             }5|*                                D ]t\  }0}"|"d         s|"                    d          s#|5                    |0          }|;|dFz  }6|6@                                sUt          jB        |6|4|0z  dH           |2dz  }2ut7          |dIz  dJ          5 }7t9          jC        ||7d=K           ddd           n# 1 swxY w Y   t          dL|3 dM|2 dN           t          dO           dP |*                                D             }8|8*                                D ]\  },}9|9                    dQd          }1|9                    dRg           }:|9                    dSg           };tW          |:t                    r|:|;k    ndT}<|<rdUnd}=t          dB|, dV|1 |=            t7          |dWz  dJ          5 }7t9          jC        |8|7           ddd           n# 1 swxY w Y   t          j                     |z
  }>dXt          jD        dY          dZ |
D             t+          |
          d[t          jF                  t          jH        t          jI                    t          jK                    d\||||t          |>d          d]d^}?t          j                  |?d_         d`<   t7          |daz  dJ          5 }7t9          jC        |?|7d=K           ddd           n# 1 swxY w Y   t          db| dc|j(         dd||z   |z    de| df| dg| dh|>didj           t          dk|            |r|rt          |||           dS dS dS )la  Convert a list of converted episode directories to sharded WebDataset format.

    Args:
        episode_dirs: Paths to episode directories, each containing ``lowdim/``,
            ``rgb/``, and ``metadata.json``.
        config: Shardification parameters.
        s3_bucket: If set, upload the shards directory to this S3 bucket.
        s3_prefix: S3 key prefix for upload (e.g. ``yam_datasets/task_name``).
    shardsT)parentsexist_okr   Nr  r  z
Cancelled.r  zBacking up existing shards...zDeleting existing shards...r  z
  Deleted z object(s).zs3://rs  r   zLoading z episode(s)...r  z  [z] SKIP z: no metadata.jsonz: event_markersaudio_segments
audio_fullc                 :    g | ]}t          j        |          S r   )r  r"   )r   r,  r   s     r   r   z run_shardify.<locals>.<listcomp>  s6     " " " !!7AA" " "r   r   r  r  leader)r   r   r   r  r  r  r  r  r6  r7  r8  )r6  r7  z] r  z frames)z, z skippedzLoaded z	 episodesz frames total
c                 F    g | ]}t          |d                    D ]}||fS )r   )r  )r   r"  r  s      r   r   z run_shardify.<locals>.<listcomp>  sI       s:1G1G ,-a   r   zProcessing z anchors from z% episodes (globally shuffled, stride=z)...c                 "    g | ]\  }}||fS r   r   )r   r"  r  r   r   s      r   r   z run_shardify.<locals>.<listcomp>  s'    KKKQ#q&"23KKKr   )max_workersanchor)totalunitdynamic_ncolsr   r  r  r	  r  r  )r   rj  r   r^   r  r
  r   z
Samples per episode:r  zmanifest.jsonl
c              3   *   K   | ]}|d          
dV  dS )r6  r   Nr   )r   r?   s     r   r  zrun_shardify.<locals>.<genexpr>0  s,      XXaQEWXXXXXXXr   audioc                 ,    i | ]}|d          |d         S )r  r   r   )r   r"  s     r   r  z run_shardify.<locals>.<dictcomp>4  s#    PPPSL)3x=PPPr   )dirs_exist_okzsubtask_index.jsonr\  )indentu     ✓ subtask_index.json (z  episode(s) with event_markers, z with audio/)z
Computing statistics...c                 >    i | ]\  }}||                                 S r   )r   )r   r  accs      r   r  z run_shardify.<locals>.<dictcomp>I  s3        (SS\\^^  r   r   r   r   Fz  [!] min==maxz: n=z
stats.jsonz1.0z%Y-%m-%dT%H:%M:%Sc                 ,    g | ]}t          |          S r   )r;   r  s     r   r   z run_shardify.<locals>.<listcomp>\  s    111SVV111r   )r  num_episodes)python_versionplatformhostname)total_samples_createdpadding_samples_filteredstill_samples_filterednan_samples_filteredelapsed_seconds)metadata_version
created_atsource_datar   environment
processingr   r   zprocessing_metadata.jsonz
Done: u    samples → z shards  filtered: z (pad=z still=z nan=z)  elapsed: z.0fr   zOutput: )Ntimer   mkdirr  r  r}  r  r  r  r  r  rz   shuffler*   r  r;   r  r   r   rI  r,   r  rn  r   r]  r   r   r   rl  r   r   r   r   r   r.   r
   mapr1  rU  set_postfixrN  r2   r?  r  rF   r   rX   r=   r   
startswithr_   rE   rp   r`   rd   r0   r   r   rX  
write_textjoinro  r   r  r  shutilcopytreer  r  r9   asdictr  versionrN  socketgethostnameroundr  )Ar  r   r  r  t_startrJ  _boto3_s3r   rh   epsr  rZ   writerstats_accumulatorstotal_samplesfiltered_paddingfiltered_stillfiltered_nanstats_counterep_contextssubtask_indexskippedtotal_framesr  r   r   er"  _mf_ep_metamarkerssegmentsr8  entryskip_msgall_workep_sample_counts	work_argsexecutorpbarr  r  r   r  r   
sample_arrre   ep_idr   n_audio_episodesn_marker_episodes
audio_rootep_dir_by_id	src_audior   statsr   mnmxsameflagelapsed	proc_metar   sA    `                                                              @r   run_shardifyr  i  s    ikkG!H,IOOD4O000  3Y 3mmD!!S)Y77 	3))Y??F!!l###8##5666!#y)<<<3444!#y)<<<<8##3444%c9i@@11111222 |

C
N3%))2F223 1==
 	$"	$'	''I'''"## 
  	6>3GGG 1$v'AAA)V%=>>F79MNLM 

-SXX
-
-
-... K &(MGLsA&& ,K ,K	6(0022 	LLLCHHLLV[LLLMMMqLG	)&11FF  	 	 	???CHH??V[??A??@@@qLGHHHH	 ay&?*++ 	&sy~~H	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&,,//52<< 0117R\\,//
  KK" " " "#3" " " "%\%5%5or%J%J!K!K#&|'7'78I2'N'N#O#O$k#<<	8<<!("*( 	
 	
 	
$  	/h 	/* 	/!("* E  1&0l#).M&+&F#IAIICIIFKIICKKIIIJJJJ)08%G%%%%bH	V#k""VVXVVVVV  
 '  H N8		:c(mm 	: 	:3{3C3C 	: 	:&,m	: 	: 	:  
 (*KKKKK(KKKI	(:	;	;	; +CxLL	22i..	
 
 
  $	C $	CFj!Y.. A%  
#w..!#
#u,,!

6.1222"#L1
/?/C/CJPQ/R/RUV/V ,"  ->M + !    !6#66!;;#H-F$*LLNN C CS)#rz:: %$9,,1$>>-88 %CNN)= = % %%([[B%7%7%>%>rz%J%J
&,Q/&8886G !1f&A7 7.s3 +3/66zBBBBI$	C+C +C +C +C +C +C +C +C +C +C +C +C +C +C +CZ 

"###/557788 % %u#5##E##$$$$
LLNNN !!--dii8M8M8O8O.P.PSW.WXXX  
XX=+?+?+A+AXXXXX(
 QPKPPP)//11 
	" 
	"LE5*+ uyy/F/F !%%e,,F~(I##%% OIzE'9NNNN!)22C88 	2AImQq1111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2>): > >.> > >	
 	
 	
 

%&&& ,>,D,D,F,F  E ++-- + +Qgq!!UU5"UU5"%b$//:rRxxU#'/R)3))E)4))****	i,&	,	, 	%               ikkG#G!m$78811S111HH
 
 $V,,!k )++*,,
 
 &3(8&4$0$Wa00
 
 I, ),F,=(>(>Ih%	i44c	:	: *a	)Qq))))* * * * * * * * * * * * * * * 
	#= 	# 	#v/@ 	# 	#%6E	# 	# 	# 	#)7	# 	#>J	# 	# "	# 	# 	#   

 Y
 
 !!!  6Y 6Y	9555556 6 6 6ss   I
J2JJ8KK	 K	G,[[[a99a= a=0fff.jjj)rf  )NN)<r6   r9   rb  rl  r   rN  rz   rb  rf  r  r^  rZ  r  concurrent.futuresr   pathlibr   typingr   r   r   r   r	   r   numpyrF   r
   	dataclassr   r   rR   r[   rb   rd   r;   r   r   r8   tupler   r   r   r   r  r<  r  r>   r=   rG  rI  r}  r  r  r  r  r  r  r  r1  r  r   r   r   <module>r     s        				       



    1 1 1 1 1 1       3 3 3 3 3 3 3 3 3 3 3 3 3 3 



           2 2 2 2 2 2 2 2t*RZ *BJ * * * *("* RZ BJ    ; ;
 ; ; ; ;*Z
 Z
 Z
 Z
 Z
 Z
 Z
 Z
D $tCH~*>       U38_%	
 eE3J    :D s s xPU    &*c *S *S * * * * 
Wc3h WW W 3i	W
 
#rz/W W W Wt$sCx. C C     $ 25 BG 	       .$
 $
 $
 $
 $
 $
 $
 $
X# #c #c # # # #
 
c 
c 
 
 
 
 c c    0, ,c ,c , , , ,&D S S T    ("D"D"D "D t*	"D
 
"D "D "D "DJ*.t*	#Y   .1 1C 1T%d4j@P:Q5R 1 1 1 1ry
y	#s(^y y y y~  $#	S6 S6t*S6S6 }S6 }	S6
 
S6 S6 S6 S6 S6 S6r   