
    |2g                         d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlZ	 e       d        Z
 e       d        Zd Zd Zd Z G d	 d
e      Zy)    N)Sequence)	lru_cachec                      t         j                  j                  t         j                  j                  t         j                  j	                  t
                    d      S )Nzbpe_simple_vocab_16e6.txt.gz)ospathjoindirnameabspath__file__     O/home/cameronsmith/repos/FeatUp/featup/featurizers/maskclip/simple_tokenizer.pydefault_bper      s2    77<<(ABDbccr   c            	         t        t        t        d      t        d      dz               t        t        t        d      t        d      dz               z   t        t        t        d      t        d      dz               z   } | dd }d	}t        d
      D ]1  }|| vs| j                  |       |j                  d
|z          |dz  }3 |D cg c]  }t	        |       }}t        t        | |            S c c}w )a9  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      )listrangeordappendchrdictzip)bscsnbs       r   bytes_to_unicoder$      s     
eCHc#hqj)	*4c$iT10M+N	NtTYZ]^bZcehimenopepTqOr	rB	AB	A4[ B;IIaLIId1fFA	
 	Q#a&	B	B 
s   C4c                 b    t               }| d   }| dd D ]  }|j                  ||f       |} |S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charchars       r   	get_pairsr,   '   sF     EEQIQR 		9d#$	 Lr   c                     t        | t              sdj                  |       } t        j                  |       } t        j                  t        j                  |             } | j                         S )Nz, )
isinstancestrr   ftfyfix_texthtmlunescapestriptexts    r   basic_cleanr7   3   sL     dC yy==D==t,-D::<r   c                 T    t        j                  dd|       } | j                         } | S )Nz\s+ )resubr4   r5   s    r   whitespace_cleanr<   >   s$    66&#t$D::<DKr   c                   8    e Zd Z e       fdefdZd Zd Zd Zy)SimpleTokenizerbpe_pathc           
         t               | _        | j                  j                         D ci c]  \  }}||
 c}}| _        t	        j
                  |      j                         j                  d      j                  d      }|dd }|D cg c]  }t        |j                                }}t        t               j                               }||D cg c]  }|dz   	 c}z   }|D ]"  }|j                  dj                  |             $ |j                  ddg       t        t!        |t#        t%        |                        | _        | j&                  j                         D ci c]  \  }}||
 c}}| _        t        t!        |t#        t%        |                        | _        ddd	| _        t/        j0                  d
t.        j2                        | _        y c c}}w c c}w c c}w c c}}w )Nutf-8
r   i  </w> <|startoftext|><|endoftext|>)rE   rF   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r$   byte_encoderitemsbyte_decodergzipopenreaddecodesplittupler   valuesr   r   extendr   r   r   lenencoderdecoder	bpe_rankscacher:   compile
IGNORECASEpat)selfr?   kvmergesmergevocabs          r   __init__zSimpleTokenizer.__init__E   s   ,..2.?.?.E.E.GHdaQTH8$))+227;AA$G-(4:;5%&;;%'..0151a611 	)ELL(	)'9:CuSZ'89:)-););)=>A1>c&%F*<=>):_]
::  A  CE  CP  CP  Q I <1
 ?s   G G%G*)G/c                     | j                   v r j                   |   S t        |d d       |d   dz   fz   }t        |      }|s|dz   S 	 t        | fd      }| j                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }| j                   |<   |S #  |j                  ||d         Y bxY w)	NrC   c                 N    j                   j                  | t        d            S )Ninf)rU   getfloat)pairrZ   s    r   <lambda>z%SimpleTokenizer.bpe.<locals>.<lambda>`   s    4>>3E3EdERWL3Y r   )keyr   r      r9   )
rV   rO   r,   minrU   rR   indexrQ   r   r   )
rZ   tokenr(   r)   bigramfirstsecondnew_wordijs
   `         r   bpezSimpleTokenizer.bpeV   s   DJJ::e$$U3BZ U2Y%7#99$<&YZFT^^+"ME6HAc$i-

5!,AOOD1I.A
 7e#CIaKD1I<OOOE&L1FAOODG,FA c$i- XHD4yA~!$7 8 xx~ 

5%OODH-s   (E E1c                 \    g }t        t        |            j                         }t        j                   j
                  |      D ]c  }dj                   fd|j                  d      D              }|j                   fd j                  |      j                  d      D               e |S )NrD   c              3   <   K   | ]  }j                   |     y wN)rG   ).0r#   rZ   s     r   	<genexpr>z)SimpleTokenizer.encode.<locals>.<genexpr>   s     PQD--a0P   rA   c              3   <   K   | ]  }j                   |     y wrw   )rS   )rx   	bpe_tokenrZ   s     r   ry   z)SimpleTokenizer.encode.<locals>.<genexpr>   s     b)dll95brz   r9   )r<   r7   lowerr:   findallrY   r   encoderQ   rt   rN   )rZ   r6   
bpe_tokensrm   s   `   r   r   zSimpleTokenizer.encode   s    
D 1288:ZZ$/ 	cEGGP%,,w:OPPEbtxxPUG\G\]`Gabb	c r   c                     dj                  |D cg c]  }| j                  |    c}      }t        |D cg c]  }| j                  |    c}      j	                  dd      j                  dd      }|S c c}w c c}w )NrD   rA   replace)errorsrC   r9   )r   rT   	bytearrayrI   rM   r   )rZ   tokensrm   r6   cs        r   rM   zSimpleTokenizer.decode   sr    ww@U+@A=1$++A.=>EEgV_E`hhioqtu A=s
   A6A;N)	__name__
__module____qualname__r   r/   r`   rt   r   rM   r   r   r   r>   r>   D   s%    '2} Q Q"'Rr   r>   )rJ   r2   r   collections.abcr   	functoolsr   r0   regexr:   r   r$   r,   r7   r<   objectr>   r   r   r   <module>r      se      	 $    d d  ,	Ff Fr   