
    mirB                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ  G d d	ej                         Zd
 Zd Z G d dej                         Z G d dej                         Z G d dej                         Zy)    N)nn)
functional   )capture_init)center_trimunfold)
LayerScalec                   *     e Zd ZdZd fd	Zd Z xZS )BLSTMz
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be splitting in overlapping
    chunks and the LSTM applied separately on each chunk.
    c                     t         |           |
|dz  dk(  sJ || _        t        j                  d|||      | _        t        j                  d|z  |      | _        || _        y )N   r   T)bidirectional
num_layershidden_size
input_size   )	super__init__	max_stepsr   LSTMlstmLinearlinearskip)selfdimlayersr   r   	__class__s        G/var/www/stems/demucs_env/lib/python3.12/site-packages/demucs/demucs.pyr   zBLSTM.__init__   s`     IMQ$666"GG$6s_bc	iiC-	    c           	      6   |j                   \  }}}|}d}| j                  c|| j                  kD  rT| j                  }|dz  }t        |||      }	|	j                   d   }
d}|	j                  dddd      j	                  d||      }|j                  ddd      }| j                  |      d   }| j                  |      }|j                  ddd      }|rg }|j	                  |d|      }	dz  }t        
      D ]m  }|dk(  r |j                  |	d d |d d d | f          (||
dz
  k(  r|j                  |	d d |d d |d f          O|j                  |	d d |d d || f          o t        j                  |d      }|dd |f   }|}| j                  r||z   }|S )	NFr   Tr   r      .)shaper   r   permutereshaper   r   rangeappendtorchcatr   )r   xBCTyframedwidthstrideframesnframesoutlimitks                 r   forwardzBLSTM.forward"   s   ''1a>>%!dnn*<NNEaZFAuf-Fll1oGFq!Q*222q%@AIIaAIIaLOKKNIIaACYYq"a/FaKE7^ >6JJvaAww&678'A+%JJvaAuvo67JJvaAueV|&;<=> ))C$Cc2A2g,CA99AAr    )r   NF)__name__
__module____qualname____doc__r   r8   __classcell__r   s   @r   r   r      s    
!r    r   c                     | j                   j                         j                         }||z  dz  }| j                   xj                  |z  c_        | j                   | j                  xj                  |z  c_        yy)zTRescale initial weight scale. It is unclear why it helps but it certainly does.
          ?N)weightstddetachdatabias)conv	referencerB   scales       r   rescale_convrI   F   sa     ++//

"
"
$C9_s"EKKyy		% r    c                     | j                         D ]W  }t        |t        j                  t        j                  t        j
                  t        j                  f      sLt        ||       Y y N)modules
isinstancer   Conv1dConvTranspose1dConv2dConvTranspose2drI   )modulerG   subs      r   rescale_modulerT   P   sI    ~~ )cBIIr'9'9299bFXFXYZi()r    c            	       B     e Zd ZdZ	 	 	 ddedededef fdZd Z xZS )	DConva  
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.
    channelscompressdepthinitc                 (   t         |           |dz  dk(  sJ || _        || _        t	        |      | _        |dkD  }d }|rd }t        ||z        }|
rt        j                  }nt        j                  }t        j                  g       | _        t        | j
                        D ]  }|rd|z  nd}||dz  z  }t        j                  |||||       ||       |       t        j                  |d|z  d       |d|z        t        j                  d      t        ||      g}|r|j!                  dt#        |||             |	r|j!                  dt%        |dd	d
             t        j&                  | }| j                  j)                  |        y)a  
        Args:
            channels: input/output channels for residual branch.
            compress: amount of channel compression inside the branch.
            depth: number of layers in the residual branch. Each layer has its own
                projection, and potentially LSTM and attention.
            init: initial scale for LayerNorm.
            norm: use GroupNorm.
            attn: use LocalAttention.
            heads: number of heads for the LocalAttention.
            ndecay: number of decay controls in the LocalAttention.
            lstm: use LSTM.
            gelu: Use GELU activation.
            kernel: kernel size for the (dilated) convolutions.
            dilate: if true, use dilation, increasing with the depth.
        r   r   r   c                 *    t        j                         S rK   r   Identityds    r   <lambda>z DConv.__init__.<locals>.<lambda>y   s    BKKM r    c                 .    t        j                  d|       S )Nr   r   	GroupNormr_   s    r   ra   z DConv.__init__.<locals>.<lambda>{   s    Q 2 r    )dilationpaddingr"   )headsndecay   T)r   r   r   N)r   r   rW   rX   absrY   intr   GELUReLU
ModuleListr   r'   rN   GLUr	   insert
LocalStater   
Sequentialr(   )r   rW   rX   rY   rZ   normattnrg   rh   r   gelukerneldilatenorm_fnhiddenactr`   re   rf   modslayerr   s                        r   r   zDConv.__init__]   sl   ( 	zQ  Z
 *2GX() ''C''CmmB'tzz" 	&A!'qAvQH&A+.G		(FFXwW		&!h,2H%rvvay8T*D Az&fMNAuVA4PQMM4(EKKu%	&r    c                 >    | j                   D ]  }| ||      z   } |S rK   )r   )r   r+   r|   s      r   r8   zDConv.forward   s'    [[ 	EE!HA	r    )r   r   -C6?TFr   r   FTr"   T)	r9   r:   r;   r<   rk   floatr   r8   r=   r>   s   @r   rV   rV   V   s=     Z^LP"&8& 8& 8&# 8&QV 8&tr    rV   c            	       <     e Zd ZdZddedededef fdZd Z xZS )	rq   a  Local state allows to have attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).

    Also a failed experiments with trying to provide some frequency based attention.
    rW   rg   nfreqsrh   c                    t         |           ||z  dk(  s	J ||f       || _        || _        || _        t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _	        |rt        j                  |||z  d      | _
        |rt        j                  |||z  d      | _        | j                  j                  xj                  dz  c_        | j                  j                  J d| j                  j                  j                  d d  t        j                  |||z  z   |d      | _        y )Nr   r   g{Gz?)r   r   rg   r   rh   r   rN   contentquerykeyquery_freqsquery_decayrA   rD   rE   proj)r   rW   rg   r   rh   r   s        r   r   zLocalState.__init__   s"   %1$7x&77$
yy8Q7YYx15
99Xx3!yy56>1ED!yy56>1ED##((D0(##((444,.D!!&&q)IIh71E	r    c                    |j                   \  }}}| j                  }t        j                  ||j                  |j
                        }|d d d f   |d d d f   z
  }| j                  |      j                  ||d|      }| j                  |      j                  ||d|      }	t        j                  d|	|      }
|
|	j                   d   dz  z  }
| j                  rt        j                  d| j                  dz   |j                  |j
                        }t        j                  dt        j                  z  |z  |j                  ddd      z        }| j                  |      j                  ||d|      | j                  dz  z  }|
t        j                  d||      z  }
| j                  rt        j                  d| j                  dz   |j                  |j
                        }| j!                  |      j                  ||d|      }t        j"                  |      dz  }|j                  ddd       |j%                         z  | j                  dz  z  }|
t        j                  d||      z  }
|
j'                  t        j(                  ||
j                  t        j*                        d       t        j,                  |
d	      }| j/                  |      j                  ||d|      }t        j                  d
||      }| j                  r/t        j                  d|      }t        j0                  ||gd      }|j3                  |d|      }|| j5                  |      z   S )N)devicedtyper#   zbhct,bhcs->bhtsr   r@   r   zfts,bhfs->bhtsir   zbhts,bhct->bhcszbhts,fts->bhfs)r$   rg   r)   aranger   r   r   viewr   einsumr   cosmathpir   rh   r   sigmoidrj   masked_fill_eyeboolsoftmaxr   r*   r&   r   )r   r+   r,   r-   r.   rg   indexesdeltaquerieskeysdotsperiodsfreq_kernelfreq_qdecaysdecay_qdecay_kernelweightsr   resulttime_sigs                        r   r8   zLocalState.forward   s   ''1a

,,qA4 747#33**Q-$$Qr15xx{5"a0||-tW=

1s"";;ll1dkkAoahhaggVG))AK%$7',,r1a:P$PQK%%a(--aA>PSASSFELL!1;GGD;;\\!T[[1_QXXQWWUF&&q)..q%Q?GmmG,q0G#[[Q22UYY[@4;;PSCSSLELL!1<IID 	%))AdkkLdS--!,,,q/&&q%Q7/'B;;||$4g{KHYY115F2q)499V$$$r    )r   r   r   )r9   r:   r;   r<   rk   r   r8   r=   r>   s   @r   rq   rq      s3    
F FS Fc Fs F&"%r    rq   c                   r     e Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	       Zd Zd Zd fd	Z xZS )Demucsc           
         t         &|           || _        || _        || _        |
| _        |	| _        || _        || _        || _	        || _
        || _        || _        t        j                         | _        t        j                         | _        t        j                         | _        |rt        j$                  d      }d}nt        j&                         }d}|rt        j(                  }nt        j&                  }|}d}t+        |      D ]  }d } ||k\  rfd} g }!|!t        j,                  ||||	       | |       |       gz  }!||k\  }"||k\  }#|dz  r|!t/        |||||"|#      gz  }!|r)|!t        j,                  |||z  d       | ||z        |gz  }!| j                  j1                  t        j2                  |!        g }$|dkD  r|}%nt5        | j                        |z  }%|r1|$t        j,                  |||z  d|
z  dz   |
       | ||z        |gz  }$|dz  r|$t/        |||||"|#      gz  }$|$t        j6                  ||%||	|      gz  }$|dkD  r|$ | |%       |       gz  }$| j                   j9                  dt        j2                  |$        |}t;        ||z        } |}|rt=        ||      | _        nd	| _        |rtA        | |
       y	y	)ab	  
        Args:
            sources (list[str]): list of source names
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of encoder/decoder layers
            growth (float): multiply (resp divide) number of channels by that
                for each layer of the encoder (resp decoder)
            depth (int): number of layers in the encoder and in the decoder.
            rewrite (bool): add 1x1 convolution to each layer.
            lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated
                by default, as this is now replaced by the smaller and faster small LSTMs
                in the DConv branches.
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time steps.
            gelu: use GELU activation function.
            glu (bool): use glu instead of ReLU for the 1x1 rewrite conv.
            norm_starts: layer at which group norm starts being used.
                decoder layers are numbered in reverse order.
            norm_groups: number of groups for group norm.
            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
            dconv_depth: depth of residual DConv branch.
            dconv_comp: compression of DConv branch.
            dconv_attn: adds attention layers in DConv branch starting at this layer.
            dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
            dconv_init: initial scale for the DConv branch LayerScale.
            normalize (bool): normalizes the input audio on the fly, and scales back
                the output by the same amount.
            resample (bool): upsample x2 the input and downsample /2 the output.
            rescale (float): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`.
            samplerate (int): stored as meta information for easing
                future evaluations of the model.
            segment (float): duration of the chunks of audio to ideally evaluate the model on.
                This is used by `demucs.apply.apply_model`.
        r   r   r   r   c                 *    t        j                         S rK   r]   r_   s    r   ra   z!Demucs.__init__.<locals>.<lambda>F  s     r    c                 0    t        j                  |       S rK   rc   )r`   norm_groupss    r   ra   z!Demucs.__init__.<locals>.<lambda>H  s    BLLa$@ r    )rY   rZ   rX   rt   r   )rf   N)rG   )!r   r   audio_channelssourceskernel_sizecontextr2   rY   resamplerW   	normalize
sampleratesegmentr   rn   encoderdecoderskip_scalesro   rm   rl   r'   rN   rV   r(   rr   lenrO   rp   rk   r   r   rT   )'r   r   r   rW   growthrY   rewritelstm_layersr   r2   r   ru   glunorm_startsr   
dconv_modedconv_depth
dconv_comp
dconv_attn
dconv_lstm
dconv_initr   r   rescaler   r   
activationch_scaleact2in_channelsrf   indexrx   encodert   r   decodeout_channelsr   s'                 `                       r   r   zDemucs.__init__   s   X 	,&
  "$}}}}==?AJHJH77D77D$5\ (	.E-G#@F		+xfE! F
 J&DJ&DA~5:*44dL M MIIh8(;Q?Hx/0*> > LLv 67Fqy*"4<<0>AIIh8(;Q[1_V]^Hx/0*> > A~5:*44dL M Mr))(L"FG= > >Fqy7<0$&99LL2==&#9:"K6H,-HQ(	.T h4DIDI473 r    c                    | j                   r|dz  }t        | j                        D ]@  }t        j                  || j
                  z
  | j                  z        dz   }t        d|      }B t        | j                        D ]!  }|dz
  | j                  z  | j
                  z   }# | j                   rt        j                  |dz        }t        |      S )aX  
        Return the nearest valid length to use with the model so that
        there is no time steps left over in a convolution, e.g. for all
        layers, size of the input - kernel_size % stride = 0.

        Note that input are automatically padded if necessary to ensure that the output
        has the same length as the input.
        r   r   )	r   r'   rY   r   ceilr   r2   maxrk   )r   length_idxs       r   valid_lengthzDemucs.valid_lengthx  s     ==aKFtzz" 	$AYY)9)9 9T[[HIAMFF^F	$ $ 	CCqjDKK/$2B2BBF	C ==YYvz*F6{r    c                    |}|j                   d   }| j                  rE|j                  dd      }|j                  dd      }|j                  dd      }||z
  d|z   z  }nd}d}| j	                  |      |z
  }t        j                  ||dz  ||dz  z
  f      }| j                  rt        j                  |dd      }g }| j                  D ]  }	 |	|      }|j                  |        | j                  r| j                  |      }| j                  D ]*  }
|j                  d      }t        ||      } |
||z         }, | j                  rt        j                  |dd      }||z  |z   }t        ||      }|j!                  |j#                  d      t%        | j&                        | j(                  |j#                  d            }|S )Nr#   r   T)r   keepdimgh㈵>r   r   )r$   r   meanrB   r   Fpadr   juliusresample_fracr   r(   r   r   popr   r   sizer   r   r   )r   mixr+   r   monor   rB   r   savedr   r   r   s               r   r8   zDemucs.forward  s   >>88480D99T92D((r4(0CTdSj)ADC!!&)F2EE!eqj%%1*"456==$$Q1-All 	Fq	ALLO	 99		!All 	!F99R=DtQ'Dq4x A	!
 ==$$Q1-AGdN6"FF166!9c$,,/1D1DaffRjQr    c                     t        | j                        D ]B  }dD ];  }dD ]4  }| d| d| }| d| d| }||v s||vs!|j                  |      ||<   6 = D t        |   ||       y )N)r   r   )rE   rA   .z.3.z.2.)strict)r'   rY   r   r   load_state_dict)	r   stater   r   abnewoldr   s	           r   r   zDemucs.load_state_dict  s    $ 	4C+ 4+ 4ACqS,CCqS,Ce|5(8%*YYs^c
	44	4 	f5r    )r   @   g       @   Tr      r   r   TTr   r   r   r   r   r   r   r~   TTg?iD  (   )T)	r9   r:   r;   r   r   r   r8   r   r=   r>   s   @r   r   r      sx     !" !EY4 Y4v.%N	6 	6r    r   )r   typingtpr   r)   r   torch.nnr   r   statesr   utilsr   r   transformerr	   Moduler   rI   rT   rV   rq   r    r    r   <module>r      ss         $   & #/BII /d )DBII DN;% ;%|d6RYY d6r    