
    mi)9                     R   d Z ddlmZmZ ddlmZmZ ddlZddlm	Z	 ddl
Z
ddlZddlZddlZddlZddlmZ ddlZddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZm Z   ejB                  e"      Z# G d d      Z$ G d d      Z%de&fdZ'e G d d             Z( G d d      Z)y)z)Scheduling and job monitoring utilities.
    )contextmanager	ExitStack)	dataclassfieldN)Path)SlurmJob   )git_save)SlurmConfigSubmitRules)get_distrib_spec)DecoratedMain)try_load)XP_get_sigc                   <    e Zd Zdedej
                  e   fdZd Zy)_SubmitItTargetmainargvc                     |j                  |      | _        t               }t        |j                        t
        j                  d<   |t        j                  dd   |        y )NRANKr	   )	get_xpxpr   strrankosenvironsysr   )selfr   r   specs       C/var/www/stems/demucs_env/lib/python3.12/site-packages/dora/shep.py__call__z_SubmitItTarget.__call__#   sF    ++d#! !^

6    c                    t               j                  dk(  rH| j                  j                  j	                         r$| j                  j                  j                          t        j                  j                  | g|i |S )Nr   )	r   r   r   rendezvous_fileexistsunlinksubmitithelpersDelayedSubmission)r   argskwargss      r!   
checkpointz_SubmitItTarget.checkpoint,   s`    ""a'ww&&--/''..011$HHHHr#   N)	__name__
__module____qualname__r   tpSequencer   r"   r-    r#   r!   r   r   "   s$    ] "++c2B Ir#   r   c                   X    e Zd ZdZdefdZedefd       ZddZ	ddZ
ed        Zd	 Zy
)Sheepz[
    A Sheep is a specific run for a given XP. Sheeps are managed
    by the Shepherd.
    r   c                     || _         d | _        d | _        | j                  j	                         r=t        | j                        }t        |t              r|\  | _        | _        y || _        y y N)r   job_other_jobs	_job_filer&   r   
isinstancetuple)r   r   contents      r!   __init__zSheep.__init__9   s[    37DH>>  "t~~.G'5)-4*$*" #r#   returnc                     | j                   j                  | j                   j                  j                  j                  z  S r7   )r   folderdorashepjob_filer   s    r!   r:   zSheep._job_fileE   s)    ww~~ 1 1 : :::r#   c                    | j                   y| j                   j                  j                  | j                   j                  |      }|dk(  r*| j                  rt        d | j                  D              rd}|j                  d      ry|S )z1Return the current state of the `Sheep`.
        NUNKNOWNc              3   :   K   | ]  }|j                   d k7    yw)rG   N)state).0r8   s     r!   	<genexpr>zSheep.state.<locals>.<genexpr>P   s     Fc399	)Fs   MISSING	CANCELLED)r8   watcher	get_statejob_idr9   any
startswith)r   moderI   s      r!   rI   zSheep.stateI   sv     88  **488??DAI$"2"2FT5E5EFF "K(r#   c                     | j                   y| j                   j                  j                  | j                   j                  |      S )zDReturn True if the job is no longer running on the cluster.
        T)r8   rN   is_donerP   )r   rS   s     r!   rU   zSheep.is_doneY   s6     88xx''>>r#   c                 |    | j                   0| j                  j                  | j                   j                   dz  S y)z)Return the path to the main log.
        Nz
_0_log.out)r8   r   r(   rP   rE   s    r!   logz	Sheep.log`   s6     8877##(9&DDDr#   c                     d| j                   j                   d| j                          d}| j                  |d| j                  j                   dz  }|d| j                   j
                   dz  }|S )NzSheep(z, state=z, zsid=zargv=))r   sigrI   r8   rP   r   )r   outs     r!   __repr__zSheep.__repr__h   sj    tww{{m8DJJL><88T$((//*"--Ctww||nA&&
r#   N)standard)r.   r/   r0   __doc__r   r>   propertyr   r:   rI   rU   rW   r\   r3   r#   r!   r5   r5   4   sQ    
#2 
# ;4 ; ; ?  r#   r5   xc                      y)z7No logging logging function, passed to `Shepherd`.
    Nr3   )r`   s    r!   no_logrb   q   s     	r#   c                   N    e Zd ZU eed<    ee      Zej                  e
   ed<   y)	_JobArrayslurm_config)default_factorysheepsN)r.   r/   r0   r   __annotations__r   listrg   r1   Listr5   r3   r#   r!   rd   rd   w   s    "48FBGGEN8r#   rd   c                      e Zd ZdZefdedej                  egdf   fdZ	dej                  e   defdZd	edej                  e   fd
Zdedej                  e   fdZd Zedefd       ZdededefdZdej.                  fdZd Zedefd       Zedefd       Zedefd       Zdej>                  e   fdZ dedededejB                  fdZ"d Z#edefd       Z$d e%fd!Z&y)"Shepherdz
    Takes care of the little jobs.

    Args:
        main (DecoratedMain): main function decorated by Dora.
        log (callable): log function, if provided should take a single string
            argument.
    r   rW   Nc                 &   || _         | j                  j                  dd       | j                  j                  dd       | j                  j                  dd       || _        d| _        d | _        g | _        g | _	        | j                          y )NT)exist_okparentsF)r   _by_idmkdir_orphans_arraysrW   _in_job_array_existing_git_clone
_to_cancel
_to_submit_check_orphans)r   r   rW   s      r!   r>   zShepherd.__init__   s    	46T48D$7#(6: 68.0r#   r   r?   c                 r    t        |t              rJ | j                  j                  |      }t	        |      S )z
        Given a list of of arguments, return the matching `Sheep`,
        which will contain both information on the `dora.xp.XP`, and on
        the latest job associated with that XP.
        )r;   r   r   r   r5   )r   r   r   s      r!   get_sheep_from_argvzShepherd.get_sheep_from_argv   s2     dC(((YYd#Ryr#   rZ   c                 N    | j                   j                  |      }t        |      S )zj
        Returns a `Sheep` given the XP signature, if any exists, otherwise
        returns None.
        )r   get_xp_from_sigr5   )r   rZ   r   s      r!   get_sheep_from_sigzShepherd.get_sheep_from_sig   s"    
 YY&&s+Ryr#   rP   c                     | j                   |z  }|j                         r@|j                         j                  }| j                  j                  |      }t        |      S y)zu
        Returns the `Sheep` associated with the given `job_id`. If no sheep
        is found, returns None.
        N)rp   
is_symlinkresolvenamer   r|   r5   )r   rP   linkrZ   r   s        r!   get_sheep_from_job_idzShepherd.get_sheep_from_job_id   sN    
 {{V#??,,.%%C**3/B9r#   c                 @    t         j                  j                          y)zB
        Force an update of all job states with submitit.
        N)r   rN   updaterE   s    r!   r   zShepherd.update   s     	!r#   re   c              #      K   | j                   rJ | j                  j                  t        |             d| _         	 d d| _         y# d| _         w xY ww)z*Context manager to launch XP in job array.TNF)rt   rw   appendrd   )r   re   s     r!   	job_arrayzShepherd.job_array   sN      %%%%y67!	'!&DDs   :AA	 A		AAsheeprulesc                    |j                   |j                         }|dk(  r@|j                  rt        j	                  d|j                   j
                          d|_         n|dv rAt        j	                  d|j                   j
                   d       |j                  red|_         n]|j                  rQt        j	                  d|j                   j
                   d|        | j                  |j                          d|_         |j                   w| j                  s$| j                  j                  t        |             || j                  d	   j                  k(  sJ | j                  d	   j                  j                  |       yy)
z
        Decides whether to schedule a new job for the given sheep, based on the rules
        given in `rules`.
        Jobs are actually only scheduled once the `commit()` method is called.
        N	COMPLETEDz"Ignoring previously completed job )FAILEDrM   OUT_OF_MEMORYTIMEOUTrL   	NODE_FAILzPrevious job z failed or was canceledzCancelling previous job z with status )r8   rI   replace_doneloggerdebugrP   retryreplacecancel_lazyrt   rw   r   rd   re   rg   )r   r   re   r   rI   s        r!   maybe_submit_lazyzShepherd.maybe_submit_lazy   s<    99 KKME#%%LL#EeiiFVFVEW!XY $EI ( (}UYY-=-=,>>UVW;; $EI==LL#;EII<L<L;M][`Za!bc$$UYY/ $EI99%%&&y'>?4??2#6#C#CCCCOOB&&--e4	 r#   r8   c                 :    | j                   j                  |       y)z]
        Cancel a job. The job is actually cancelled only when `commit()` is called.
        N)rv   r   )r   r8   s     r!   r   zShepherd.cancel_lazy   s     	s#r#   c                     | j                   r"| j                  | j                          g | _         d| _        | j                  r:| j                  j	                  d      }| j                  |       | j                  r9yy)zu
        Commit all changes registered so far with either `maybe_submit_lazy()`
        and `cancel_lazy()`.
        Nr   )rv   _cancelru   rw   pop_submit)r   r   s     r!   commitzShepherd.commit   sZ    
 ??LL) DO#' oo++A.ILL# oor#   c                     | j                   j                  j                  | j                   j                  j                  j                  z  S r7   )r   rB   dirrC   by_idrE   s    r!   rp   zShepherd._by_id   s/    yy~~!!DIINN$7$7$=$===r#   c                     | j                   j                  j                  | j                   j                  j                  j                  z  S r7   )r   rB   r   rC   orphansrE   s    r!   rr   zShepherd._orphans   s/    yy~~!!DIINN$7$7$?$???r#   c                     | j                   j                  j                  | j                   j                  j                  j                  z  S r7   )r   rB   r   rC   arraysrE   s    r!   rs   zShepherd._arrays   s/    yy~~!!DIINN$7$7$>$>>>r#   jobsc                     dg|D cg c]  }|j                    c}z   }t        j                  ddj                  |             t	        j
                  |d       y c c}w )Nscancelz
Running %s Tcheck)rP   r   r   joinsprun)r   r   r8   
cancel_cmds       r!   r   zShepherd._cancel   sH    [$#?3CJJ#??
\388J#78
z& $@s   Ar   rA   c                 v   dt         j                  d<   t        |j                        }t	        j
                  ||j                  d            }|j                  }|dkD  r|dz  dk7  rt        d      |dz  |d<   d}n|}d	|d<   |j                  }|r|j                  |z  }	|	 d
|d<   d| |d<   |j                  r$d	|d<   |j                  3||j                  z  |d<   n ||d<   |j                  |j                  |d<   |d= |d= |d= |d= t        j                  d|        |j                  d|dd| |S )N1SLURM_KILL_BAD_EXITmax_num_timeout)rA   r      r   z.Can only take <= 8 gpus, or multiple of 8 gpusnodesr	   GBmemzgpu:gresntasks_per_nodecpus_per_taskgpusmem_per_gpucpus_per_gpuone_task_per_nodezSlurm parameters %rT)job_namestderr_to_stdoutr3   )r   r   dict__dict__r(   SlurmExecutorr   r   
ValueErrorr   r   r   r   r   r   update_parameters)
r   r   rA   re   r,   executorr   gpus_per_noder   r   s
             r!   _get_submitit_executorzShepherd._get_submitit_executor  s~   ,/

()l++,))6::6G+HJ  !8ax1} !QRR"aiF7OM MF7O"..**]:C"e2JF5M/v))()F$%))1*7,:S:S*S'(5F$%))1*6*C*C'6N=!>"&'*F3""" 	!	 	 r#   c                    | j                   j                         D ]  }|j                  }t        j	                  d| d       t        j                  ddt        j                         d|dddgd	d	
      }|j                  j                         j                         j                  d      D cg c]  }|s|	 }}|r4t        j	                  d| d       t        j                  dg|z   d	       |j                           yc c}w )zCheck for orphaned jobs.zFound dirty tag z`, meaning a job might have been scheduled but Dora or Slurm crashed before the job id was saved.squeuez-uz-nz-oz%iz-hT)capture_outputr   
zFound orphan job ids z, will cancelr   r   N)rr   iterdirr   r   warningr   r   r   getloginstdoutdecodestripsplitr'   )r   dirtyr   proclineidss         r!   rx   zShepherd._check_orphans-  s    ]]**, 
	E::DNN-dV 4T T U668T2;;=$dDRVW)-T;D$(KK$6$6$8$>$>$@$F$Ft$LUDPT4UCU!6se=IJ	{S(5LLN
	 Vs   .D6Dc              #      K   | j                   |z  }|j                          	 d |j                          y# |j                          w xY ww)z,Context manager to enter a potential orphan.N)rr   touchr'   )r   r   tokens      r!   _enter_orphanzShepherd._enter_orphan;  s9      $	LLNELLNs    A8 AA

Ar   c           
      
   |j                   }|j                  }|sy t        |      dkD  }|d   }| j                  j	                  |j
                         |j
                  j                  j                  t        fd|D              sJ d       |r7t        t        |D cg c]  }|j
                  j                   c}            }n|j
                  j                  }|r| j                  j                  dz   |z   }n| j                  j                  dz   |z   }|r| j                  |z  }	n|j
                  j                  }	|	j                  d       |D ]^  }|j
                  }
| j                  j	                  |
       |
j                   j#                         sE|
j                   j%                          ` | j'                  ||	|      }g }r0| j(                  $t        j*                  | j                        | _        | j-                  |      5  t/               5 }r<| j(                  J |j1                  t        j2                  | j(                               |r|j1                  |j5                                |j                   D ]  }r8| j(                  J t        j6                  |j
                  | j(                         |j9                  |j;                  t=               | j                  |j
                  j>                                	 d d d        tA        ||      D ]  \  }}tC        jD                  ||ftG        |jH                  d	             tJ        jM                  d
|jN                         ||_(        ||_)        | jT                  |jN                  z  }|}|jW                  |j
                  jX                  j[                                |rg|j
                  jX                  |	j                  z  }|j#                         r#|j[                         |	j[                         k(  sJ |jW                  |	       |j
                  j\                  }|j#                         r|j%                          |jW                  |	       | j                  j_                  |j
                        }| ja                  d|jN                   d|j
                  j                   d|         	 d d d        y c c}w # 1 sw Y   xY w# 1 sw Y   y xY w)Nr	   r   c              3   d   K   | ]'  }|j                   j                  j                  k(   ) y wr7   )r   rB   r
   )rJ   otheruse_git_saves     r!   rK   z#Shepherd._submit.<locals>.<genexpr>O  s$     Ne588==))\9Ns   -0z?All jobs inside an array must have the same value for git_save._array__T)rn   wbzCreated job with id %szScheduled job z for sheep /)1rg   re   lenr   init_xpr   rB   r
   allr   sortedrZ   r   rs   _xp_submititrq   r%   r&   r'   r   ru   get_new_cloner   r   enter_contextenter_clonebatchassign_cloner   submitr   r   zippickledumpopenr:   r   r   rP   r8   r9   rp   
symlink_torA   r   _latest_submititget_namerW   )r   r   rg   re   is_arrayfirstr   name_sigr   submitit_folderr   r   r   stackr8   r   submitit_linklatestr   s                     @r!   r   zShepherd._submitE  s
   !! --v;?q			%((#xx}}--NvNN 	PO	PN &'I'I JKHxx||H99>>I-8D99>>C'(2D"llT1O#hh33Ot, 	,EBIIb!!!((*""))+		, ..t_lS&(D44<'/'='=dii'HD$% $	X 
^33???''(<(<T=U=U(VW''(89&-- ^E##77CCC --ehh8P8PQKK0A499ehhmm \]	^
^ "&$/ X
sS$Keoot)DE5szzB	$(!{{SZZ/ 7 7 9: &+XX__7K7K%KM$++-,446/:Q:Q:SSSS%00A22==?MMO!!/2yy))%((3>#**[aPTvVW/X$	X $	X1 (J2
^ 
^$	X $	Xs,    S+S=C3S0GS=0S:	5S==T)'r.   r/   r0   r^   rb   r   r1   Callabler   r>   r2   r5   rz   Optionalr}   r   r   r   r   r   r   r   r(   r   r   r   r_   r   rp   rr   rs   rj   r   r   r   rx   r   rd   r   r3   r#   r!   rl   rl   }   s    MS ] cUD[1I C(8 U c bkk%.@ 
C 
BKK4F 
" 'k ' '5u 5K 5P[ 5:$x00 $$ > > > @$ @ @ ? ? ?'BGGH- '
&3 & &-8&=E=S=S&P #  JX JXr#   rl   )*r^   
contextlibr   r   dataclassesr   r   loggingpathlibr   r   r   
subprocessr   r   typingr1   r(   r    r
   confr   r   distribr   r   r   utilsr   r   r   r   	getLoggerr.   r   r   r5   r   rb   rd   rl   r3   r#   r!   <module>r
     s    0 (    	  
     * %    
		8	$I I$: :z	c 	 9 9 9
RX RXr#   