@@ -20,18 +20,44 @@ type EventType = "Epoch" | "Iteration" | "Marker"
2020type Detail = { epoch : number ; step : number ; nSteps : number ; ip : string }
2121export type TorchEvent = Event < EventType , Detail >
2222
23- function findPrevious ( M : TorchEvent [ ] , ip : TorchEvent [ "ip" ] , type : EventType ) {
23+ function findPrevious (
24+ M : TorchEvent [ ] ,
25+ ip : TorchEvent [ "ip" ] ,
26+ type : EventType ,
27+ state : TorchEvent [ "state" ] ,
28+ step = - 1 ,
29+ epoch = - 1
30+ ) {
2431 for ( let idx = M . length - 1 ; idx >= 0 ; idx -- ) {
2532 const evt = M [ idx ]
26- if ( evt . type === type && evt . ip === ip ) {
33+ if (
34+ evt . type === type &&
35+ evt . ip === ip &&
36+ evt . state === state &&
37+ ( step === - 1 || evt . step === step ) &&
38+ ( epoch === - 1 || evt . epoch === epoch )
39+ ) {
2740 return evt
2841 }
2942 }
3043}
3144
32- function findEpoch ( M : TorchEvent [ ] , ip : TorchEvent [ "ip" ] ) {
33- const evt = findPrevious ( M , ip , "Epoch" )
34- return evt ? evt . step : - 1
45+ function findEpoch ( M : TorchEvent [ ] , ip : TorchEvent [ "ip" ] , state : TorchEvent [ "state" ] = "InProgress" , step ?: number ) {
46+ return findPrevious ( M , ip , "Epoch" , state , step )
47+ }
48+
49+ class TorchEventImpl implements TorchEvent {
50+ public constructor (
51+ public readonly name : string ,
52+ public readonly ip : string ,
53+ public readonly type : EventType ,
54+ public readonly step : number ,
55+ public readonly nSteps : number ,
56+ public readonly epoch : number ,
57+ public readonly timestamp : number ,
58+ public state : TorchEvent [ "state" ] = "InProgress" ,
59+ public readonly message = `Epoch ${ epoch } ${ type !== "Epoch" ? ` - ${ type } ${ step } ` : "" } of ${ nSteps } `
60+ ) { }
3561}
3662
3763export function collateEvent ( M : TorchEvent [ ] , line : string ) {
@@ -46,7 +72,7 @@ export function collateEvent(M: TorchEvent[], line: string) {
4672 const epoch = - 1
4773 const step = - 1
4874 const nSteps = - 1
49- const state = "InProgress "
75+ const state = "Done "
5076 M . push ( { ip, name, message, state, type, hidden, timestamp, epoch, step, nSteps } )
5177 return M
5278 }
@@ -56,36 +82,63 @@ export function collateEvent(M: TorchEvent[], line: string) {
5682 const ip = match [ 1 ]
5783 const type = match [ 2 ] as EventType
5884 // const percentage = parseInt(match[3], 10)
59- const step = parseInt ( match [ 4 ] , 10 )
85+ const step = parseInt ( match [ 4 ] , 10 ) - ( type === "Epoch" ? 0 : 1 )
6086 const nSteps = parseInt ( match [ 5 ] , 10 )
6187
62- const epoch = type === "Epoch" ? step : findEpoch ( M , ip )
63- const timestampMarker = findPrevious ( M , ip , "Marker" )
64-
65- const event = {
66- name : `Torch Training on ${ ip } ` ,
67- message : `Epoch ${ epoch } ${ type !== "Epoch" ? ` - ${ type } ${ step } ` : "" } of ${ nSteps } ` ,
68- ip,
69- type,
70- step,
71- nSteps,
72- epoch,
73- timestamp : timestampMarker ? timestampMarker . timestamp : Date . now ( ) ,
74- state : "InProgress" as const ,
88+ const epoch =
89+ type === "Epoch"
90+ ? { step, nSteps, state : "InProgress" }
91+ : findEpoch ( M , ip ) || { step : - 1 , nSteps : 0 , state : "InProgress" }
92+ const name = `Torch Training on ${ ip } `
93+ const timestampMarker = findPrevious ( M , ip , "Marker" , "Done" )
94+ const timestamp = timestampMarker ? timestampMarker . timestamp : Date . now ( )
95+
96+ if ( type === "Iteration" ) {
97+ epoch . state = "InProgress"
98+ } else if ( step > 0 ) {
99+ const thisEpoch = findEpoch ( M , ip , "Pending" , step )
100+ if ( thisEpoch ) {
101+ thisEpoch . state = "InProgress"
102+ }
75103 }
76104
77105 // find previous by ip and mark it Done
78- const prev = findPrevious ( M , ip , type )
79- if ( prev ) {
80- prev . state = "Done"
81-
82- if ( type === "Epoch" && prev . step === step ) {
83- // strange, torch seems to repeat the e.g. Epoch 6/6 event...
84- return M
106+ if ( step > 0 ) {
107+ const prev =
108+ type === "Iteration"
109+ ? findPrevious ( M , ip , type , "Pending" , step - 1 , epoch . step ) // previous iteration in this epoch
110+ : findPrevious ( M , ip , type , "InProgress" , step - 1 , epoch . step - 1 ) // previous epoch
111+ if ( prev ) {
112+ prev . state = "Done"
113+ } else if ( type === "Iteration" && step === nSteps - 1 ) {
114+ // torch repeat the last step to indicate fully done
115+ const prev = findPrevious ( M , ip , type , "Pending" , nSteps - 1 , epoch . step ) // last iteration
116+ if ( prev ) {
117+ prev . state = "Done"
118+ }
85119 }
86120 }
87121
88- M . push ( event )
122+ if ( type === "Epoch" && step === 0 ) {
123+ // first Epoch
124+ M . push ( new TorchEventImpl ( name , ip , type , step , nSteps , epoch . step , timestamp ) )
125+ } else if ( type === "Iteration" && step === 0 && epoch . step === 0 ) {
126+ // first Iteration of first Epoch: pre-fill the remaining Epochs and Iterations
127+ M . push ( new TorchEventImpl ( name , ip , type , step , nSteps , epoch . step , timestamp , "Pending" ) )
128+
129+ // pre-fill remaining Iterations for epoch 0
130+ for ( let iterIdx = 1 ; iterIdx < nSteps ; iterIdx ++ ) {
131+ M . push ( new TorchEventImpl ( name , ip , "Iteration" , iterIdx , nSteps , 0 , timestamp , "Pending" ) )
132+ }
133+
134+ // now pre-fill the remaining Epochs
135+ for ( let epochIdx = 1 ; epochIdx < epoch . nSteps ; epochIdx ++ ) {
136+ M . push ( new TorchEventImpl ( name , ip , "Epoch" , epochIdx , epoch . nSteps , epochIdx , timestamp , "Pending" ) )
137+ for ( let iterIdx = 1 ; iterIdx < nSteps ; iterIdx ++ ) {
138+ M . push ( new TorchEventImpl ( name , ip , "Iteration" , iterIdx , nSteps , epochIdx , timestamp , "Pending" ) )
139+ }
140+ }
141+ }
89142 }
90143
91144 return M
0 commit comments