Skip to content

Commit 623f1c9

Browse files
committed
feat: update event streaming UI to improve display of incremental torch progress
The initial live progress added a UI element for every torch step. This PR refines that work to pre-fill UI elements for all epochs and iterations. It marks each epoch as InProgress while that epoch's iterations are being processed, and so on. This makes it appear more like a progress bar.
1 parent 13a3533 commit 623f1c9

File tree

4 files changed

+101
-33
lines changed

4 files changed

+101
-33
lines changed

plugins/plugin-codeflare/src/controller/events/Event.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* limitations under the License.
1515
*/
1616

17-
export type State = "InProgress" | "Done" | "Error"
17+
export type State = "Pending" | "InProgress" | "Done" | "Error"
1818

1919
type Event<T extends string, Detail> = Detail & {
2020
name: string

plugins/plugin-codeflare/src/controller/events/Events.tsx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import { join } from "path"
1919
import stripAnsi from "strip-ansi"
2020
import { Arguments } from "@kui-shell/core"
2121

22+
import { GenericEvent } from "./Event"
2223
import parseKubeEvents, { collateEvent as collateKubeEvent, KubeEvent } from "./kube"
2324
import parseTorchEvents, { collateEvent as collateTorchEvent, TorchEvent } from "./torch"
2425

@@ -32,7 +33,9 @@ interface EventState {
3233

3334
type State = EventState & {
3435
nKubeEvents: number
36+
nNotPendingKubeEvents: number
3537
nTorchEvents: number
38+
nNotPendingTorchEvents: number
3639
catastrophicError?: Error
3740
}
3841

@@ -57,6 +60,8 @@ class Events extends React.PureComponent<Props, State> {
5760
kubeEvents,
5861
torchEvents,
5962
nKubeEvents: kubeEvents.length,
63+
nNotPendingKubeEvents: this.nNotPending(kubeEvents),
64+
nNotPendingTorchEvents: this.nNotPending(torchEvents),
6065
nTorchEvents: torchEvents.length,
6166
}
6267

@@ -83,6 +88,7 @@ class Events extends React.PureComponent<Props, State> {
8388
toBeProcessed.forEach((line) => collateKubeEvent(curState.kubeEvents, line))
8489
return {
8590
nKubeEvents: curState.kubeEvents.length,
91+
nNotPendingKubeEvents: this.nNotPending(curState.kubeEvents),
8692
}
8793
})
8894
}, queueFlushHysteresis)
@@ -108,6 +114,7 @@ class Events extends React.PureComponent<Props, State> {
108114
toBeProcessed.forEach((line) => collateTorchEvent(curState.torchEvents, line))
109115
return {
110116
nTorchEvents: curState.torchEvents.length,
117+
nNotPendingTorchEvents: this.nNotPending(curState.torchEvents),
111118
}
112119
})
113120
}, queueFlushHysteresis)
@@ -116,6 +123,10 @@ class Events extends React.PureComponent<Props, State> {
116123
}
117124
}
118125

126+
private nNotPending(events: GenericEvent[]) {
127+
return events.reduce((N, _) => N + (_.state !== "Pending" ? 1 : 0), 0)
128+
}
129+
119130
public static getDerivedStateFromError(error: Error) {
120131
return { catastrophicError: error }
121132
}

plugins/plugin-codeflare/src/controller/events/torch.ts

Lines changed: 81 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,44 @@ type EventType = "Epoch" | "Iteration" | "Marker"
2020
type Detail = { epoch: number; step: number; nSteps: number; ip: string }
2121
export type TorchEvent = Event<EventType, Detail>
2222

23-
function findPrevious(M: TorchEvent[], ip: TorchEvent["ip"], type: EventType) {
23+
function findPrevious(
24+
M: TorchEvent[],
25+
ip: TorchEvent["ip"],
26+
type: EventType,
27+
state: TorchEvent["state"],
28+
step = -1,
29+
epoch = -1
30+
) {
2431
for (let idx = M.length - 1; idx >= 0; idx--) {
2532
const evt = M[idx]
26-
if (evt.type === type && evt.ip === ip) {
33+
if (
34+
evt.type === type &&
35+
evt.ip === ip &&
36+
evt.state === state &&
37+
(step === -1 || evt.step === step) &&
38+
(epoch === -1 || evt.epoch === epoch)
39+
) {
2740
return evt
2841
}
2942
}
3043
}
3144

32-
function findEpoch(M: TorchEvent[], ip: TorchEvent["ip"]) {
33-
const evt = findPrevious(M, ip, "Epoch")
34-
return evt ? evt.step : -1
45+
function findEpoch(M: TorchEvent[], ip: TorchEvent["ip"], state: TorchEvent["state"] = "InProgress", step?: number) {
46+
return findPrevious(M, ip, "Epoch", state, step)
47+
}
48+
49+
class TorchEventImpl implements TorchEvent {
50+
public constructor(
51+
public readonly name: string,
52+
public readonly ip: string,
53+
public readonly type: EventType,
54+
public readonly step: number,
55+
public readonly nSteps: number,
56+
public readonly epoch: number,
57+
public readonly timestamp: number,
58+
public state: TorchEvent["state"] = "InProgress",
59+
public readonly message = `Epoch ${epoch}${type !== "Epoch" ? ` - ${type} ${step}` : ""} of ${nSteps}`
60+
) {}
3561
}
3662

3763
export function collateEvent(M: TorchEvent[], line: string) {
@@ -46,7 +72,7 @@ export function collateEvent(M: TorchEvent[], line: string) {
4672
const epoch = -1
4773
const step = -1
4874
const nSteps = -1
49-
const state = "InProgress"
75+
const state = "Done"
5076
M.push({ ip, name, message, state, type, hidden, timestamp, epoch, step, nSteps })
5177
return M
5278
}
@@ -56,36 +82,63 @@ export function collateEvent(M: TorchEvent[], line: string) {
5682
const ip = match[1]
5783
const type = match[2] as EventType
5884
// const percentage = parseInt(match[3], 10)
59-
const step = parseInt(match[4], 10)
85+
const step = parseInt(match[4], 10) - (type === "Epoch" ? 0 : 1)
6086
const nSteps = parseInt(match[5], 10)
6187

62-
const epoch = type === "Epoch" ? step : findEpoch(M, ip)
63-
const timestampMarker = findPrevious(M, ip, "Marker")
64-
65-
const event = {
66-
name: `Torch Training on ${ip}`,
67-
message: `Epoch ${epoch}${type !== "Epoch" ? ` - ${type} ${step}` : ""} of ${nSteps}`,
68-
ip,
69-
type,
70-
step,
71-
nSteps,
72-
epoch,
73-
timestamp: timestampMarker ? timestampMarker.timestamp : Date.now(),
74-
state: "InProgress" as const,
88+
const epoch =
89+
type === "Epoch"
90+
? { step, nSteps, state: "InProgress" }
91+
: findEpoch(M, ip) || { step: -1, nSteps: 0, state: "InProgress" }
92+
const name = `Torch Training on ${ip}`
93+
const timestampMarker = findPrevious(M, ip, "Marker", "Done")
94+
const timestamp = timestampMarker ? timestampMarker.timestamp : Date.now()
95+
96+
if (type === "Iteration") {
97+
epoch.state = "InProgress"
98+
} else if (step > 0) {
99+
const thisEpoch = findEpoch(M, ip, "Pending", step)
100+
if (thisEpoch) {
101+
thisEpoch.state = "InProgress"
102+
}
75103
}
76104

77105
// find previous by ip and mark it Done
78-
const prev = findPrevious(M, ip, type)
79-
if (prev) {
80-
prev.state = "Done"
81-
82-
if (type === "Epoch" && prev.step === step) {
83-
// strange, torch seems to repeat the e.g. Epoch 6/6 event...
84-
return M
106+
if (step > 0) {
107+
const prev =
108+
type === "Iteration"
109+
? findPrevious(M, ip, type, "Pending", step - 1, epoch.step) // previous iteration in this epoch
110+
: findPrevious(M, ip, type, "InProgress", step - 1, epoch.step - 1) // previous epoch
111+
if (prev) {
112+
prev.state = "Done"
113+
} else if (type === "Iteration" && step === nSteps - 1) {
114+
// torch repeats the last step to indicate that it is fully done
115+
const prev = findPrevious(M, ip, type, "Pending", nSteps - 1, epoch.step) // last iteration
116+
if (prev) {
117+
prev.state = "Done"
118+
}
85119
}
86120
}
87121

88-
M.push(event)
122+
if (type === "Epoch" && step === 0) {
123+
// first Epoch
124+
M.push(new TorchEventImpl(name, ip, type, step, nSteps, epoch.step, timestamp))
125+
} else if (type === "Iteration" && step === 0 && epoch.step === 0) {
126+
// first Iteration of first Epoch: pre-fill the remaining Epochs and Iterations
127+
M.push(new TorchEventImpl(name, ip, type, step, nSteps, epoch.step, timestamp, "Pending"))
128+
129+
// pre-fill remaining Iterations for epoch 0
130+
for (let iterIdx = 1; iterIdx < nSteps; iterIdx++) {
131+
M.push(new TorchEventImpl(name, ip, "Iteration", iterIdx, nSteps, 0, timestamp, "Pending"))
132+
}
133+
134+
// now pre-fill the remaining Epochs
135+
for (let epochIdx = 1; epochIdx < epoch.nSteps; epochIdx++) {
136+
M.push(new TorchEventImpl(name, ip, "Epoch", epochIdx, epoch.nSteps, epochIdx, timestamp, "Pending"))
137+
for (let iterIdx = 1; iterIdx < nSteps; iterIdx++) {
138+
M.push(new TorchEventImpl(name, ip, "Iteration", iterIdx, nSteps, epochIdx, timestamp, "Pending"))
139+
}
140+
}
141+
}
89142
}
90143

91144
return M

plugins/plugin-codeflare/web/scss/components/Dashboard/Grid.scss

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ $fullWidth: 1em; /* $large * ($unit + $rgap) - $rgap */
6464
}
6565
}
6666

67-
@mixin Color($color) {
67+
@mixin Color($color, $important: "") {
6868
@include CFCellContent {
69-
background-color: $color;
69+
background-color: $color $important;
7070
}
7171
}
7272

@@ -91,7 +91,11 @@ $fullWidth: 1em; /* $large * ($unit + $rgap) - $rgap */
9191
@include Color(var(--color-base0B));
9292
}
9393
@include State("InProgress") {
94-
@include Color(var(--color-base09));
94+
@include Color(var(--color-base0A), !important);
95+
}
96+
@include State("Pending") {
97+
@include Color(var(--color-base04), !important);
98+
filter: saturate(0.3) brightness(0.95);
9599
}
96100

97101
@include Color(var(--color-base0D));
@@ -117,7 +121,7 @@ $fullWidth: 1em; /* $large * ($unit + $rgap) - $rgap */
117121
grid-column: span $tiny;
118122
@include CFCellContent {
119123
background-color: var(--color-base0D);
120-
filter: saturate(0.3) brightness(0.9);
124+
filter: saturate(0.5) brightness(0.85);
121125
height: 80%;
122126
}
123127
}

0 commit comments

Comments
 (0)