@@ -42,6 +42,7 @@ import (
 	"github.com/cortexlabs/cortex/pkg/types/userconfig"
 	"github.com/cortexlabs/cortex/pkg/workloads"
 	"github.com/cortexlabs/yaml"
+	cache "github.com/patrickmn/go-cache"
 	kbatch "k8s.io/api/batch/v1"
 	kcore "k8s.io/api/core/v1"
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
@@ -52,13 +53,21 @@ import (
 const (
 	_enqueuerContainerName  = "enqueuer"
 	_deadlineExceededReason = "DeadlineExceeded"
+	_cacheDuration          = 60 * time.Second
 )
 
+var totalBatchCountCache *cache.Cache
+
+func init() {
+	totalBatchCountCache = cache.New(_cacheDuration, _cacheDuration)
+}
+
 type batchJobStatusInfo struct {
 	QueueExists     bool
 	EnqueuingStatus batch.EnqueuingStatus
 	EnqueuerJob     *kbatch.Job
 	WorkerJob       *kbatch.Job
+	TotalBatchCount int
 }
 
 func (r *BatchJobReconciler) checkIfQueueExists(batchJob batch.BatchJob) (bool, error) {
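The new `totalBatchCountCache` is an in-memory TTL cache from `github.com/patrickmn/go-cache`, created once in `init()` with a 60-second default expiration and a 60-second cleanup interval. For readers unfamiliar with that package, here is a minimal standalone sketch of the calls this commit relies on (the key and value below are made up for illustration):

```go
package main

import (
	"fmt"
	"time"

	cache "github.com/patrickmn/go-cache"
)

func main() {
	// New(defaultExpiration, cleanupInterval): entries default to a 60s TTL,
	// and a background janitor purges expired entries every 60s.
	c := cache.New(60*time.Second, 60*time.Second)

	// Set stores a value under a key with an explicit TTL.
	c.Set("batch-job-123/batch_count", 42, 60*time.Second)

	// Get returns (interface{}, bool); a type assertion recovers the stored int.
	if v, found := c.Get("batch-job-123/batch_count"); found {
		fmt.Println("cached total batch count:", v.(int))
	}
}
```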
@@ -381,11 +390,12 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B
 	case batch.EnqueuingFailed:
 		batchJob.Status.Status = status.JobEnqueueFailed
 		batchJob.Status.EndTime = statusInfo.EnqueuerJob.Status.CompletionTime
+	case batch.EnqueuingDone:
+		batchJob.Status.TotalBatchCount = statusInfo.TotalBatchCount
 	}
 
 	worker := statusInfo.WorkerJob
 	if worker != nil {
-		batchJob.Status.StartTime = worker.Status.StartTime // assign right away, because it's a pointer
 		batchJob.Status.EndTime = worker.Status.CompletionTime // assign right away, because it's a pointer
 
 		if worker.Status.Failed == batchJob.Spec.Workers {
@@ -423,9 +433,7 @@ func (r *BatchJobReconciler) updateStatus(ctx context.Context, batchJob *batch.B
 			batchJob.Status.Status = status.JobRunning
 		}
 
-		pendingWorkers := batchJob.Spec.Workers - (worker.Status.Active + worker.Status.Succeeded + worker.Status.Failed)
 		batchJob.Status.WorkerCounts = &status.WorkerCounts{
-			Pending:   pendingWorkers,
 			Running:   worker.Status.Active,
 			Succeeded: worker.Status.Succeeded,
 			Failed:    worker.Status.Failed,
@@ -495,7 +503,7 @@ func (r *BatchJobReconciler) uploadJobSpec(batchJob batch.BatchJob, api spec.API
 		timeout = pointer.Int(int(batchJob.Spec.Timeout.Seconds()))
 	}
 
-	maxBatchCount, err := r.Config.GetMaxBatchCount(r, batchJob)
+	totalBatchCount, err := r.Config.GetTotalBatchCount(r, batchJob)
 	if err != nil {
 		return nil, err
 	}
@@ -513,11 +521,9 @@ func (r *BatchJobReconciler) uploadJobSpec(batchJob batch.BatchJob, api spec.API
 			Config: config,
 		},
 		APIID:           api.ID,
-		SpecID:          api.SpecID,
-		HandlerID:       api.HandlerID,
 		SQSUrl:          queueURL,
 		StartTime:       batchJob.CreationTimestamp.Time,
-		TotalBatchCount: maxBatchCount,
+		TotalBatchCount: totalBatchCount,
 	}
 
 	if err = r.AWS.UploadJSONToS3(&jobSpec, r.ClusterConfig.Bucket, r.jobSpecKey(batchJob)); err != nil {
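`UploadJSONToS3` is cortex's own S3 helper and its implementation is not part of this diff. Purely as a hypothetical sketch of what such a helper typically does (marshal the job spec to JSON, then put the bytes to S3 with aws-sdk-go), not cortex's actual code:

```go
package awsutil

import (
	"bytes"
	"encoding/json"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/s3"
)

// uploadJSONToS3 marshals obj and writes it to s3://bucket/key.
// Illustrative only; cortex's AWS client wraps this differently.
func uploadJSONToS3(obj interface{}, bucket string, key string) error {
	b, err := json.Marshal(obj)
	if err != nil {
		return err
	}
	svc := s3.New(session.Must(session.NewSession()))
	_, err = svc.PutObject(&s3.PutObjectInput{
		Bucket: aws.String(bucket),
		Key:    aws.String(key),
		Body:   bytes.NewReader(b),
	})
	return err
}
```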
@@ -569,19 +575,27 @@ func (r *BatchJobReconciler) persistJobToS3(batchJob batch.BatchJob) error {
 	)
 }
 
-func getMaxBatchCount(r *BatchJobReconciler, batchJob batch.BatchJob) (int, error) {
+func getTotalBatchCount(r *BatchJobReconciler, batchJob batch.BatchJob) (int, error) {
 	key := spec.JobBatchCountKey(r.ClusterConfig.ClusterUID, userconfig.BatchAPIKind, batchJob.Spec.APIName, batchJob.Name)
-	maxBatchCountBytes, err := r.AWS.ReadBytesFromS3(r.ClusterConfig.Bucket, key)
-	if err != nil {
-		return 0, err
-	}
+	cachedTotalBatchCount, found := totalBatchCountCache.Get(key)
+	var totalBatchCount int
+	if !found {
+		totalBatchCountBytes, err := r.AWS.ReadBytesFromS3(r.ClusterConfig.Bucket, key)
+		if err != nil {
+			return 0, err
+		}
 
-	maxBatchCount, err := strconv.Atoi(string(maxBatchCountBytes))
-	if err != nil {
-		return 0, err
+		totalBatchCount, err = strconv.Atoi(string(totalBatchCountBytes))
+		if err != nil {
+			return 0, err
+		}
+	} else {
+		totalBatchCount = cachedTotalBatchCount.(int)
 	}
 
-	return maxBatchCount, nil
+	totalBatchCountCache.Set(key, totalBatchCount, _cacheDuration)
+
+	return totalBatchCount, nil
 }
 
 func getMetrics(r *BatchJobReconciler, batchJob batch.BatchJob) (metrics.BatchMetrics, error) {
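The rewritten `getTotalBatchCount` is a cache-aside read: check the in-memory cache, fall back to S3 on a miss, and call `Set` on every invocation so the entry's 60-second TTL is refreshed even on hits. A generic sketch of the same pattern (the names below are illustrative, not part of the codebase):

```go
package batchcache

import (
	"time"

	cache "github.com/patrickmn/go-cache"
)

// cachedInt consults the TTL cache first, falls back to the slower fetch on a
// miss, and re-Sets the entry on every call so its expiration slides forward.
func cachedInt(c *cache.Cache, key string, ttl time.Duration, fetch func(string) (int, error)) (int, error) {
	var value int
	if v, found := c.Get(key); found {
		value = v.(int)
	} else {
		fetched, err := fetch(key)
		if err != nil {
			return 0, err
		}
		value = fetched
	}
	c.Set(key, value, ttl) // refreshes the TTL on hits as well as misses
	return value, nil
}
```

Because `Set` also runs on hits, a job that is reconciled more often than once per minute keeps its batch count entry alive past the 60-second default expiration, which matches the behavior of the committed code.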
@@ -611,14 +625,24 @@ func saveJobMetrics(r *BatchJobReconciler, batchJob batch.BatchJob) error {
 }
 
 func saveJobStatus(r *BatchJobReconciler, batchJob batch.BatchJob) error {
-	jobStatus := batchJob.Status.Status.String()
-	key := filepath.Join(
-		spec.JobAPIPrefix(r.ClusterConfig.ClusterUID, userconfig.BatchAPIKind, batchJob.Spec.APIName),
-		batchJob.Name,
-		jobStatus,
+	return parallel.RunFirstErr(
+		func() error {
+			stoppedStatusKey := filepath.Join(
+				spec.JobAPIPrefix(r.ClusterConfig.ClusterUID, userconfig.BatchAPIKind, batchJob.Spec.APIName),
+				batchJob.Name,
+				status.JobStopped.String(),
+			)
+			return r.AWS.UploadStringToS3("", r.ClusterConfig.Bucket, stoppedStatusKey)
+
+		},
+		func() error {
+			jobStatus := batchJob.Status.Status.String()
+			key := filepath.Join(
+				spec.JobAPIPrefix(r.ClusterConfig.ClusterUID, userconfig.BatchAPIKind, batchJob.Spec.APIName),
+				batchJob.Name,
+				jobStatus,
+			)
+			return r.AWS.UploadStringToS3("", r.ClusterConfig.Bucket, key)
+		},
 	)
-	if err := r.AWS.UploadStringToS3("", r.ClusterConfig.Bucket, key); err != nil {
-		return err
-	}
-	return nil
 }
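`saveJobStatus` now writes two S3 markers, a `JobStopped` marker and the job's final status, through `parallel.RunFirstErr`, which, judging from its use here, takes a variadic list of `func() error` and returns the first non-nil error. A minimal equivalent built on `golang.org/x/sync/errgroup`, shown only to illustrate the shape of that helper (not its actual implementation in cortex):

```go
package main

import (
	"errors"
	"fmt"

	"golang.org/x/sync/errgroup"
)

// runFirstErr runs each function in its own goroutine and returns the first
// non-nil error (or nil if all succeed), mirroring the call shape above.
func runFirstErr(fns ...func() error) error {
	var g errgroup.Group
	for _, fn := range fns {
		g.Go(fn) // fn is evaluated at the call, so there is no loop-variable capture issue
	}
	return g.Wait()
}

func main() {
	err := runFirstErr(
		func() error { fmt.Println("upload stopped-status marker"); return nil },
		func() error { return errors.New("upload of final status failed") },
	)
	fmt.Println(err) // prints the first error returned by any of the functions
}
```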