[Embedding] Fix BatchCache coredump in background thread. (#294)

candyzone · web-flow · commit d806b3f97847 · 2022-07-05T20:58:44.000+08:00
diff --git a/tensorflow/core/framework/embedding/cache.h b/tensorflow/core/framework/embedding/cache.h
@@ -5,6 +5,7 @@
 #include <unordered_map>
 #include <set>
 #include <list>
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -17,6 +18,9 @@ template <class K>
 class BatchCache {
  public:
   BatchCache() {}
+  void add_to_rank(const Tensor& t) {
+    add_to_rank((K*)t.data(), t.NumElements());
+  }
   virtual size_t get_evic_ids(K* evic_ids, size_t k_size) = 0;
   virtual void add_to_rank(const K* batch_ids, size_t batch_size) = 0;
   virtual size_t size() = 0;
diff --git a/tensorflow/core/framework/embedding/embedding_var.h b/tensorflow/core/framework/embedding/embedding_var.h
@@ -163,6 +163,10 @@ class EmbeddingVar : public ResourceBase {
     return storage_manager_->Size();
   }
 
+  int64 CacheSize() const {
+    return storage_manager_->CacheSize();
+  }
+
   int64 MinFreq() {
     return emb_config_.filter_freq;
   }
diff --git a/tensorflow/core/framework/embedding/multilevel_embedding.h b/tensorflow/core/framework/embedding/multilevel_embedding.h
@@ -267,6 +267,10 @@ class StorageManager {
     return total_size;
   }
 
+  int64 CacheSize() const {
+    return cache_capacity_;
+  }
+
   Status GetSnapshot(std::vector<K>* key_list,
                      std::vector<ValuePtr<V>* >* value_ptr_list) {
     for (auto kv : kvs_) {
@@ -375,7 +379,6 @@ class StorageManager {
   Status Destroy() {
     if (eviction_thread_) {
       mutex_lock l(mu_);
-      shutdown_cv_.notify_all();
       shutdown_ = true;
     }
     delete eviction_thread_;
@@ -432,9 +435,7 @@ class StorageManager {
       if (shutdown_) {
         break;
       }
-      const int kTimeoutMilliseconds = 1;
-      WaitForMilliseconds(&l, &shutdown_cv_, kTimeoutMilliseconds);
-     
+      // add WaitForMilliseconds() for sleep if necessary
       for (int i = 0; i < value_ptr_out_of_date_.size(); i++) {
         value_ptr_out_of_date_[i]->Destroy(kvs_[0].second);
         delete value_ptr_out_of_date_[i];
@@ -478,10 +479,9 @@ class StorageManager {
   BatchCache<K>* cache_;
   int64 cache_capacity_;
   mutex mu_;
-  condition_variable shutdown_cv_;
-  bool shutdown_ GUARDED_BY(mu_) = false;
+  volatile bool shutdown_ GUARDED_BY(mu_) = false;
 
-  bool done_ = false;
+  volatile bool done_ = false;
   std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
 
 };
diff --git a/tensorflow/core/kernels/kv_variable_ops.cc b/tensorflow/core/kernels/kv_variable_ops.cc
@@ -418,6 +418,10 @@ class KvResourceGatherOp : public OpKernel {
           errors::InvalidArgument(
               "ev's value_len should same with output's dimension(1)",
               std::to_string(slice_elems), std::to_string(ev->ValueLen())));
+      OP_REQUIRES(c, !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N),
+          errors::InvalidArgument(
+              "MultiLevel EV's Cache size ", ev->CacheSize(),
+              " should large than IDs in batch ", N));
       const size_t slice_bytes = slice_elems * sizeof(TValue);
       auto do_work = [this, indices_flat,
            out_base, slice_elems, c, default_v, ev, counts] (
@@ -436,10 +440,10 @@ class KvResourceGatherOp : public OpKernel {
           worker_threads->workers, indices_size,
           slice_bytes, do_work);
           
-      ev->storage_manager()->Schedule([ev, indices_flat, indices_size]() {
+      ev->storage_manager()->Schedule([ev, indices]() {
         embedding::BatchCache<TKey>* cache = ev->Cache();
         if (cache) {
-          cache->add_to_rank(indices_flat.data(), indices_size);
+          cache->add_to_rank(indices);
         }
       });
     }
diff --git a/tensorflow/python/ops/embedding_variable_ops_test.py b/tensorflow/python/ops/embedding_variable_ops_test.py
@@ -1965,6 +1965,7 @@ def runTestAdagrad(self, var, g):
         sess.run([init])
         for i in xrange(60):
           r, _, _ = sess.run([emb, train_op, loss])
+        r = sess.run(emb)
         return r
 
     with ops.Graph().as_default() as g:
@@ -1976,7 +1977,7 @@ def runTestAdagrad(self, var, g):
             steps_to_live=5,
             ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM_SSDHASH,
                                                                                                  storage_path="/tmp/ssd_utpy",
-                                                                                                 storage_size=[512])))
+                                                                                                 storage_size=[5120])))
       emb1 = runTestAdagrad(self, emb_var, g)
 
     with ops.Graph().as_default() as g:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -163,6 +163,10 @@ class EmbeddingVar : public ResourceBase {` |
| `163` | `163` | `return storage_manager_->Size();` |
| `164` | `164` | `}` |
| `165` | `165` |  |
|  | `166` | `+ int64 CacheSize() const {` |
|  | `167` | `+ return storage_manager_->CacheSize();` |
|  | `168` | `+ }` |
|  | `169` | `+` |
| `166` | `170` | `int64 MinFreq() {` |
| `167` | `171` | `return emb_config_.filter_freq;` |
| `168` | `172` | `}` |