Skip to content

Commit 2dfae28

Browse files
authored
[Runtime] Fix shared resource_mgr double free corruption bug. (#286)
1 parent a8f6308 commit 2dfae28

File tree

4 files changed

+15
-2
lines changed

4 files changed

+15
-2
lines changed

tensorflow/core/common_runtime/device.cc

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ Device::Device(Env* env, const DeviceAttributes& device_attributes)
 28  28     CHECK(DeviceNameUtils::ParseFullName(name(), &parsed_name_))
 29  29         << "Invalid device name: " << name();
 30  30     rmgr_ = new ResourceMgr(parsed_name_.job);
     31 +   owned_rmgr_ = true;
 31  32   }
 32  33
 33  34   Device::Device(Env* env, const DeviceAttributes& device_attributes,
@@ -41,8 +42,10 @@ Device::Device(Env* env, const DeviceAttributes& device_attributes,
 41  42         dev_rmgr_map->device_rmgr_map.end()) {
 42  43       rmgr_ = const_cast<DeviceResourceMgrMap*>(dev_rmgr_map)->device_rmgr_map[name()];
 43  44       LOG(INFO) << "Device " << name() << " got a shared resource_mgr: " << rmgr_;
     45 +     owned_rmgr_ = false;
 44  46     } else {
 45  47       rmgr_ = new ResourceMgr(parsed_name_.job);
     48 +     owned_rmgr_ = true;
 46  49     }
 47  50   }
 48  51
tensorflow/core/common_runtime/device.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,9 @@ class Device : public DeviceBase {
191 191
192 192    protected:
193 193     void DeleteResourceMgr() {
194     -     delete rmgr_;
    194 +     if (owned_rmgr_) {
    195 +       delete rmgr_;
    196 +     }
195 197       rmgr_ = nullptr;
196 198     }
197 199
@@ -204,6 +206,7 @@ class Device : public DeviceBase {
204 206
205 207     // Resources associated w/ this device. E.g., shared variables, etc.
206 208     ResourceMgr* rmgr_ = nullptr;
    209 +   bool owned_rmgr_ = true;
207 210
208 211     TF_DISALLOW_COPY_AND_ASSIGN(Device);
209 212   };

tensorflow/core/common_runtime/direct_session.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -286,7 +286,7 @@ class DirectSessionFactory : public SessionFactory {
286 286
287 287       DeviceMgr* device_mgr = new DeviceMgr(std::move(devices));
288 288
289     -     SessionGroup* session_group = new SessionGroup();
    289 +     SessionGroup* session_group = new SessionGroup(shared_rmgr);
290 290   #ifdef TENSORFLOW_USE_NUMA
291 291       DirectSession* leader_session =
292 292           new DirectSession(options, device_mgr, true, this,

tensorflow/core/public/session.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ limitations under the License.
 30  30
 31  31   namespace tensorflow {
 32  32   class DeviceMgr;
     33 + class ResourceMgr;
 33  34
 34  35   namespace thread {
 35  36
@@ -270,7 +271,12 @@ class Session {
270 271
271 272   class SessionGroup {
272 273    public:
    274 +   SessionGroup() : shared_resource_mgr_(nullptr) {}
    275 +   SessionGroup(ResourceMgr* mgr) : shared_resource_mgr_(mgr) {}
273 276     ~SessionGroup() {
    277 +     if (shared_resource_mgr_) {
    278 +       delete shared_resource_mgr_;
    279 +     }
274 280     }
275 281
276 282     Status Close() {
@@ -375,6 +381,7 @@ class SessionGroup {
375 381     std::vector<std::unique_ptr<Session>> sessions_;
376 382     int32_t session_num_ = 0;
377 383     std::atomic<int64_t> serving_index_{0};
    384 +   ResourceMgr* shared_resource_mgr_ = nullptr;
378 385
379 386     Status GetServingSessionId(int32_t* serving_id, int32_t hint_id = -1) {
380 387       if (session_num_ < 1) {

0 commit comments

Comments
 (0)