Commit 6ca1a97

Merge pull request #249 from timvaillancourt/s3_upload_overhaul_v1
S3 Upload Fixes v1
2 parents b5f47d3 + ac4a607

File tree

11 files changed: +498, -176 lines

Makefile

Lines changed: 8 additions & 7 deletions
@@ -4,6 +4,7 @@
 NAME=mongodb_consistent_backup
 BIN_NAME?=mongodb-consistent-backup
 VERSION=$(shell cat VERSION | cut -d- -f1)
+RELEASE?=1
 GIT_COMMIT?=$(shell git show 2>/dev/null | awk 'NR==1{print $$2}')
 PREFIX?=/usr/local
 ARCH?=x86_64
@@ -17,7 +18,7 @@ MAKE_DIR=$(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
 all: bin/$(BIN_NAME)
 
 bin/$(BIN_NAME): setup.py requirements.txt README.rst VERSION scripts/build.sh $(NAME)/*.py $(NAME)/*/*.py $(NAME)/*/*/*.py
-	BIN_NAME=$(BIN_NAME) GIT_COMMIT=$(GIT_COMMIT) PYTHON_BIN=$(PYTHON_BIN) VIRTUALENV_BIN=$(VIRTUALENV_BIN) bash scripts/build.sh
+	BIN_NAME=$(BIN_NAME) RELEASE=$(RELEASE) GIT_COMMIT=$(GIT_COMMIT) PYTHON_BIN=$(PYTHON_BIN) VIRTUALENV_BIN=$(VIRTUALENV_BIN) bash scripts/build.sh
 
 install: bin/$(BIN_NAME)
 	mkdir -p $(BINDIR) $(SHAREDIR)/$(NAME) || true
@@ -35,14 +36,14 @@ rpm: bin/$(BIN_NAME)
 	cp -f $(MAKE_DIR)/{LICENSE,README.rst} build/rpm/SOURCES
 	cp -f $(MAKE_DIR)/bin/$(BIN_NAME) build/rpm/SOURCES/mongodb-consistent-backup
 	cp -f $(MAKE_DIR)/conf/mongodb-consistent-backup.example.conf build/rpm/SOURCES/mongodb-consistent-backup.conf
-	rpmbuild -D "_topdir $(MAKE_DIR)/build/rpm" -D "version $(VERSION)" -bb $(MAKE_DIR)/scripts/$(NAME).spec
+	rpmbuild -D "_topdir $(MAKE_DIR)/build/rpm" -D "version $(VERSION)" -D "release $(RELEASE)" -bb $(MAKE_DIR)/scripts/$(NAME).spec
 
 uninstall:
 	rm -f $(BINDIR)/mongodb-consistent-backup
 	rm -rf $(SHAREDIR)/$(NAME)
 
 # Build CentOS7 RPM (in Docker)
-build/rpm/RPMS/$(ARCH)/$(NAME)-$(VERSION)-1.el7.centos.$(ARCH).rpm:
+build/rpm/RPMS/$(ARCH)/$(NAME)-$(VERSION)-$(RELEASE).el7.centos.$(ARCH).rpm:
 	mkdir -p $(MAKE_DIR)/build/rpm/RPMS/$(ARCH)
 	docker run --rm \
 		-v "$(MAKE_DIR)/bin:/src/bin:Z" \
@@ -59,7 +60,7 @@ build/rpm/RPMS/$(ARCH)/$(NAME)-$(VERSION)-1.el7.centos.$(ARCH).rpm:
 		-v "$(MAKE_DIR)/build/rpm/RPMS/$(ARCH):/src/build/rpm/RPMS/$(ARCH):Z" \
 		-it centos:centos7 \
 		/bin/bash -c "yum install -y python-devel python-virtualenv gcc make libffi-devel openssl-devel rpm-build && \
-			make -C /src GIT_COMMIT=$(GIT_COMMIT) BIN_NAME=mongodb-consistent-backup.el7.centos.$(ARCH) rpm && \
+			make -C /src RELEASE=$(RELEASE) GIT_COMMIT=$(GIT_COMMIT) BIN_NAME=mongodb-consistent-backup.el7.centos.$(ARCH) rpm && \
 			/src/bin/mongodb-consistent-backup.el7.centos.$(ARCH) --version"
 
 centos7: build/rpm/RPMS/$(ARCH)/$(NAME)-$(VERSION)-1.el7.centos.$(ARCH).rpm
@@ -80,7 +81,7 @@ bin/mongodb-consistent-backup.debian8.$(ARCH):
 		-v "$(MAKE_DIR)/VERSION:/src/VERSION:Z" \
 		-it debian:jessie \
 		/bin/bash -c "apt-get update && apt-get install -y python2.7-minimal python2.7-dev python-virtualenv gcc make libffi-dev libssl-dev && \
-			make -C /src GIT_COMMIT=$(GIT_COMMIT) BIN_NAME=mongodb-consistent-backup.debian8.$(ARCH).tmp && \
+			make -C /src RELEASE=$(RELEASE) GIT_COMMIT=$(GIT_COMMIT) BIN_NAME=mongodb-consistent-backup.debian8.$(ARCH).tmp && \
 			mv -vf /src/bin/mongodb-consistent-backup.debian8.$(ARCH).tmp /src/bin/mongodb-consistent-backup.debian8.$(ARCH) && \
 			/src/bin/mongodb-consistent-backup.debian8.$(ARCH) --version"
 
@@ -102,13 +103,13 @@ bin/mongodb-consistent-backup.debian9.$(ARCH):
 		-v "$(MAKE_DIR)/VERSION:/src/VERSION:Z" \
 		-it debian:stretch \
 		/bin/bash -c "apt-get update && apt-get install -y python2.7-minimal python2.7-dev python-virtualenv gcc make libffi-dev libssl-dev && \
-			make -C /src GIT_COMMIT=$(GIT_COMMIT) BIN_NAME=mongodb-consistent-backup.debian9.$(ARCH).tmp && \
+			make -C /src RELEASE=$(RELEASE) GIT_COMMIT=$(GIT_COMMIT) BIN_NAME=mongodb-consistent-backup.debian9.$(ARCH).tmp && \
 			mv -vf /src/bin/mongodb-consistent-backup.debian9.$(ARCH).tmp /src/bin/mongodb-consistent-backup.debian9.$(ARCH) && \
 			/src/bin/mongodb-consistent-backup.debian9.$(ARCH) --version"
 
 debian9: bin/mongodb-consistent-backup.debian9.$(ARCH)
 
-docker: build/rpm/RPMS/$(ARCH)/$(NAME)-$(VERSION)-1.el7.centos.$(ARCH).rpm
+docker: build/rpm/RPMS/$(ARCH)/$(NAME)-$(VERSION)-$(RELEASE).el7.centos.$(ARCH).rpm
 	docker build --no-cache --tag $(DOCKER_TAG) .
 	docker tag $(DOCKER_TAG) $(NAME):latest
 	docker run --rm -it $(DOCKER_TAG) --version

mongodb_consistent_backup/Common/Util.py

Lines changed: 9 additions & 0 deletions
@@ -1,6 +1,7 @@
 import socket
 
 from dateutil import parser
+from hashlib import md5
 from select import select
 
 from mongodb_consistent_backup.Errors import OperationError
@@ -52,3 +53,11 @@ def wait_popen(process, stderr_callback, stdout_callback):
     except Exception, e:
         raise e
     return True
+
+
+def file_md5hash(file_path, blocksize=65536):
+    md5hash = md5()
+    with open(file_path, "rb") as f:
+        for block in iter(lambda: f.read(blocksize), b""):
+            md5hash.update(block)
+    return md5hash.hexdigest()
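
The new file_md5hash() helper streams a file in fixed-size blocks, so large backup archives can be checksummed without loading them fully into memory. A minimal usage sketch, assuming the package is importable and using a hypothetical archive path:

    from mongodb_consistent_backup.Common.Util import file_md5hash

    # Hash a (hypothetical) backup archive in 64KiB blocks; the digest can then
    # be compared against the checksum reported by the remote storage bucket.
    local_md5 = file_md5hash("/tmp/backup/shard0.tar.gz")
    print("local md5: %s" % local_md5)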

mongodb_consistent_backup/Upload/Gs/Gs.py

Lines changed: 3 additions & 13 deletions
@@ -1,13 +1,14 @@
 import logging
 import os
 
-
 from copy_reg import pickle
 from multiprocessing import Pool
 from types import MethodType
 
 from mongodb_consistent_backup.Errors import OperationError
 from mongodb_consistent_backup.Pipeline import Task
+from mongodb_consistent_backup.Upload.Util import get_upload_files
+
 from GsUploadThread import GsUploadThread
 
 
@@ -41,17 +42,6 @@ def close(self):
         self._pool.terminate()
         self.stopped = True
 
-    def get_backup_files(self, base_dir=None, files=[]):
-        if not base_dir:
-            base_dir = self.backup_dir
-        for child in os.listdir(base_dir):
-            path = os.path.join(base_dir, child)
-            if os.path.isfile(path):
-                files.append(path)
-            elif os.path.isdir(path):
-                self.get_backup_files(path, files)
-        return files
-
     def run(self):
         if not os.path.isdir(self.backup_dir):
             logging.error("The source directory: %s does not exist or is not a directory! Skipping Google Cloud Storage upload!" % self.backup_dir)
@@ -60,7 +50,7 @@
         self.running = True
         self.timer.start(self.timer_name)
         logging.info("Uploading %s to Google Cloud Storage (bucket=%s, threads=%i)" % (self.base_dir, self.bucket, self.threads()))
-        for file_path in self.get_backup_files():
+        for file_path in get_upload_files():
             gs_path = os.path.relpath(file_path, self.backup_location)
             self._pool.apply_async(GsUploadThread(
                 self.backup_dir,
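
Both Gs.py and S3.py now discover files to upload via get_upload_files() from mongodb_consistent_backup/Upload/Util.py, which is not shown in this excerpt. A rough sketch of what such a shared helper could look like, modelled on the removed get_backup_files() method (the exact signature is an assumption):

    import os

    # Hypothetical stand-in for mongodb_consistent_backup.Upload.Util.get_upload_files:
    # walk base_dir recursively and collect the paths of all regular files.
    def get_upload_files(base_dir, files=None):
        if files is None:
            files = []
        for child in os.listdir(base_dir):
            path = os.path.join(base_dir, child)
            if os.path.isfile(path):
                files.append(path)
            elif os.path.isdir(path):
                get_upload_files(path, files)
        return files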

mongodb_consistent_backup/Upload/Gs/GsUploadThread.py

Lines changed: 5 additions & 11 deletions
@@ -1,8 +1,8 @@
 import boto
-import hashlib
 import logging
 import os
 
+from mongodb_consistent_backup.Common.Util import file_md5hash
 from mongodb_consistent_backup.Errors import OperationError
 
 
@@ -34,7 +34,7 @@ def configure(self):
     def get_uri(self):
         return boto.storage_uri(self.path, 'gs')
 
-    def exists(self):
+    def gs_exists(self):
         try:
             self.metadata()
             return True
@@ -52,13 +52,6 @@ def gs_md5hash(self):
         if hasattr(key, 'etag'):
             return key.etag.strip('"\'')
 
-    def file_md5hash(self, blocksize=65536):
-        md5 = hashlib.md5()
-        with open(self.file_path, "rb") as f:
-            for block in iter(lambda: f.read(blocksize), b""):
-                md5.update(block)
-        return md5.hexdigest()
-
     def success(self):
         if self.remove_uploaded and not self.file_path.startswith(os.path.join(self.backup_dir, self.meta_data_dir)):
             logging.debug("Removing successfully uploaded file: %s" % self.file_path)
@@ -68,8 +61,9 @@ def run(self):
         f = None
         try:
             self.configure()
-            if self.exists():
-                if self.gs_md5hash() and self.file_md5hash() == self.gs_md5hash():
+            if self.gs_exists():
+                gs_md5hash = self.gs_md5hash()
+                if gs_md5hash and file_md5hash(self.file_path) == gs_md5hash:
                     logging.debug("Path %s already exists with the same checksum (%s), skipping" % (self.path, self.gs_md5hash()))
                     return
                 logging.debug("Path %s checksum and local checksum differ, re-uploading" % self.path)

mongodb_consistent_backup/Upload/S3/S3.py

Lines changed: 48 additions & 120 deletions
@@ -1,28 +1,11 @@
 import os
 import logging
 
-import boto.s3.multipart
-from copy_reg import pickle
-from math import ceil
-from multiprocessing import Pool
-from types import MethodType
-
-from S3Session import S3Session
-from S3UploadThread import S3UploadThread
+from S3UploadPool import S3UploadPool
 
 from mongodb_consistent_backup.Errors import OperationError
 from mongodb_consistent_backup.Pipeline import Task
-
-
-# Allows pooled .apply_async()s to work on Class-methods:
-def _reduce_method(m):
-    if m.im_self is None:
-        return getattr, (m.im_class, m.im_func.func_name)
-    else:
-        return getattr, (m.im_self, m.im_func.func_name)
-
-
-pickle(MethodType, _reduce_method)
+from mongodb_consistent_backup.Upload.Util import get_upload_files
 
 
 class S3(Task):
@@ -32,122 +15,67 @@ def __init__(self, manager, config, timer, base_dir, backup_dir, **kwargs):
         self.retries = self.config.upload.retries
         self.thread_count = self.config.upload.threads
         self.region = self.config.upload.s3.region
-        self.bucket_name = self.config.upload.s3.bucket_name
-        self.bucket_prefix = self.config.upload.s3.bucket_prefix
-        self.bucket_explicit_key = self.config.upload.s3.bucket_explicit_key
-        self.access_key = self.config.upload.s3.access_key
-        self.secret_key = self.config.upload.s3.secret_key
+        self.bucket_name = getattr(self.config.upload.s3, 'bucket_name', None)
+        self.bucket_prefix = getattr(self.config.upload.s3, 'bucket_prefix', None)
+        self.bucket_explicit_key = getattr(self.config.upload.s3, 'bucket_explicit_key', None)
+        self.access_key = getattr(self.config.upload.s3, 'access_key', None)
+        self.secret_key = getattr(self.config.upload.s3, 'secret_key', None)
         self.chunk_size_mb = self.config.upload.s3.chunk_size_mb
         self.chunk_size = self.chunk_size_mb * 1024 * 1024
-        self.secure = self.config.upload.s3.secure
         self.s3_acl = self.config.upload.s3.acl
         self.key_prefix = base_dir
 
-        self._pool = None
-        self._multipart = None
-        self._upload_done = False
+        self._pool = None
+
         if None in (self.access_key, self.secret_key, self.region):
-            raise OperationError("Invalid S3 security key or region detected!")
-        try:
-            self.s3_conn = S3Session(self.region, self.access_key, self.secret_key, self.bucket_name)
-            self.bucket = self.s3_conn.get_bucket(self.bucket_name)
-        except Exception, e:
-            raise OperationError(e)
+            raise OperationError("Invalid or missing AWS S3 access key, secret key or region detected!")
+
+        self._pool = S3UploadPool(
+            self.bucket_name,
+            self.region,
+            self.access_key,
+            self.secret_key,
+            self.thread_count,
+            self.remove_uploaded,
+            self.chunk_size,
+            self.s3_acl
+        )
+
+    def get_key_name(self, file_path):
+        rel_path = os.path.relpath(file_path, self.backup_dir)
+        if self.bucket_explicit_key:
+            key_name = self.bucket_explicit_key
+        elif self.bucket_prefix == "/":
+            key_name = "/%s/%s" % (self.key_prefix, rel_path)
+        else:
+            key_name = "%s/%s/%s" % (self.bucket_prefix, self.key_prefix, rel_path)
+        return key_name
 
     def run(self):
         if not os.path.isdir(self.backup_dir):
             logging.error("The source directory: %s does not exist or is not a directory! Skipping AWS S3 Upload!" % self.backup_dir)
             return
         try:
             self.timer.start(self.timer_name)
-            for file_name in os.listdir(self.backup_dir):
-                file_path = os.path.join(self.backup_dir, file_name)
-                # skip mongodb-consistent-backup_META dir
-                if os.path.isdir(file_path):
-                    continue
-                file_size = os.stat(file_path).st_size
-                chunk_count = int(ceil(file_size / float(self.chunk_size)))
-
-                if self.bucket_explicit_key:
-                    key_name = self.bucket_explicit_key
-                else:
-                    if self.bucket_prefix == "/":
-                        key_name = "/%s/%s" % (self.key_prefix, file_name)
-                    else:
-                        key_name = "%s/%s/%s" % (self.bucket_prefix, self.key_prefix, file_name)
-
-                logging.info("Starting multipart AWS S3 upload to key: %s%s using %i threads, %imb chunks, %i retries" % (
-                    self.bucket_name,
-                    key_name,
-                    self.thread_count,
-                    self.chunk_size_mb,
-                    self.retries
-                ))
-                self._multipart = self.bucket.initiate_multipart_upload(key_name)
-                self._pool = Pool(processes=self.thread_count)
-
-                for i in range(chunk_count):
-                    offset = self.chunk_size * i
-                    byte_count = min(self.chunk_size, file_size - offset)
-                    part_num = i + 1
-                    self._pool.apply_async(S3UploadThread(
-                        self.bucket_name,
-                        self.region,
-                        self.access_key,
-                        self.secret_key,
-                        self._multipart.id,
-                        part_num,
-                        file_path,
-                        offset,
-                        byte_count,
-                        self.retries,
-                        self.secure
-                    ).run)
-                self._pool.close()
-                self._pool.join()
-
-                part_count = 0
-                for part in boto.s3.multipart.part_lister(self._multipart):
-                    part_count += 1
-                if part_count == chunk_count:
-                    self._multipart.complete_upload()
-                    if self.s3_acl:
-                        try:
-                            self.bucket.set_acl(self.s3_acl, key_name)
-                        except Exception:
-                            logging.exception("Unable to set ACLs on uploaded key: {}.".format(key_name))
-                    self._upload_done = True
-
-                    if self.remove_uploaded:
-                        logging.info("Uploaded AWS S3 key: %s%s successfully. Removing local file" % (self.bucket_name, key_name))
-                        os.remove(os.path.join(self.backup_dir, file_name))
-                    else:
-                        logging.info("Uploaded AWS S3 key: %s%s successfully" % (self.bucket_name, key_name))
-                else:
-                    self._multipart.cancel_upload()
-                    logging.error("Failed to upload all multiparts for key: %s%s! Upload cancelled" % (self.bucket_name, key_name))
-                    raise OperationError("Failed to upload all multiparts for key: %s%s! Upload cancelled" % (self.bucket_name, key_name))
-
-            if self.remove_uploaded:
-                logging.info("Removing backup source dir after successful AWS S3 upload of all backups")
-                os.rmdir(self.backup_dir)
-            self.timer.stop(self.timer_name)
+            logging.info("Starting AWS S3 upload to %s (%i threads, %imb multipart chunks, %i retries)" % (
+                self.bucket_name,
+                self.thread_count,
+                self.chunk_size_mb,
+                self.retries
+            ))
+            for file_path in get_upload_files(self.backup_dir):
+                key_name = self.get_key_name(file_path)
+                self._pool.upload(file_path, key_name)
+            self._pool.wait()
         except Exception, e:
-            logging.error("Uploading to AWS S3 failed! Error: %s" % e)
-            if self._multipart:
-                self._multipart.cancel_upload()
+            logging.error("Uploading to AWS S3 failed! Error: %s (error type: %s)" % (e, type(e)))
             raise OperationError(e)
+        finally:
+            self.timer.stop(self.timer_name)
+            self._pool.close()
+
         self.completed = True
 
-    def close(self):
+    def close(self, code=None, frame=None):
         if self._pool:
-            logging.error("Terminating multipart AWS S3 upload threads")
-            self._pool.terminate()
-            self._pool.join()
-
-        if self._multipart and not self._upload_done:
-            logging.error("Cancelling incomplete multipart AWS S3 upload")
-            self._multipart.cancel_upload()
-
-        if self.s3_conn:
-            self.s3_conn.close()
+            self._pool.close()
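
S3UploadPool is introduced elsewhere in this commit and its own diff is not part of this excerpt. From the consumer side shown above, its lifecycle is: queue each file with upload(file_path, key_name), block on wait() until all transfers finish, and always release resources with close() in a finally block. A hedged sketch of that calling pattern, using placeholder credentials and assuming it runs inside the Upload/S3 package where the module lives:

    from S3UploadPool import S3UploadPool

    # Placeholder settings; at runtime these come from config.upload.s3.
    # Arguments follow the positional order used by S3.__init__() above:
    # bucket, region, access key, secret key, threads, remove_uploaded, chunk_size, acl.
    pool = S3UploadPool("my-bucket", "us-east-1", "AKIA-PLACEHOLDER", "SECRET-PLACEHOLDER",
                        4, False, 50 * 1024 * 1024, None)
    try:
        pool.upload("/tmp/backup/shard0.tar.gz", "prefix/backup/shard0.tar.gz")
        pool.wait()
    finally:
        pool.close()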
