# hadolint global ignore=DL3038,DL4006

# hadolint ignore=DL3006
- FROM stackable/image/java-devel AS builder
+ FROM stackable/image/java-devel AS hadoop-builder

ARG PRODUCT
ARG ASYNC_PROFILER
@@ -25,25 +25,31 @@ COPY hadoop/stackable/fuse_dfs_wrapper /stackable/fuse_dfs_wrapper
# At the same time a new HDFS Operator will still work with older images which do not have the symlink to the versionless jar.
# After one of our next releases (23.11 or 24.x) we should update the operator to point at the non-versioned symlink (jmx_prometheus_javaagent.jar)
# And then we can also remove the symlink to 0.16.1 from this Dockerfile.
- RUN curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
-     chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" && \
-     ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar && \
-     ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar
+ RUN <<EOF
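+ # NOTE: heredoc RUN blocks need BuildKit (Dockerfile frontend 1.4+); the lines below run as one shell script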
+ curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
+ ln -s /stackable/jmx/jmx_prometheus_javaagent.jar /stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar
+ # TODO: Can the symlink go?

- RUN ARCH="${TARGETARCH/amd64/x64}" && \
-     curl --fail -L "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC . && \
-     ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler
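+ # async-profiler archives are published for "x64", so map Docker's "amd64" TARGETARCH to that name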
+ ARCH="${TARGETARCH/amd64/x64}"
+ curl --fail -L "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC .
+ ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler

# This Protobuf version is the exact version as used in the Hadoop Dockerfile
# See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh
# (this was hardcoded in the Dockerfile in earlier versions of Hadoop, make sure to look at the exact version in Github)
- WORKDIR /opt/protobuf-src
- RUN curl --fail -L -s -S https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz && \
-     tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner && \
-     ./configure --prefix=/opt/protobuf && \
-     make "-j$(nproc)" && \
-     make install && \
-     rm -rf /opt/protobuf-src
+ # At the time of writing we could save around ~350MB if we included this in the later RUN statement and deleted it afterwards
+ mkdir /opt/protobuf-src
+ cd /opt/protobuf-src
+ curl --fail -L -s -S https://repo.stackable.tech/repository/packages/protobuf/protobuf-java-${PROTOBUF}.tar.gz -o /opt/protobuf.tar.gz
+ tar xzf /opt/protobuf.tar.gz --strip-components 1 --no-same-owner
+ ./configure --prefix=/opt/protobuf
+ make "-j$(nproc)"
+ make install
+ rm -rf /opt/protobuf-src
+ rm -f /opt/protobuf.tar.gz
+ EOF

ENV PROTOBUF_HOME=/opt/protobuf
ENV PATH="${PATH}:/opt/protobuf/bin"
@@ -56,6 +62,7 @@ RUN microdnf update && \
    microdnf clean all && \
    rm -rf /var/cache/yum

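+ # All following RUN steps execute as the unprivileged stackable user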
+ USER stackable
WORKDIR /stackable

COPY hadoop/stackable/patches /stackable/patches
@@ -65,123 +72,6 @@ COPY hadoop/stackable/patches /stackable/patches
# Also skip building the yarn, mapreduce and minicluster modules: this will result in the modules being excluded but not all
# jar files will be stripped if they are needed elsewhere e.g. share/hadoop/yarn will not be part of the build, but yarn jars
# will still exist in share/hadoop/tools as they would be needed by the resource estimator tool. Such jars are removed in a later step.
- RUN curl --fail -L "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC . && \
-     patches/apply_patches.sh ${PRODUCT} && \
-     cd hadoop-${PRODUCT}-src && \
-     mvn clean package -Pdist,native -pl '!hadoop-tools/hadoop-pipes,!hadoop-yarn-project,!hadoop-mapreduce-project,!hadoop-minicluster' -Drequire.fuse=true -DskipTests -Dmaven.javadoc.skip=true && \
-     cp -r hadoop-dist/target/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT} && \
-     # HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
-     cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${PRODUCT}/bin && \
-     rm -rf /stackable/hadoop-${PRODUCT}-src
-
- # For earlier versions this script removes the .class file that contains the
- # vulnerable code.
- # TODO: This can be restricted to target only versions which do not honor the environment
- # variable that has been set above but this has not currently been implemented
- COPY shared/log4shell.sh /bin
- RUN /bin/log4shell.sh "/stackable/hadoop-${PRODUCT}"
-
- # Ensure no vulnerable files are left over
- # This will currently report vulnerable files being present, as it also alerts on
- # SocketNode.class, which we do not remove with our scripts.
- # Further investigation will be needed whether this should also be removed.
- COPY shared/log4shell_1.6.1-log4shell_Linux_x86_64 /bin/log4shell_scanner_x86_64
- COPY shared/log4shell_1.6.1-log4shell_Linux_aarch64 /bin/log4shell_scanner_aarch64
- COPY shared/log4shell_scanner /bin/log4shell_scanner
- RUN /bin/log4shell_scanner s "/stackable/hadoop-${PRODUCT}"
- # ===
-
- FROM stackable/image/java-devel AS hdfs-utils-builder
-
- ARG HDFS_UTILS
- ARG PRODUCT
-
- WORKDIR /stackable
-
- # The Stackable HDFS utils contain an OPA authorizer, group mapper & topology provider.
- # The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
- # labels to build a rackID from.
- # Starting with hdfs-utils version 0.3.0 the topology provider is not a standalone jar anymore and included in hdfs-utils.
-
- RUN curl --fail -L "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC . && \
-     cd hdfs-utils-${HDFS_UTILS} && \
-     mvn clean package -P hadoop-${PRODUCT} -DskipTests -Dmaven.javadoc.skip=true && \
-     mkdir -p /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib && \
-     cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar && \
-     rm -rf /stackable/hdfs-utils-main
-
- FROM stackable/image/java-base AS final
-
- ARG PRODUCT
- ARG RELEASE
- ARG HDFS_UTILS
-
- LABEL name="Apache Hadoop" \
-     maintainer="info@stackable.tech" \
-     vendor="Stackable GmbH" \
-     version="${PRODUCT}" \
-     release="${RELEASE}" \
-     summary="The Stackable image for Apache Hadoop." \
-     description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."
-
- # fuse is required for fusermount (called by fuse_dfs)
- # fuse-libs is required for fuse_dfs (not included in fuse)
- # openssl -> not sure
- RUN microdnf update && \
-     microdnf install \
-         fuse \
-         fuse-libs \
-         # tar is required for `kubectl cp` which can be used to copy the log files
-         # or profiler flamegraph from the Pod
-         tar && \
-     microdnf clean all && \
-     rm -rf /var/cache/yum
-
- COPY hadoop/licenses /licenses
-
- # Without this fuse_dfs does not work
- # It is so non-root users (as we are) can mount a FUSE device and let other users access it
- RUN echo "user_allow_other" > /etc/fuse.conf
-
- USER stackable
- WORKDIR /stackable
-
- COPY --chown=stackable:stackable --from=builder /stackable/hadoop-${PRODUCT} /stackable/hadoop-${PRODUCT}/
- COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx/
- COPY --chown=stackable:stackable --from=builder /stackable/async-profiler /stackable/async-profiler/
- COPY --chown=stackable:stackable --from=hdfs-utils-builder /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
- RUN ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop
-
- COPY hadoop/stackable/fuse_dfs_wrapper /stackable/hadoop/bin
-
- ENV HOME=/stackable
- ENV LD_LIBRARY_PATH=/stackable/hadoop/lib/native:/usr/lib/jvm/jre/lib/server
- ENV PATH="${PATH}:/stackable/hadoop/bin"
- ENV HADOOP_HOME=/stackable/hadoop
- ENV HADOOP_CONF_DIR=/stackable/config
- ENV ASYNC_PROFILER_HOME=/stackable/async-profiler
- # The following 2 env-vars are required for common scripts even if the respective libraries are never used.
- # HADOOP_HOME is often used internally if HADOOP_YARN_HOME/HADOOP_MAPRED_HOME are not set, although
- # a subdirectory is also required in (at least)
- # hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh
- # if HADOOP_YARN_HOME does not exist at all, so we set it here to a sensible default.
- ENV HADOOP_YARN_HOME=/stackable/hadoop
- ENV HADOOP_MAPRED_HOME=/stackable/hadoop
-
- # Remove unneeded binaries:
- # - code sources
- # - mapreduce/yarn binaries that were built as cross-project dependencies
- # - minicluster (only used for testing) and test .jars
- # - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
- RUN rm -rf /stackable/hadoop/share/hadoop/common/sources/ && \
-     rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/ && \
-     rm -rf /stackable/hadoop/share/hadoop/tools/sources/ && \
-     rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar && \
-     rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar && \
-     rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar && \
-     find . -name 'hadoop-minicluster-*.jar' -type f -delete && \
-     find . -name 'hadoop-client-minicluster-*.jar' -type f -delete && \
-     find . -name 'hadoop-*tests.jar' -type f -delete
-
- WORKDIR /stackable/hadoop
- CMD ["echo", "This image is not meant to be 'run' directly."]
+ RUN <<EOF
+ curl --fail -L "https://repo.stackable.tech/repository/packages/hadoop/hadoop-${PRODUCT}-src.tar.gz" | tar -xzC .
+ EOF