From 29e73242cba9797ed24127b24bb0380c69a608d3 Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Wed, 28 Mar 2018 18:38:57 +0100
Subject: [PATCH 1/9] SPARK-23807 Add Hadoop 3 profile with relevant POM fix
 ups, cloud-storage artifacts and binding

Change-Id: Ia4526f184ced9eef5b67aee9e91eced0dd38d723
---
 core/pom.xml                                  |   6 +
 hadoop-cloud/pom.xml                          | 355 ++++++++++++++----
 .../cloud/BindingParquetOutputCommitter.scala | 122 ++++++
 .../io/cloud/PathOutputCommitProtocol.scala   | 260 +++++++++++++
 .../spark/internal/io/cloud/package.scala     | 105 ++++++
 .../io/cloud/CommitterBindingSuite.scala      |  86 +++++
 .../io/cloud/StubPathOutputCommitter.scala    | 110 ++++++
 .../io/cloud/PathCommitterConstants.scala     |  87 +++++
 hadoop-cloud/src/test/scala/.keep             |   0
 pom.xml                                       |   9 +
 10 files changed, 1072 insertions(+), 68 deletions(-)
 create mode 100644 hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala
 create mode 100644 hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
 create mode 100644 hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala
 create mode 100644 hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
 create mode 100644 hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala
 create mode 100644 hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala
 create mode 100644 hadoop-cloud/src/test/scala/.keep
diff --git a/core/pom.xml b/core/pom.xml
index 9258a856028a0..093a9869b6dd7 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -95,6 +95,12 @@
       <groupId>org.apache.curator</groupId>
       <artifactId>curator-recipes</artifactId>
     </dependency>
+    <!-- With curator 2.12  SBT/Ivy doesn't get ZK on the build classpath.
+         Explicitly declaring it as a dependency fixes this. -->
+    <dependency>
+      <groupId>org.apache.zookeeper</groupId>
+      <artifactId>zookeeper</artifactId>
+    </dependency>
 
     <!-- Jetty dependencies promoted to compile here so they are shaded
          and inlined into spark-core jar -->
diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
index 8e424b1c50236..cc520aed6e17c 100644
--- a/hadoop-cloud/pom.xml
+++ b/hadoop-cloud/pom.xml
@@ -38,81 +38,33 @@
     <sbt.project.name>hadoop-cloud</sbt.project.name>
   </properties>
 
+  <build>
+    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+  </build>
+
   <dependencies>
-    <!--
-      the AWS module pulls in jackson; its transitive dependencies can create
-      intra-jackson-module version problems.
-      -->
+    <!--used during compilation but not exported as transitive dependencies-->
     <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-aws</artifactId>
-      <version>${hadoop.version}</version>
-      <scope>${hadoop.deps.scope}</scope>
-      <exclusions>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-common</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.codehaus.jackson</groupId>
-          <artifactId>jackson-mapper-asl</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.codehaus.jackson</groupId>
-          <artifactId>jackson-core-asl</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>com.fasterxml.jackson.core</groupId>
-          <artifactId>jackson-core</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>com.fasterxml.jackson.core</groupId>
-          <artifactId>jackson-databind</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>com.fasterxml.jackson.core</groupId>
-          <artifactId>jackson-annotations</artifactId>
-        </exclusion>
-      </exclusions>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-openstack</artifactId>
+      <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
-      <scope>${hadoop.deps.scope}</scope>
-      <exclusions>
-        <exclusion>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-common</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>commons-logging</groupId>
-          <artifactId>commons-logging</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>junit</groupId>
-          <artifactId>junit</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mockito</groupId>
-          <artifactId>mockito-all</artifactId>
-        </exclusion>
-      </exclusions>
+      <scope>provided</scope>
     </dependency>
 
-    <!--
-    Add joda time to ensure that anything downstream which doesn't pull in spark-hive
-    gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs
-    -->
-    <dependency>
-      <groupId>joda-time</groupId>
-      <artifactId>joda-time</artifactId>
-      <scope>${hadoop.deps.scope}</scope>
-    </dependency>
     <!-- explicitly declare the jackson artifacts desired -->
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
@@ -141,13 +93,98 @@
       <artifactId>httpcore</artifactId>
       <scope>${hadoop.deps.scope}</scope>
     </dependency>
+
   </dependencies>
 
   <profiles>
 
+    <!-- this inner profile is the default one and includes openstack and aws -->
+    <profile>
+      <id>hadoop-2.6</id>
+      <activation>
+        <activeByDefault>true</activeByDefault>
+      </activation>
+      <dependencies>
+        <!--
+          the AWS module pulls in jackson; its transitive dependencies can create
+          intra-jackson-module version problems.
+          -->
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-aws</artifactId>
+          <version>${hadoop.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.hadoop</groupId>
+              <artifactId>hadoop-common</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>commons-logging</groupId>
+              <artifactId>commons-logging</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.codehaus.jackson</groupId>
+              <artifactId>jackson-mapper-asl</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.codehaus.jackson</groupId>
+              <artifactId>jackson-core-asl</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-core</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-databind</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-annotations</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-openstack</artifactId>
+          <version>${hadoop.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.hadoop</groupId>
+              <artifactId>hadoop-common</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>commons-logging</groupId>
+              <artifactId>commons-logging</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>junit</groupId>
+              <artifactId>junit</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.mockito</groupId>
+              <artifactId>mockito-all</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+
+        <!--
+        Add joda time to ensure that anything downstream which doesn't pull in spark-hive
+        gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs.
+        -->
+        <dependency>
+          <groupId>joda-time</groupId>
+          <artifactId>joda-time</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+      </dependencies>
+    </profile>
+
     <profile>
       <id>hadoop-2.7</id>
-      <!-- Hadoop Azure is a new Jar with -->
+      <!-- 2.7+ adds the azure Jar to the set of dependencies -->
       <dependencies>
 
         <!--
@@ -177,6 +214,188 @@
             </exclusion>
           </exclusions>
         </dependency>
+        <!--
+          the AWS module pulls in jackson; its transitive dependencies can create
+          intra-jackson-module version problems.
+          -->
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-aws</artifactId>
+          <version>${hadoop.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.hadoop</groupId>
+              <artifactId>hadoop-common</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>commons-logging</groupId>
+              <artifactId>commons-logging</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.codehaus.jackson</groupId>
+              <artifactId>jackson-mapper-asl</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.codehaus.jackson</groupId>
+              <artifactId>jackson-core-asl</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-core</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-databind</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-annotations</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-openstack</artifactId>
+          <version>${hadoop.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.hadoop</groupId>
+              <artifactId>hadoop-common</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>commons-logging</groupId>
+              <artifactId>commons-logging</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>junit</groupId>
+              <artifactId>junit</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.mockito</groupId>
+              <artifactId>mockito-all</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+
+        <!--
+        Add joda time to ensure that anything downstream which doesn't pull in spark-hive
+        gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs
+        -->
+        <dependency>
+          <groupId>joda-time</groupId>
+          <artifactId>joda-time</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+        <!-- explicitly declare the jackson artifacts desired -->
+        <dependency>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-databind</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+        <dependency>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-annotations</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+        <dependency>
+          <groupId>com.fasterxml.jackson.dataformat</groupId>
+          <artifactId>jackson-dataformat-cbor</artifactId>
+          <version>${fasterxml.jackson.version}</version>
+        </dependency>
+        <!--Explicit declaration to force in Spark version into transitive dependencies -->
+        <dependency>
+          <groupId>org.apache.httpcomponents</groupId>
+          <artifactId>httpclient</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+        <!--Explicit declaration to force in Spark version into transitive dependencies -->
+        <dependency>
+          <groupId>org.apache.httpcomponents</groupId>
+          <artifactId>httpcore</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+      </dependencies>
+    </profile>
+
+    <!--
+     Hadoop 3 simplifies the classpath, and adds a new committer base class which
+     enables store-specific committers.
+    -->
+    <profile>
+      <id>hadoop-3</id>
+      <properties>
+        <extra.source.dir>src/hadoop-3/main/scala</extra.source.dir>
+        <extra.testsource.dir>src/hadoop-3/test/scala</extra.testsource.dir>
+      </properties>
+
+      <build>
+        <plugins>
+          <!-- Include a source dir depending on the Scala version -->
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>build-helper-maven-plugin</artifactId>
+            <executions>
+              <execution>
+                <id>add-scala-sources</id>
+                <phase>generate-sources</phase>
+                <goals>
+                  <goal>add-source</goal>
+                </goals>
+                <configuration>
+                  <sources>
+                    <source>${extra.source.dir}</source>
+                  </sources>
+                </configuration>
+              </execution>
+              <execution>
+                <id>add-scala-test-sources</id>
+                <phase>generate-test-sources</phase>
+                <goals>
+                  <goal>add-test-source</goal>
+                </goals>
+                <configuration>
+                  <sources>
+                    <source>${extra.testsource.dir}</source>
+                  </sources>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+
+      </build>
+      <dependencies>
+
+        <!--
+        There's now a hadoop-cloud-storage which transitively pulls in the store JARs,
+        but it still needs some selective exclusion across versions, especially 3.0.x.
+        -->
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-cloud-storage</artifactId>
+          <version>${hadoop.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.hadoop</groupId>
+              <artifactId>hadoop-common</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.codehaus.jackson</groupId>
+              <artifactId>jackson-mapper-asl</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.fasterxml.jackson.core</groupId>
+              <artifactId>jackson-core</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>com.google.guava</groupId>
+              <artifactId>guava</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
       </dependencies>
     </profile>
 
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala
new file mode 100644
index 0000000000000..f2a2d208291fc
--- /dev/null
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.io.cloud
+
+import java.io.IOException
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.lib.output.{BindingPathOutputCommitter, PathOutputCommitter}
+import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext}
+import org.apache.parquet.hadoop.ParquetOutputCommitter
+
+import org.apache.spark.internal.Logging
+
+
+/**
+ * This dynamically binds to the factory-configured
+ * output committer, and is intended to allow callers to use any [[PathOutputCommitter]],
+ * even if not a subclass of [[ParquetOutputCommitter]].
+ *
+ * The Parquet "parquet.enable.summary-metadata" option will only be supported
+ * if the instantiated committer itself supports it.
+ */
+
+class BindingParquetOutputCommitter(
+    path: Path,
+    context: TaskAttemptContext)
+  extends ParquetOutputCommitter(path, context) with Logging {
+
+  logInfo(s"${this.getClass.getName} binding to configured PathOutputCommitter and dest $path")
+
+  val committer = new BindingPathOutputCommitter(path, context)
+
+  /**
+   * The inner committer which the binding committer wraps.
+   * @return the committer returned by the binding committer's `getCommitter()`.
+   */
+  def boundCommitter(): PathOutputCommitter = {
+    committer.getCommitter()
+  }
+
+  override def getWorkPath: Path = {
+    committer.getWorkPath()
+  }
+
+  override def setupTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    committer.setupTask(taskAttemptContext)
+  }
+
+  override def commitTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    committer.commitTask(taskAttemptContext)
+  }
+
+  override def abortTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    committer.abortTask(taskAttemptContext)
+  }
+
+  override def setupJob(jobContext: JobContext): Unit = {
+    committer.setupJob(jobContext)
+  }
+
+  override def needsTaskCommit(taskAttemptContext: TaskAttemptContext): Boolean = {
+    committer.needsTaskCommit(taskAttemptContext)
+  }
+
+  override def cleanupJob(jobContext: JobContext): Unit = {
+    committer.cleanupJob(jobContext)
+  }
+
+  override def isCommitJobRepeatable(jobContext: JobContext): Boolean = {
+    committer.isCommitJobRepeatable(jobContext)
+  }
+
+  override def commitJob(jobContext: JobContext): Unit = {
+    committer.commitJob(jobContext)
+  }
+
+  override def recoverTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    committer.recoverTask(taskAttemptContext)
+  }
+
+  /**
+   * Abort the job; any IOException thrown is logged at warn level and then ignored.
+   *
+   * @param jobContext job context
+   * @param state final state of the job
+   */
+  override def abortJob(
+      jobContext: JobContext,
+      state: JobStatus.State): Unit = {
+    try {
+      committer.abortJob(jobContext, state)
+    } catch {
+      case e: IOException =>
+        logWarning("Abort job failed", e)
+    }
+  }
+
+  override def isRecoverySupported: Boolean = {
+    committer.isRecoverySupported()
+  }
+
+  override def isRecoverySupported(jobContext: JobContext): Boolean = {
+    committer.isRecoverySupported(jobContext)
+  }
+
+  override def toString: String = s"BindingParquetOutputCommitter($committer)"
+}
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
new file mode 100644
index 0000000000000..5645ad53c1bb2
--- /dev/null
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.io.cloud
+
+import java.io.IOException
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
+import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
+
+import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}
+import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
+
+/**
+ * Spark Commit protocol for Path Output Committers.
+ * This committer will work with the `FileOutputCommitter` and subclasses.
+ * All implementations *must* be serializable.
+ *
+ * Rather than ask the `FileOutputFormat` for a committer, it uses the
+ * `org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory` factory
+ * API to create the committer.
+ * This is what [[org.apache.hadoop.mapreduce.lib.output.FileOutputFormat]] does,
+ * but [[HadoopMapReduceCommitProtocol]] still uses the original
+ * `org.apache.hadoop.mapred.FileOutputFormat` binding, whose subclasses
+ * do not do this. This protocol overrides that behavior to use the
+ * factory mechanism now supported in the base class.
+ *
+ * In `setupCommitter` the factory is bound to, and the committer for
+ * the destination path is created.
+ *
+ * @constructor Instantiate. dynamic partition overwrite is not supported,
+ *              so that committers for stores which do not support rename
+ *              will not get confused.
+ * @param jobId                     job
+ * @param destination               destination
+ * @param dynamicPartitionOverwrite does the caller want support for dynamic
+ *                                  partition overwrite. If so, it will be
+ *                                  refused.
+ * @throws IOException when an unsupported dynamicPartitionOverwrite option is supplied.
+ */
+class PathOutputCommitProtocol(
+  jobId: String,
+  destination: String,
+  dynamicPartitionOverwrite: Boolean = false)
+  extends HadoopMapReduceCommitProtocol(
+    jobId,
+    destination,
+    false) with Serializable {
+
+  @transient var committer: PathOutputCommitter = _
+
+  require(destination != null, "Null destination specified")
+
+  val destPath = new Path(destination)
+
+  logInfo(s"Instantiated committer with job ID=$jobId;" +
+    s" destination=$destPath;" +
+    s" dynamicPartitionOverwrite=$dynamicPartitionOverwrite")
+
+  if (dynamicPartitionOverwrite) {
+    // until there's explicit extensions to the PathOutputCommitProtocols
+    // to support the spark mechanism, it's left to the individual committer
+    // choice to handle partitioning.
+    throw new IOException("PathOutputCommitProtocol does not support dynamicPartitionOverwrite")
+  }
+
+  import PathOutputCommitProtocol._
+
+  /**
+   * Set up the committer.
+   * This creates it by talking directly to the Hadoop factories, instead
+   * of the V1 `mapred.FileOutputFormat` methods.
+   * @param context task attempt
+   * @return the committer to use. This will always be a subclass of
+   *         [[PathOutputCommitter]].
+   */
+  override protected def setupCommitter(
+    context: TaskAttemptContext): PathOutputCommitter = {
+
+    logInfo(s"Setting up committer for path $destination")
+    committer = PathOutputCommitterFactory.createCommitter(destPath, context)
+
+    // Special feature to force out the FileOutputCommitter, so as to guarantee
+    // that the binding is working properly.
+    val rejectFileOutput = context.getConfiguration
+      .getBoolean(REJECT_FILE_OUTPUT, REJECT_FILE_OUTPUT_DEFVAL)
+    if (rejectFileOutput && committer.isInstanceOf[FileOutputCommitter]) {
+      // the committer created is a FileOutputCommitter, which is exactly
+      // what we do not want. So ask the factory for the path for a committer.
+      val factory = PathOutputCommitterFactory.getCommitterFactory(
+        destPath,
+        context.getConfiguration)
+      logInfo(s"Using committer factory $factory")
+      committer = factory.createOutputCommitter(destPath, context)
+    }
+
+    logInfo(s"Using committer ${committer.getClass}")
+    logInfo(s"Committer details: $committer")
+    if (committer.isInstanceOf[FileOutputCommitter]) {
+      require(!rejectFileOutput,
+        s"Committer created is the FileOutputCommitter $committer")
+
+      if (committer.isCommitJobRepeatable(context)) {
+        // If FileOutputCommitter says its job commit is repeatable, it means
+        // it is using the v2 algorithm, which is not safe for task commit
+        // failures. Warn the user.
+        logWarning(s"Committer $committer may not be tolerant of task commit failures")
+      }
+    }
+    committer
+  }
+
+  /**
+   * Create a temporary file for a task.
+   *
+   * @param taskContext task context
+   * @param dir         optional subdirectory
+   * @param ext         file extension
+   * @return a path as a string
+   */
+  override def newTaskTempFile(
+    taskContext: TaskAttemptContext,
+    dir: Option[String],
+    ext: String): String = {
+
+    val workDir = committer.getWorkPath
+    val parent = dir.map(d => new Path(workDir, d)).getOrElse(workDir)
+    val file = new Path(parent, buildFilename(taskContext, ext))
+    logInfo(s"Creating task file $file for dir $dir and ext $ext")
+    file.toString
+  }
+
+  /**
+   * Absolute files are still renamed into place with a warning.
+   *
+   * @param taskContext task
+   * @param absoluteDir destination dir
+   * @param ext         extension
+   * @return an absolute path
+   */
+  override def newTaskTempFileAbsPath(
+    taskContext: TaskAttemptContext,
+    absoluteDir: String,
+    ext: String): String = {
+
+    val file = super.newTaskTempFileAbsPath(taskContext, absoluteDir, ext)
+    logWarning(
+      s"Creating temporary file $file for absolute path for dir $absoluteDir")
+    file
+  }
+
+  /**
+   * Build a filename which is unique across all task events.
+   * It does not have to be consistent across multiple attempts of the same
+   * task or job.
+   *
+   * @param taskContext task context
+   * @param ext         extension
+   * @return a name for a file which must be unique across all task attempts
+   */
+  protected def buildFilename(
+    taskContext: TaskAttemptContext,
+    ext: String): String = {
+
+    // The file name looks like part-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003-c000.parquet
+    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
+    // the file name is fine and won't overflow.
+    val split = taskContext.getTaskAttemptID.getTaskID.getId
+    f"part-$split%05d-$jobId$ext"
+  }
+
+  override def setupJob(jobContext: JobContext): Unit = {
+    logInfo("setup job")
+    super.setupJob(jobContext)
+  }
+
+  override def commitJob(
+    jobContext: JobContext,
+    taskCommits: Seq[FileCommitProtocol.TaskCommitMessage]): Unit = {
+    logInfo(s"commit job with ${taskCommits.length} task commit message(s)")
+    super.commitJob(jobContext, taskCommits)
+  }
+
+  /**
+   * Abort the job; log and ignore any IO exception thrown.
+   *
+   * @param jobContext job context
+   */
+  override def abortJob(jobContext: JobContext): Unit = {
+    try {
+      super.abortJob(jobContext)
+    } catch {
+      case e: IOException =>
+        logWarning("Abort job failed", e)
+    }
+  }
+
+  override def setupTask(taskContext: TaskAttemptContext): Unit = {
+    super.setupTask(taskContext)
+  }
+
+  override def commitTask(
+    taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
+    logInfo("Commit task")
+    super.commitTask(taskContext)
+  }
+
+  /**
+   * Abort the task; log and ignore any failure thrown.
+   *
+   * @param taskContext context
+   */
+  override def abortTask(taskContext: TaskAttemptContext): Unit = {
+    logInfo("Abort task")
+    try {
+      super.abortTask(taskContext)
+    } catch {
+      case e: IOException =>
+        logWarning("Abort task failed", e)
+    }
+  }
+
+  override def onTaskCommit(msg: TaskCommitMessage): Unit = {
+    logInfo(s"onTaskCommit($msg)")
+  }
+}
+
+object PathOutputCommitProtocol {
+
+  /**
+   * Hadoop configuration option.
+   * Fail fast if the committer created is a `FileOutputCommitter`.
+   * This option can be used to catch configuration issues early.
+   *
+   * It's mostly relevant when testing/diagnostics, as it can be used to
+   * enforce that scheme-specific options are triggering a switch
+   * to a new committer.
+   */
+  val REJECT_FILE_OUTPUT = "pathoutputcommit.reject.fileoutput"
+
+  /**
+   * Default value: a `FileOutputCommitter` is accepted, not rejected.
+   */
+  val REJECT_FILE_OUTPUT_DEFVAL = false
+}
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala
new file mode 100644
index 0000000000000..d2a0cd28f95c1
--- /dev/null
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.io
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Package object to assist in switching to the Hadoop 3
+ * [[org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory]] factory
+ * mechanism for dynamically loading committers for the destination stores.
+ *
+ * = Using Alternative Committers with Spark and Hadoop 3 =
+ *
+ * Hadoop 3.1 adds a means to select a different output committer when writing
+ * data to object stores. This can provide higher performance as well as
+ * addressing the consistency and atomicity problems encountered on some filesystems.
+ *
+ * Every object store can implement its own committer factory: the factory
+ * itself will then instantiate the committer of its choice.
+ *
+ * == Prerequisites ==
+ *
+ * Apache Hadoop 3.0.2 or later for the factory APIs; Hadoop 3.1+ for the S3A connectors.
+ *
+ * The Hadoop cluster needs to be configured for the binding from filesystem scheme
+ * to factory. In Hadoop 3.1 this is done automatically for s3a in the file
+ * `mapred-default.xml`.
+ * Other stores' committers may need to be explicitly declared.
+ *
+ * {{{
+ *   <property>
+ *     <name>mapreduce.outputcommitter.factory.scheme.s3a</name>
+ *     <value>org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory</value>
+ *     <description>
+ *       The committer factory to use when writing data to S3A filesystems.
+ *       If mapreduce.outputcommitter.factory.class is set, it will
+ *       override this property.
+ *     </description>
+ *   </property>
+ * }}}
+ *
+ * == Binding a Spark Context to use the new committers for a store ==
+ *
+ * Spark uses the Hadoop committers in
+ * [[org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand]]
+ * by instantiating and then invoking an instance of
+ * [[org.apache.spark.internal.io.HadoopMapReduceCommitProtocol]].
+ * `InsertIntoHadoopFsRelationCommand` needs to be configured to use
+ * [[org.apache.spark.internal.io.cloud.PathOutputCommitProtocol]] as
+ * the commit protocol to use. This instantiates the committer through
+ * the factory mechanism, and relays operations to it.
+ *
+ * When working with Parquet data, you need to explicitly switch
+ * the Parquet committers to use the same mechanism.
+ *
+ * In `spark-defaults.conf`, everything can be set up with the following settings:
+ * {{{
+ *   spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
+ *   spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
+ * }}}
+ *
+ * It can be done programmatically by calling [[cloud.bind()]] on the
+ * spark configuration.
+ */
+package object cloud {
+
+  /**
+   * Options for committer setup.
+   * When applied to a spark configuration, this will set the
+   * Dataframe output to use the factory mechanism for writing data for
+   * all file formats.
+   */
+  val COMMITTER_BINDING_OPTIONS: Map[String, String] = Map(
+    SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key ->
+      classOf[BindingParquetOutputCommitter].getName,
+    SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+      classOf[PathOutputCommitProtocol].getName)
+
+  /**
+   * Set the options defined in [[cloud.COMMITTER_BINDING_OPTIONS]] on the
+   * spark configuration.
+   *
+   * @param sparkConf spark configuration to bind.
+   */
+  def bind(sparkConf: SparkConf): Unit = {
+    sparkConf.setAll(COMMITTER_BINDING_OPTIONS)
+  }
+
+}
diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
new file mode 100644
index 0000000000000..8654e9df3f06c
--- /dev/null
+++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.io.cloud
+
+import java.io.IOException
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.hadoop.mapreduce.{Job, JobStatus, MRJobConfig, TaskAttemptID}
+
+import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.internal.io.cloud
+import org.apache.spark.internal.io.cloud.PathCommitterConstants._
+
+/**
+ * Test committer binding logic.
+ */
+class CommitterBindingSuite extends SparkFunSuite {
+
+
+  private val jobId = "2007071202143_0101"
+  private val attempt0 = "attempt_" + jobId + "_m_000000_0"
+  private val taskAttempt0 = TaskAttemptID.forName(attempt0)
+
+  /**
+   * Does the
+   * [[BindingParquetOutputCommitter]] committer bind to the scheme-specific
+   * committer declared for the destination path?
+   */
+  test("BindingParquetOutputCommitter will bind") {
+    val path = new Path("http://example/data")
+    val job = newJob(path)
+    val conf = job.getConfiguration
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0)
+    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1)
+
+    StubPathOutputCommitterFactory.bind(conf, "http")
+    val tContext = new TaskAttemptContextImpl(conf, taskAttempt0)
+    val parquet = new BindingParquetOutputCommitter(path, tContext)
+    val inner = parquet.boundCommitter().asInstanceOf[StubPathOutputCommitter]
+    parquet.setupJob(tContext)
+    assert(inner.setup, s"$inner not setup")
+    parquet.commitJob(tContext)
+    assert(inner.committed, s"$inner not committed")
+    parquet.abortJob(tContext, JobStatus.State.RUNNING)
+    assert(inner.aborted, s"$inner not aborted")
+  }
+
+  test("cloud binding") {
+    val sc = new SparkConf()
+    cloud.bind(sc)
+  }
+
+  /**
+   * Create a new job. Sets the task attempt ID.
+   *
+   * @return the new job
+   * @throws IOException failure
+   */
+  @throws[IOException]
+  def newJob(outDir: Path): Job = {
+    val job = Job.getInstance(new Configuration())
+    val conf = job.getConfiguration
+    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0)
+    conf.setBoolean(CREATE_SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)
+    FileOutputFormat.setOutputPath(job, outDir)
+    job
+  }
+}
diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala
new file mode 100644
index 0000000000000..0a67c71b58c6c
--- /dev/null
+++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.io.cloud
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.lib.output.{PathOutputCommitter, PathOutputCommitterFactory}
+import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext}
+
+/**
+ * A local path output committer which tracks its state, for use in
+ * tests.
+ * @param outputPath final destination.
+ * @param workPath work path
+ * @param context task/job attempt.
+ */
+class StubPathOutputCommitter(
+    outputPath: Path,
+    workPath: Path,
+    context: TaskAttemptContext) extends PathOutputCommitter(workPath, context) {
+
+  var setup: Boolean = false
+  var committed: Boolean = false
+  var aborted: Boolean = false
+
+  override def getOutputPath: Path = outputPath
+
+  override def getWorkPath: Path = {
+    workPath
+  }
+
+  override def setupTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    setup = true
+  }
+
+  override def abortTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    aborted = true
+  }
+
+  override def setupJob(jobContext: JobContext): Unit = {
+    setup = true
+  }
+
+  override def commitTask(taskAttemptContext: TaskAttemptContext): Unit = {
+    committed = true
+  }
+
+  override def commitJob(jobContext: JobContext): Unit = {
+    committed = true
+  }
+
+  override def abortJob(
+      jobContext: JobContext,
+      state: JobStatus.State): Unit = {
+    aborted = true
+  }
+
+  override def needsTaskCommit(taskAttemptContext: TaskAttemptContext): Boolean = {
+    true
+  }
+
+  override def toString(): String  = s"StubPathOutputCommitter(setup=$setup," +
+    s" committed=$committed, aborted=$aborted)"
+}
+
+/**
+ * Factory.
+ */
+class StubPathOutputCommitterFactory extends PathOutputCommitterFactory {
+
+  override def createOutputCommitter(
+      outputPath: Path,
+      context: TaskAttemptContext): PathOutputCommitter = {
+    new StubPathOutputCommitter(outputPath, workPath(outputPath), context)
+  }
+
+
+  private def workPath(out: Path): Path = new Path(out, PathCommitterConstants.TEMP_DIR_NAME)
+}
+
+object StubPathOutputCommitterFactory {
+  val Name: String = "org.apache.spark.internal.io.cloud.StubPathOutputCommitterFactory"
+
+  /**
+   * Given a hadoop configuration, set up the factory binding for the scheme.
+   * @param conf config to patch
+   * @param scheme filesystem scheme.
+   */
+  def bind(conf: Configuration, scheme: String): Unit = {
+    val key = String.format(
+      PathCommitterConstants.OUTPUTCOMMITTER_FACTORY_SCHEME_PATTERN, scheme)
+    conf.set(key, Name)
+  }
+
+}
diff --git a/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala b/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala
new file mode 100644
index 0000000000000..bbf86e8a5fc00
--- /dev/null
+++ b/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.io.cloud
+
+/**
+ * Constants related to Hadoop committer setup and configuration.
+ * Most of these are scattered around the hadoop-mapreduce classes.
+ */
+object PathCommitterConstants {
+
+  /**
+   * Scheme prefix for per-filesystem scheme committers.
+   */
+  val OUTPUTCOMMITTER_FACTORY_SCHEME = "mapreduce.outputcommitter.factory.scheme"
+
+  /**
+   * String format pattern for per-filesystem scheme committers.
+   */
+  val OUTPUTCOMMITTER_FACTORY_SCHEME_PATTERN: String =
+    OUTPUTCOMMITTER_FACTORY_SCHEME + ".%s"
+
+  /**
+   * Name of the configuration option used to configure the
+   * output committer factory to use unless there is a specific
+   * one for a scheme.
+   */
+  val OUTPUTCOMMITTER_FACTORY_CLASS = "mapreduce.outputcommitter.factory.class"
+
+  /** Default committer factory. */
+  val DEFAULT_COMMITTER_FACTORY =
+    "org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory"
+
+  /**
+   * The committer which can be directly instantiated and which then delegates
+   * all operations to the factory-created committer it creates itself.
+   */
+  val BINDING_PATH_OUTPUT_COMMITTER_CLASS =
+    "org.apache.hadoop.mapreduce.lib.output.BindingPathOutputCommitter"
+
+  /**
+   * Classname of a parquet committer which just hands off to the
+   * `BindingPathOutputCommitter` in hadoop-mapreduce, which takes on the
+   * task of binding to the current factory.
+   */
+  val BINDING_PARQUET_OUTPUT_COMMITTER_CLASS =
+    "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter"
+
+  /** hadoop-mapreduce option to choose the algorithm. */
+  val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION = "mapreduce.fileoutputcommitter.algorithm.version"
+
+  /** The default committer is not actually safe during task commit failures. */
+  val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT = 2
+
+  /** Skip cleanup _temporary folders under job's output directory? */
+  val FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED = "mapreduce.fileoutputcommitter.cleanup.skipped"
+
+  /**
+   * This is the "Pending" directory of the FileOutputCommitter;
+   * data written here is, in that algorithm, renamed into place.
+   */
+  val TEMP_DIR_NAME = "_temporary"
+
+  /**
+   * Name of the marker file created on success.
+   * This is a 0-byte file with the FileOutputCommitter; object store committers
+   * often add a (non-standard) manifest here.
+   */
+  val SUCCESS_FILE_NAME = "_SUCCESS"
+
+  /** hadoop-mapreduce option to enable the _SUCCESS marker. */
+  val CREATE_SUCCESSFUL_JOB_OUTPUT_DIR_MARKER = "mapreduce.fileoutputcommitter.marksuccessfuljobs"
+}
diff --git a/hadoop-cloud/src/test/scala/.keep b/hadoop-cloud/src/test/scala/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pom.xml b/pom.xml
index 0a711f287a53f..9a07c8d87a6c1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2671,6 +2671,15 @@
       </properties>
     </profile>
 
+    <profile>
+      <id>hadoop-3</id>
+      <properties>
+        <hadoop.version>3.1.0-SNAPSHOT</hadoop.version>
+        <curator.version>2.12.0</curator.version>
+        <zookeeper.version>3.4.9</zookeeper.version>
+      </properties>
+    </profile>
+
     <profile>
       <id>yarn</id>
       <modules>

From 016d69090691631343d37f9704d0f37a84ddf297 Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Thu, 29 Mar 2018 16:04:02 +0100
Subject: [PATCH 2/9] SPARK-23807 review set 1: * hadoop branch-2 dependencies
 always declared * minor nits in POM addressed * added log4j.properties for
 tests

Change-Id: Ibb64b20a0be8624d1709e592b9fe85bdc4dd1af7
---
 hadoop-cloud/pom.xml                          | 265 +++++-------------
 .../src/test/resources/log4j.properties       |  36 +++
 2 files changed, 111 insertions(+), 190 deletions(-)
 create mode 100644 hadoop-cloud/src/test/resources/log4j.properties

diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
index cc520aed6e17c..880759eb57c11 100644
--- a/hadoop-cloud/pom.xml
+++ b/hadoop-cloud/pom.xml
@@ -64,7 +64,80 @@
       <version>${hadoop.version}</version>
       <scope>provided</scope>
     </dependency>
+    <!--
+      the AWS module pulls in jackson; its transitive dependencies can create
+      intra-jackson-module version problems.
+      -->
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-aws</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>${hadoop.deps.scope}</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-common</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+          <artifactId>jackson-mapper-asl</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+          <artifactId>jackson-core-asl</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-core</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-databind</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.fasterxml.jackson.core</groupId>
+          <artifactId>jackson-annotations</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-openstack</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>${hadoop.deps.scope}</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-common</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mockito</groupId>
+          <artifactId>mockito-all</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
 
+    <!--
+    Add joda time to ensure that anything downstream which doesn't pull in spark-hive
+    gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs
+    -->
+    <dependency>
+      <groupId>joda-time</groupId>
+      <artifactId>joda-time</artifactId>
+      <scope>${hadoop.deps.scope}</scope>
+    </dependency>
     <!-- explicitly declare the jackson artifacts desired -->
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
@@ -93,95 +166,10 @@
       <artifactId>httpcore</artifactId>
       <scope>${hadoop.deps.scope}</scope>
     </dependency>
-
   </dependencies>
 
   <profiles>
 
-    <!-- this inner profile is the default one and includes openstack and aws -->
-    <profile>
-      <id>hadoop-2.6</id>
-      <activation>
-        <activeByDefault>true</activeByDefault>
-      </activation>
-      <dependencies>
-        <!--
-          the AWS module pulls in jackson; its transitive dependencies can create
-          intra-jackson-module version problems.
-          -->
-        <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-aws</artifactId>
-          <version>${hadoop.version}</version>
-          <scope>${hadoop.deps.scope}</scope>
-          <exclusions>
-            <exclusion>
-              <groupId>org.apache.hadoop</groupId>
-              <artifactId>hadoop-common</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>commons-logging</groupId>
-              <artifactId>commons-logging</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>org.codehaus.jackson</groupId>
-              <artifactId>jackson-mapper-asl</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>org.codehaus.jackson</groupId>
-              <artifactId>jackson-core-asl</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>com.fasterxml.jackson.core</groupId>
-              <artifactId>jackson-core</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>com.fasterxml.jackson.core</groupId>
-              <artifactId>jackson-databind</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>com.fasterxml.jackson.core</groupId>
-              <artifactId>jackson-annotations</artifactId>
-            </exclusion>
-          </exclusions>
-        </dependency>
-        <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-openstack</artifactId>
-          <version>${hadoop.version}</version>
-          <scope>${hadoop.deps.scope}</scope>
-          <exclusions>
-            <exclusion>
-              <groupId>org.apache.hadoop</groupId>
-              <artifactId>hadoop-common</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>commons-logging</groupId>
-              <artifactId>commons-logging</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>junit</groupId>
-              <artifactId>junit</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>org.mockito</groupId>
-              <artifactId>mockito-all</artifactId>
-            </exclusion>
-          </exclusions>
-        </dependency>
-
-        <!--
-        Add joda time to ensure that anything downstream which doesn't pull in spark-hive
-        gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs.
-        -->
-        <dependency>
-          <groupId>joda-time</groupId>
-          <artifactId>joda-time</artifactId>
-          <scope>${hadoop.deps.scope}</scope>
-        </dependency>
-      </dependencies>
-    </profile>
-
     <profile>
       <id>hadoop-2.7</id>
       <!-- 2.7+ adds the azure Jar to the set of dependencies -->
@@ -214,108 +202,6 @@
             </exclusion>
           </exclusions>
         </dependency>
-        <!--
-          the AWS module pulls in jackson; its transitive dependencies can create
-          intra-jackson-module version problems.
-          -->
-        <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-aws</artifactId>
-          <version>${hadoop.version}</version>
-          <scope>${hadoop.deps.scope}</scope>
-          <exclusions>
-            <exclusion>
-              <groupId>org.apache.hadoop</groupId>
-              <artifactId>hadoop-common</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>commons-logging</groupId>
-              <artifactId>commons-logging</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>org.codehaus.jackson</groupId>
-              <artifactId>jackson-mapper-asl</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>org.codehaus.jackson</groupId>
-              <artifactId>jackson-core-asl</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>com.fasterxml.jackson.core</groupId>
-              <artifactId>jackson-core</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>com.fasterxml.jackson.core</groupId>
-              <artifactId>jackson-databind</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>com.fasterxml.jackson.core</groupId>
-              <artifactId>jackson-annotations</artifactId>
-            </exclusion>
-          </exclusions>
-        </dependency>
-        <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-openstack</artifactId>
-          <version>${hadoop.version}</version>
-          <scope>${hadoop.deps.scope}</scope>
-          <exclusions>
-            <exclusion>
-              <groupId>org.apache.hadoop</groupId>
-              <artifactId>hadoop-common</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>commons-logging</groupId>
-              <artifactId>commons-logging</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>junit</groupId>
-              <artifactId>junit</artifactId>
-            </exclusion>
-            <exclusion>
-              <groupId>org.mockito</groupId>
-              <artifactId>mockito-all</artifactId>
-            </exclusion>
-          </exclusions>
-        </dependency>
-
-        <!--
-        Add joda time to ensure that anything downstream which doesn't pull in spark-hive
-        gets the correct joda time artifact, so doesn't have auth failures on later Java 8 JVMs
-        -->
-        <dependency>
-          <groupId>joda-time</groupId>
-          <artifactId>joda-time</artifactId>
-          <scope>${hadoop.deps.scope}</scope>
-        </dependency>
-        <!-- explicitly declare the jackson artifacts desired -->
-        <dependency>
-          <groupId>com.fasterxml.jackson.core</groupId>
-          <artifactId>jackson-databind</artifactId>
-          <scope>${hadoop.deps.scope}</scope>
-        </dependency>
-        <dependency>
-          <groupId>com.fasterxml.jackson.core</groupId>
-          <artifactId>jackson-annotations</artifactId>
-          <scope>${hadoop.deps.scope}</scope>
-        </dependency>
-        <dependency>
-          <groupId>com.fasterxml.jackson.dataformat</groupId>
-          <artifactId>jackson-dataformat-cbor</artifactId>
-          <version>${fasterxml.jackson.version}</version>
-        </dependency>
-        <!--Explicit declaration to force in Spark version into transitive dependencies -->
-        <dependency>
-          <groupId>org.apache.httpcomponents</groupId>
-          <artifactId>httpclient</artifactId>
-          <scope>${hadoop.deps.scope}</scope>
-        </dependency>
-        <!--Explicit declaration to force in Spark version into transitive dependencies -->
-        <dependency>
-          <groupId>org.apache.httpcomponents</groupId>
-          <artifactId>httpcore</artifactId>
-          <scope>${hadoop.deps.scope}</scope>
-        </dependency>
       </dependencies>
     </profile>
 
@@ -332,7 +218,8 @@
 
       <build>
         <plugins>
-          <!-- Include a source dir depending on the Scala version -->
+          <!-- Include a source dir for Hadoop 3 only; will only compile against
+               Hadoop 3.0.2+ -->
           <plugin>
             <groupId>org.codehaus.mojo</groupId>
             <artifactId>build-helper-maven-plugin</artifactId>
@@ -364,10 +251,8 @@
             </executions>
           </plugin>
         </plugins>
-
       </build>
       <dependencies>
-
         <!--
         There's now a hadoop-cloud-storage which transitively pulls in the store JARs,
         but it still needs some selective exclusion across versions, especially 3.0.x.
diff --git a/hadoop-cloud/src/test/resources/log4j.properties b/hadoop-cloud/src/test/resources/log4j.properties
new file mode 100644
index 0000000000000..fb9d9851cb4de
--- /dev/null
+++ b/hadoop-cloud/src/test/resources/log4j.properties
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the file target/unit-tests.log
+test.appender=file
+log4j.rootCategory=INFO, ${test.appender}
+log4j.appender.file=org.apache.log4j.FileAppender
+log4j.appender.file.append=true
+log4j.appender.file.file=target/unit-tests.log
+log4j.appender.file.layout=org.apache.log4j.PatternLayout
+log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
+
+# Tests that launch java subprocesses can set the "test.appender" system property to
+# "console" to avoid having the child process's logs overwrite the unit test's
+# log file.
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%t: %m%n
+
+# Ignore messages below warning level from Jetty, because it's a bit verbose
+log4j.logger.org.spark_project.jetty=WARN

From 942365763f90260e671629b519ce3dbbf7e5455e Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Tue, 3 Apr 2018 14:36:52 +0100
Subject: [PATCH 3/9] SPARK-23807 move new hadoop-cloud source out to new PR;
 this contains the build with all the POM changes other than those adding the
 optional hadoop-3.0.2+ source tree to the spark-hadoop-cloud build

Change-Id: Iccc2b66602db05db132ce5cf5c8546fe9a13a3fa
---
 hadoop-cloud/pom.xml                          |  41 ---
 .../cloud/BindingParquetOutputCommitter.scala | 122 --------
 .../io/cloud/PathOutputCommitProtocol.scala   | 260 ------------------
 .../spark/internal/io/cloud/package.scala     | 105 -------
 .../io/cloud/CommitterBindingSuite.scala      |  86 ------
 .../io/cloud/StubPathOutputCommitter.scala    | 110 --------
 .../io/cloud/PathCommitterConstants.scala     |  87 ------
 .../src/test/resources/log4j.properties       |  36 ---
 hadoop-cloud/src/test/scala/.keep             |   0
 9 files changed, 847 deletions(-)
 delete mode 100644 hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala
 delete mode 100644 hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
 delete mode 100644 hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala
 delete mode 100644 hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
 delete mode 100644 hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala
 delete mode 100644 hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala
 delete mode 100644 hadoop-cloud/src/test/resources/log4j.properties
 delete mode 100644 hadoop-cloud/src/test/scala/.keep

diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
index 880759eb57c11..48187c7a6ff3d 100644
--- a/hadoop-cloud/pom.xml
+++ b/hadoop-cloud/pom.xml
@@ -211,47 +211,6 @@
     -->
     <profile>
       <id>hadoop-3</id>
-      <properties>
-        <extra.source.dir>src/hadoop-3/main/scala</extra.source.dir>
-        <extra.testsource.dir>src/hadoop-3/test/scala</extra.testsource.dir>
-      </properties>
-
-      <build>
-        <plugins>
-          <!-- Include a source dir for Hadoop 3 only; will only compile against
-               Hadoop 3.0.2+ -->
-          <plugin>
-            <groupId>org.codehaus.mojo</groupId>
-            <artifactId>build-helper-maven-plugin</artifactId>
-            <executions>
-              <execution>
-                <id>add-scala-sources</id>
-                <phase>generate-sources</phase>
-                <goals>
-                  <goal>add-source</goal>
-                </goals>
-                <configuration>
-                  <sources>
-                    <source>${extra.source.dir}</source>
-                  </sources>
-                </configuration>
-              </execution>
-              <execution>
-                <id>add-scala-test-sources</id>
-                <phase>generate-test-sources</phase>
-                <goals>
-                  <goal>add-test-source</goal>
-                </goals>
-                <configuration>
-                  <sources>
-                    <source>${extra.testsource.dir}</source>
-                  </sources>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-        </plugins>
-      </build>
       <dependencies>
         <!--
         There's now a hadoop-cloud-storage which transitively pulls in the store JARs,
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala
deleted file mode 100644
index f2a2d208291fc..0000000000000
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/BindingParquetOutputCommitter.scala
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.internal.io.cloud
-
-import java.io.IOException
-
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.mapreduce.lib.output.{BindingPathOutputCommitter, PathOutputCommitter}
-import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext}
-import org.apache.parquet.hadoop.ParquetOutputCommitter
-
-import org.apache.spark.internal.Logging
-
-
-/**
- * This dynamically binds to the factory-configured
- * output committer, and is intended to allow callers to use any [[PathOutputCommitter]],
- * even if not a subclass of [[ParquetOutputCommitter]].
- *
- * The Parquet "parquet.enable.summary-metadata" option will only be supported
- * if the instantiated committer itself supports it.
- */
-
-class BindingParquetOutputCommitter(
-    path: Path,
-    context: TaskAttemptContext)
-  extends ParquetOutputCommitter(path, context) with Logging {
-
-  logInfo(s"${this.getClass.getName} binding to configured PathOutputCommitter and dest $path")
-
-  val committer = new BindingPathOutputCommitter(path, context)
-
-  /**
-   * This is the committer ultimately bound to.
-   * @return the committer instantiated by the factory.
-   */
-  def boundCommitter(): PathOutputCommitter = {
-    committer.getCommitter()
-  }
-
-  override def getWorkPath: Path = {
-    committer.getWorkPath()
-  }
-
-  override def setupTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    committer.setupTask(taskAttemptContext)
-  }
-
-  override def commitTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    committer.commitTask(taskAttemptContext)
-  }
-
-  override def abortTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    committer.abortTask(taskAttemptContext)
-  }
-
-  override def setupJob(jobContext: JobContext): Unit = {
-    committer.setupJob(jobContext)
-  }
-
-  override def needsTaskCommit(taskAttemptContext: TaskAttemptContext): Boolean = {
-    committer.needsTaskCommit(taskAttemptContext)
-  }
-
-  override def cleanupJob(jobContext: JobContext): Unit = {
-    committer.cleanupJob(jobContext)
-  }
-
-  override def isCommitJobRepeatable(jobContext: JobContext): Boolean = {
-    committer.isCommitJobRepeatable(jobContext)
-  }
-
-  override def commitJob(jobContext: JobContext): Unit = {
-    committer.commitJob(jobContext)
-  }
-
-  override def recoverTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    committer.recoverTask(taskAttemptContext)
-  }
-
-  /**
-   * Abort the job; log and ignore any IO exception thrown.
-   *
-   * @param jobContext job context
-   * @param state final state of the job
-   */
-  override def abortJob(
-      jobContext: JobContext,
-      state: JobStatus.State): Unit = {
-    try {
-      committer.abortJob(jobContext, state)
-    } catch {
-      case e: IOException =>
-        logWarning("Abort job failed", e)
-    }
-  }
-
-  override def isRecoverySupported: Boolean = {
-    committer.isRecoverySupported()
-  }
-
-  override def isRecoverySupported(jobContext: JobContext): Boolean = {
-    committer.isRecoverySupported(jobContext)
-  }
-
-  override def toString: String = s"BindingParquetOutputCommitter($committer)"
-}
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
deleted file mode 100644
index 5645ad53c1bb2..0000000000000
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.internal.io.cloud
-
-import java.io.IOException
-
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
-import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
-
-import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}
-import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
-
-/**
- * Spark Commit protocol for Path Output Committers.
- * This committer will work with the `FileOutputCommitter` and subclasses.
- * All implementations *must* be serializable.
- *
- * Rather than ask the `FileOutputFormat` for a committer, it uses the
- * `org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory` factory
- * API to create the committer.
- * This is what [[org.apache.hadoop.mapreduce.lib.output.FileOutputFormat]] does,
- * but as [[HadoopMapReduceCommitProtocol]] still uses the original
- * `org.apache.hadoop.mapred.FileOutputFormat` binding
- * subclasses do not do this, overrides those subclasses to using the
- * factory mechanism now supported in the base class.
- *
- * In `setupCommitter` the factory is bonded to and the committer for
- * the destination path chosen.
- *
- * @constructor Instantiate. dynamic partition overwrite is not supported,
- *              so that committers for stores which do not support rename
- *              will not get confused.
- * @param jobId                     job
- * @param destination               destination
- * @param dynamicPartitionOverwrite does the caller want support for dynamic
- *                                  partition overwrite. If so, it will be
- *                                  refused.
- * @throws IOException when an unsupported dynamicPartitionOverwrite option is supplied.
- */
-class PathOutputCommitProtocol(
-  jobId: String,
-  destination: String,
-  dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(
-    jobId,
-    destination,
-    false) with Serializable {
-
-  @transient var committer: PathOutputCommitter = _
-
-  require(destination != null, "Null destination specified")
-
-  val destPath = new Path(destination)
-
-  logInfo(s"Instantiated committer with job ID=$jobId;" +
-    s" destination=$destPath;" +
-    s" dynamicPartitionOverwrite=$dynamicPartitionOverwrite")
-
-  if (dynamicPartitionOverwrite) {
-    // until there's explicit extensions to the PathOutputCommitProtocols
-    // to support the spark mechanism, it's left to the individual committer
-    // choice to handle partitioning.
-    throw new IOException("PathOutputCommitProtocol does not support dynamicPartitionOverwrite")
-  }
-
-  import PathOutputCommitProtocol._
-
-  /**
-   * Set up the committer.
-   * This creates it by talking directly to the Hadoop factories, instead
-   * of the V1 `mapred.FileOutputFormat` methods.
-   * @param context task attempt
-   * @return the committer to use. This will always be a subclass of
-   *         [[PathOutputCommitter]].
-   */
-  override protected def setupCommitter(
-    context: TaskAttemptContext): PathOutputCommitter = {
-
-    logInfo(s"Setting up committer for path $destination")
-    committer = PathOutputCommitterFactory.createCommitter(destPath, context)
-
-    // Special feature to force out the FileOutputCommitter, so as to guarantee
-    // that the binding is working properly.
-    val rejectFileOutput = context.getConfiguration
-      .getBoolean(REJECT_FILE_OUTPUT, REJECT_FILE_OUTPUT_DEFVAL)
-    if (rejectFileOutput && committer.isInstanceOf[FileOutputCommitter]) {
-      // the output format returned a file output format committer, which
-      // is exactly what we do not want. So switch back to the factory.
-      val factory = PathOutputCommitterFactory.getCommitterFactory(
-        destPath,
-        context.getConfiguration)
-      logInfo(s"Using committer factory $factory")
-      committer = factory.createOutputCommitter(destPath, context)
-    }
-
-    logInfo(s"Using committer ${committer.getClass}")
-    logInfo(s"Committer details: $committer")
-    if (committer.isInstanceOf[FileOutputCommitter]) {
-      require(!rejectFileOutput,
-        s"Committer created is the FileOutputCommitter $committer")
-
-      if (committer.isCommitJobRepeatable(context)) {
-        // If FileOutputCommitter says its job commit is repeatable, it means
-        // it is using the v2 algorithm, which is not safe for task commit
-        // failures. Warn
-        logWarning(s"Committer $committer may not be tolerant of task commit failures")
-      }
-    }
-    committer
-  }
-
-  /**
-   * Create a temporary file for a task.
-   *
-   * @param taskContext task context
-   * @param dir         optional subdirectory
-   * @param ext         file extension
-   * @return a path as a string
-   */
-  override def newTaskTempFile(
-    taskContext: TaskAttemptContext,
-    dir: Option[String],
-    ext: String): String = {
-
-    val workDir = committer.getWorkPath
-    val parent = dir.map(d => new Path(workDir, d)).getOrElse(workDir)
-    val file = new Path(parent, buildFilename(taskContext, ext))
-    logInfo(s"Creating task file $file for dir $dir and ext $ext")
-    file.toString
-  }
-
-  /**
-   * Absolute files are still renamed into place with a warning.
-   *
-   * @param taskContext task
-   * @param absoluteDir destination dir
-   * @param ext         extension
-   * @return an absolute path
-   */
-  override def newTaskTempFileAbsPath(
-    taskContext: TaskAttemptContext,
-    absoluteDir: String,
-    ext: String): String = {
-
-    val file = super.newTaskTempFileAbsPath(taskContext, absoluteDir, ext)
-    logWarning(
-      s"Creating temporary file $file for absolute path for dir $absoluteDir")
-    file
-  }
-
-  /**
-   * Build a filename which is unique across all task events.
-   * It does not have to be consistent across multiple attempts of the same
-   * task or job.
-   *
-   * @param taskContext task context
-   * @param ext         extension
-   * @return a name for a file which must be unique across all task attempts
-   */
-  protected def buildFilename(
-    taskContext: TaskAttemptContext,
-    ext: String): String = {
-
-    // The file name looks like part-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003-c000.parquet
-    // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
-    // the file name is fine and won't overflow.
-    val split = taskContext.getTaskAttemptID.getTaskID.getId
-    f"part-$split%05d-$jobId$ext"
-  }
-
-  override def setupJob(jobContext: JobContext): Unit = {
-    logInfo("setup job")
-    super.setupJob(jobContext)
-  }
-
-  override def commitJob(
-    jobContext: JobContext,
-    taskCommits: Seq[FileCommitProtocol.TaskCommitMessage]): Unit = {
-    logInfo(s"commit job with ${taskCommits.length} task commit message(s)")
-    super.commitJob(jobContext, taskCommits)
-  }
-
-  /**
-   * Abort the job; log and ignore any IO exception thrown.
-   *
-   * @param jobContext job context
-   */
-  override def abortJob(jobContext: JobContext): Unit = {
-    try {
-      super.abortJob(jobContext)
-    } catch {
-      case e: IOException =>
-        logWarning("Abort job failed", e)
-    }
-  }
-
-  override def setupTask(taskContext: TaskAttemptContext): Unit = {
-    super.setupTask(taskContext)
-  }
-
-  override def commitTask(
-    taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
-    logInfo("Commit task")
-    super.commitTask(taskContext)
-  }
-
-  /**
-   * Abort the task; log and ignore any failure thrown.
-   *
-   * @param taskContext context
-   */
-  override def abortTask(taskContext: TaskAttemptContext): Unit = {
-    logInfo("Abort task")
-    try {
-      super.abortTask(taskContext)
-    } catch {
-      case e: IOException =>
-        logWarning("Abort task failed", e)
-    }
-  }
-
-  override def onTaskCommit(msg: TaskCommitMessage): Unit = {
-    logInfo(s"onTaskCommit($msg)")
-  }
-}
-
-object PathOutputCommitProtocol {
-
-  /**
-   * Hadoop configuration option.
-   * Fail fast if the committer is using the path output protocol.
-   * This option can be used to catch configuration issues early.
-   *
-   * It's mostly relevant when testing/diagnostics, as it can be used to
-   * enforce that schema-specific options are triggering a switch
-   * to a new committer.
-   */
-  val REJECT_FILE_OUTPUT = "pathoutputcommit.reject.fileoutput"
-
-  /**
-   * Default behavior: accept the file output.
-   */
-  val REJECT_FILE_OUTPUT_DEFVAL = false
-}
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala
deleted file mode 100644
index d2a0cd28f95c1..0000000000000
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/package.scala
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.internal.io
-
-import org.apache.spark.SparkConf
-import org.apache.spark.sql.internal.SQLConf
-
-/**
- * Package object to assist in switching to the Hadoop Hadoop 3
- * [[org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory]] factory
- * mechanism for dynamically loading committers for the destination stores.
- *
- * = Using Alternative Committers with Spark and Hadoop 3 =
- *
- * Hadoop 3.1 adds a means to select a different output committer when writing
- * data to object stores. This can provide higher performance as well as
- * addressing the consistency and atomicity problems encountered on some filesystems.
- *
- * Every object store can implement its own committer factory: the factory
- * itself will then instantiated the committer of its choice.
- *
- * == Prerequisites ==
- *
- * Apache Hadoop 3.0.2 or later for the factory APIs, for the S3A connectors, Hadoop 3.1+
- *
- * The Hadoop cluster needs to be configured for the binding from filesystem scheme
- * to factory. In Hadoop 3.1 this is done automatically for s3a in the file
- * `mapred-default.xml`.
- * Other stores' committers may need to be explicitly declared.
- *
- * {{{
- *   <property>
- *   <name>mapreduce.outputcommitter.factory.scheme.s3a</name>
- *   <value>org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory</value>
- *   <description>
- *     The committer factory to use when writing data to S3A filesystems.
- *     If mapreduce.outputcommitter.factory.class is set, it will
- *     override this property.
- *   </description>
- * </property>
- * }}}
- *
- * == Binding a Spark Context to use the new committers for a store ==
- *
- * Spark uses the Hadoop committers in
- * [[org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand]]
- * by instantiating and then invoking an instance of
- * [[org.apache.spark.internal.io.HadoopMapReduceCommitProtocol]].
- * `InsertIntoHadoopFsRelationCommand` needs to be configured to use
- * [[org.apache.spark.internal.io.cloud.PathOutputCommitProtocol]] as
- * the commit protocol to use. This instantiates the committer through
- * the factory mechanism, and relays operations to it.
- *
- * When working with Parquet data, you need to explicitly switch
- * the Parquet committers to use the same mechanism
- *
- * In `spark-defaults.conf`, everything can be set up with the following settings:
- * {{{
- *   spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
- *   spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
- * }}}
- *
- * It can be done programmatically by calling [[cloud.bind()]] on the
- * spark configuration.
- */
-package object cloud {
-
-  /**
-   * Options for committer setup.
-   * When applied to a spark configuration, this will set the
-   * Dataframe output to use the factory mechanism for writing data for
-   * all file formats.
-   */
-  val COMMITTER_BINDING_OPTIONS: Map[String, String] = Map(
-    SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key ->
-      classOf[BindingParquetOutputCommitter].getName,
-    SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
-      classOf[PathOutputCommitProtocol].getName)
-
-  /**
-   * Set the options defined in [[cloud.COMMITTER_BINDING_OPTIONS]] on the
-   * spark context.
-   *
-   * @param sparkConf spark configuration to bind.
-   */
-  def bind(sparkConf: SparkConf): Unit = {
-    sparkConf.setAll(COMMITTER_BINDING_OPTIONS)
-  }
-
-}
diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
deleted file mode 100644
index 8654e9df3f06c..0000000000000
--- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.internal.io.cloud
-
-import java.io.IOException
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
-import org.apache.hadoop.mapreduce.{Job, JobStatus, MRJobConfig, TaskAttemptID}
-
-import org.apache.spark.{SparkConf, SparkFunSuite}
-import org.apache.spark.internal.io.cloud
-import org.apache.spark.internal.io.cloud.PathCommitterConstants._
-
-/**
- * Test committer binding logic.
- */
-class CommitterBindingSuite extends SparkFunSuite {
-
-
-  private val jobId = "2007071202143_0101"
-  private val attempt0 = "attempt_" + jobId + "_m_000000_0"
-  private val taskAttempt0 = TaskAttemptID.forName(attempt0)
-
-  /**
-   * Does the
-   * [[BindingParquetOutputCommitter]] committer bind to the schema-specific
-   * committer declared for the destination path?
-   */
-  test("BindingParquetOutputCommitter will bind") {
-    val path = new Path("http://example/data")
-    val job = newJob(path)
-    val conf = job.getConfiguration
-    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0)
-    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 1)
-
-    StubPathOutputCommitterFactory.bind(conf, "http")
-    val tContext = new TaskAttemptContextImpl(conf, taskAttempt0)
-    val parquet = new BindingParquetOutputCommitter(path, tContext)
-    val inner = parquet.boundCommitter().asInstanceOf[StubPathOutputCommitter]
-    parquet.setupJob(tContext)
-    assert(inner.setup, s"$inner not setup")
-    parquet.commitJob(tContext)
-    assert(inner.committed, s"$inner not committed")
-    parquet.abortJob(tContext, JobStatus.State.RUNNING)
-    assert(inner.aborted, s"$inner not aborted")
-  }
-
-  test("cloud binding") {
-    val sc = new SparkConf()
-    cloud.bind(sc)
-  }
-
-  /**
-   * Create a a new job. Sets the task attempt ID.
-   *
-   * @return the new job
-   * @throws IOException failure
-   */
-  @throws[IOException]
-  def newJob(outDir: Path): Job = {
-    val job = Job.getInstance(new Configuration())
-    val conf = job.getConfiguration
-    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt0)
-    conf.setBoolean(CREATE_SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)
-    FileOutputFormat.setOutputPath(job, outDir)
-    job
-  }
-}
diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala
deleted file mode 100644
index 0a67c71b58c6c..0000000000000
--- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/StubPathOutputCommitter.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.internal.io.cloud
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.mapreduce.lib.output.{PathOutputCommitter, PathOutputCommitterFactory}
-import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext}
-
-/**
- * A local path output committer which tracks its state, for use in
- * tests.
- * @param outputPath final destination.
- * @param workPath work path
- * @param context task/job attempt.
- */
-class StubPathOutputCommitter(
-    outputPath: Path,
-    workPath: Path,
-    context: TaskAttemptContext) extends PathOutputCommitter(workPath, context) {
-
-  var setup: Boolean = false
-  var committed: Boolean = false
-  var aborted: Boolean = false
-
-  override def getOutputPath: Path = outputPath
-
-  override def getWorkPath: Path = {
-    workPath
-  }
-
-  override def setupTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    setup = true
-  }
-
-  override def abortTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    aborted = true
-  }
-
-  override def setupJob(jobContext: JobContext): Unit = {
-    setup = true
-  }
-
-  override def commitTask(taskAttemptContext: TaskAttemptContext): Unit = {
-    committed = true
-  }
-
-  override def commitJob(jobContext: JobContext): Unit = {
-    committed = true
-  }
-
-  override def abortJob(
-      jobContext: JobContext,
-      state: JobStatus.State): Unit = {
-    aborted = true
-  }
-
-  override def needsTaskCommit(taskAttemptContext: TaskAttemptContext): Boolean = {
-    true
-  }
-
-  override def toString(): String  = s"StubPathOutputCommitter(setup=$setup," +
-    s" committed=$committed, aborted=$aborted)"
-}
-
-/**
- * Factory.
- */
-class StubPathOutputCommitterFactory extends PathOutputCommitterFactory {
-
-  override def createOutputCommitter(
-      outputPath: Path,
-      context: TaskAttemptContext): PathOutputCommitter = {
-    new StubPathOutputCommitter(outputPath, workPath(outputPath), context)
-  }
-
-
-  private def workPath(out: Path): Path = new Path(out, PathCommitterConstants.TEMP_DIR_NAME)
-}
-
-object StubPathOutputCommitterFactory {
-  val Name: String = "org.apache.spark.internal.io.cloud.StubPathOutputCommitterFactory"
-
-  /**
-   * Given a hadoop configuration, set up the factory binding for the scheme.
-   * @param conf config to patch
-   * @param scheme filesystem scheme.
-   */
-  def bind(conf: Configuration, scheme: String): Unit = {
-    val key = String.format(
-      PathCommitterConstants.OUTPUTCOMMITTER_FACTORY_SCHEME_PATTERN, scheme)
-    conf.set(key, Name)
-  }
-
-}
diff --git a/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala b/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala
deleted file mode 100644
index bbf86e8a5fc00..0000000000000
--- a/hadoop-cloud/src/main/scala/org/apache/spark/internal/io/cloud/PathCommitterConstants.scala
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.internal.io.cloud
-
-/**
- * Constants related to Hadoop committer setup and configuration.
- * Most of these are scattered around the hadoop-mapreduce classes.
- */
-object PathCommitterConstants {
-
-  /**
-   * Scheme prefix for per-filesystem scheme committers.
-   */
-  val OUTPUTCOMMITTER_FACTORY_SCHEME = "mapreduce.outputcommitter.factory.scheme"
-
-  /**
-   * String format pattern for per-filesystem scheme committers.
-   */
-  val OUTPUTCOMMITTER_FACTORY_SCHEME_PATTERN: String =
-    OUTPUTCOMMITTER_FACTORY_SCHEME + ".%s"
-
-  /**
-   * Name of the configuration option used to configure the
-   * output committer factory to use unless there is a specific
-   * one for a schema.
-   */
-  val OUTPUTCOMMITTER_FACTORY_CLASS = "mapreduce.pathoutputcommitter.factory.class"
-
-  /** Default committer factory. */
-  val DEFAULT_COMMITTER_FACTORY =
-    "org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory"
-
-  /**
-   * The committer which can be directly instantiated and which then delegates
-   * all operations to the factory-created committer it creates itself.
-   */
-  val BINDING_PATH_OUTPUT_COMMITTER_CLASS =
-    "org.apache.hadoop.mapreduce.lib.output.BindingPathOutputCommitter"
-
-  /**
-   * Classname of a parquet committer which just hands off to the
-   * `BindingPathOutputCommitter` in hadoop-mapreduce, which takes on the
-   * task of binding to the current factory.
-   */
-  val BINDING_PARQUET_OUTPUT_COMMITTER_CLASS =
-    "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter"
-
-  /** hadoop-mapreduce option to choose the algorithm. */
-  val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION = "mapreduce.fileoutputcommitter.algorithm.version"
-
-  /** The default committer is not actually safe during task commit failures. */
-  val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT = 2
-
-  /** Skip cleanup _temporary folders under job's output directory? */
-  val FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED = "mapreduce.fileoutputcommitter.cleanup.skipped"
-
-  /**
-   * This is the "Pending" directory of the FileOutputCommitter;
-   * data written here is, in that algorithm, renamed into place.
-   */
-  val TEMP_DIR_NAME = "_temporary"
-
-  /**
-   * Name of the marker file created on success.
-   * This is a 0-byte file with the FileOutputCommitter; object store committers
-   * often add a (non-standard) manifest here.
-   */
-  val SUCCESS_FILE_NAME = "_SUCCESS"
-
-  /** hadoop-mapreduce option to enable the _SUCCESS marker. */
-  val CREATE_SUCCESSFUL_JOB_OUTPUT_DIR_MARKER = "mapreduce.fileoutputcommitter.marksuccessfuljobs"
-}
diff --git a/hadoop-cloud/src/test/resources/log4j.properties b/hadoop-cloud/src/test/resources/log4j.properties
deleted file mode 100644
index fb9d9851cb4de..0000000000000
--- a/hadoop-cloud/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Set everything to be logged to the file target/unit-tests.log
-test.appender=file
-log4j.rootCategory=INFO, ${test.appender}
-log4j.appender.file=org.apache.log4j.FileAppender
-log4j.appender.file.append=true
-log4j.appender.file.file=target/unit-tests.log
-log4j.appender.file.layout=org.apache.log4j.PatternLayout
-log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
-
-# Tests that launch java subprocesses can set the "test.appender" system property to
-# "console" to avoid having the child process's logs overwrite the unit test's
-# log file.
-log4j.appender.console=org.apache.log4j.ConsoleAppender
-log4j.appender.console.target=System.err
-log4j.appender.console.layout=org.apache.log4j.PatternLayout
-log4j.appender.console.layout.ConversionPattern=%t: %m%n
-
-# Ignore messages below warning level from Jetty, because it's a bit verbose
-log4j.logger.org.spark_project.jetty=WARN
diff --git a/hadoop-cloud/src/test/scala/.keep b/hadoop-cloud/src/test/scala/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000

From 58c04e92da4f394b4983e48981f32040e92600e0 Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Tue, 3 Apr 2018 15:16:45 +0100
Subject: [PATCH 4/9] HADOOP-13207 and switch to the RC hadoop 3.1

Change-Id: Ic13caf5fcf96d617085051579ede8380b2106119
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 9a07c8d87a6c1..9dff90ca2ab49 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2674,7 +2674,7 @@
     <profile>
       <id>hadoop-3</id>
       <properties>
-        <hadoop.version>3.1.0-SNAPSHOT</hadoop.version>
+        <hadoop.version>3.1.0</hadoop.version>
         <curator.version>2.12.0</curator.version>
         <zookeeper.version>3.4.9</zookeeper.version>
       </properties>

From 41845269f950a57968c473f90233d30b77a905dc Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Thu, 5 Apr 2018 15:08:35 +0100
Subject: [PATCH 5/9] SPARK-23807 add the dependencies for the hadoop 3
 profile.

This includes the profile in test-dependencies.sh, so this part of the build will work: Hive doesn't need to be working to build that dependency graph.

Change-Id: I1ecfd4b1a8bea26600765b1de59f2425c42f6b03
---
 dev/deps/spark-deps-hadoop-3 | 221 +++++++++++++++++++++++++++++++++++
 dev/test-dependencies.sh     |   1 +
 2 files changed, 222 insertions(+)
 create mode 100644 dev/deps/spark-deps-hadoop-3

diff --git a/dev/deps/spark-deps-hadoop-3 b/dev/deps/spark-deps-hadoop-3
new file mode 100644
index 0000000000000..97ad65a4096cb
--- /dev/null
+++ b/dev/deps/spark-deps-hadoop-3
@@ -0,0 +1,221 @@
+HikariCP-java7-2.4.12.jar
+JavaEWAH-0.3.2.jar
+RoaringBitmap-0.5.11.jar
+ST4-4.0.4.jar
+accessors-smart-1.2.jar
+activation-1.1.1.jar
+aircompressor-0.8.jar
+antlr-2.7.7.jar
+antlr-runtime-3.4.jar
+antlr4-runtime-4.7.jar
+aopalliance-1.0.jar
+aopalliance-repackaged-2.4.0-b34.jar
+apache-log4j-extras-1.2.17.jar
+arpack_combined_all-0.1.jar
+arrow-format-0.8.0.jar
+arrow-memory-0.8.0.jar
+arrow-vector-0.8.0.jar
+automaton-1.11-8.jar
+avro-1.7.7.jar
+avro-ipc-1.7.7.jar
+avro-mapred-1.7.7-hadoop2.jar
+base64-2.3.8.jar
+bcprov-jdk15on-1.58.jar
+bonecp-0.8.0.RELEASE.jar
+breeze-macros_2.11-0.13.2.jar
+breeze_2.11-0.13.2.jar
+calcite-avatica-1.2.0-incubating.jar
+calcite-core-1.2.0-incubating.jar
+calcite-linq4j-1.2.0-incubating.jar
+chill-java-0.8.4.jar
+chill_2.11-0.8.4.jar
+commons-beanutils-1.9.3.jar
+commons-cli-1.2.jar
+commons-codec-1.10.jar
+commons-collections-3.2.2.jar
+commons-compiler-3.0.8.jar
+commons-compress-1.4.1.jar
+commons-configuration2-2.1.1.jar
+commons-crypto-1.0.0.jar
+commons-daemon-1.0.13.jar
+commons-dbcp-1.4.jar
+commons-httpclient-3.1.jar
+commons-io-2.4.jar
+commons-lang-2.6.jar
+commons-lang3-3.5.jar
+commons-logging-1.1.3.jar
+commons-math3-3.4.1.jar
+commons-net-3.1.jar
+commons-pool-1.5.4.jar
+compress-lzf-1.0.3.jar
+core-1.1.2.jar
+curator-client-2.12.0.jar
+curator-framework-2.12.0.jar
+curator-recipes-2.12.0.jar
+datanucleus-api-jdo-3.2.6.jar
+datanucleus-core-3.2.10.jar
+datanucleus-rdbms-3.2.9.jar
+derby-10.12.1.1.jar
+dnsjava-2.1.7.jar
+ehcache-3.3.1.jar
+eigenbase-properties-1.1.5.jar
+flatbuffers-1.2.0-3f79e055.jar
+generex-1.0.1.jar
+geronimo-jcache_1.0_spec-1.0-alpha-1.jar
+gson-2.2.4.jar
+guava-14.0.1.jar
+guice-4.0.jar
+guice-servlet-4.0.jar
+hadoop-annotations-3.1.0.jar
+hadoop-auth-3.1.0.jar
+hadoop-client-3.1.0.jar
+hadoop-common-3.1.0.jar
+hadoop-hdfs-client-3.1.0.jar
+hadoop-mapreduce-client-common-3.1.0.jar
+hadoop-mapreduce-client-core-3.1.0.jar
+hadoop-mapreduce-client-jobclient-3.1.0.jar
+hadoop-yarn-api-3.1.0.jar
+hadoop-yarn-client-3.1.0.jar
+hadoop-yarn-common-3.1.0.jar
+hadoop-yarn-registry-3.1.0.jar
+hadoop-yarn-server-common-3.1.0.jar
+hadoop-yarn-server-web-proxy-3.1.0.jar
+hk2-api-2.4.0-b34.jar
+hk2-locator-2.4.0-b34.jar
+hk2-utils-2.4.0-b34.jar
+hppc-0.7.2.jar
+htrace-core4-4.1.0-incubating.jar
+httpclient-4.5.4.jar
+httpcore-4.4.8.jar
+ivy-2.4.0.jar
+jackson-annotations-2.6.7.jar
+jackson-core-2.6.7.jar
+jackson-core-asl-1.9.13.jar
+jackson-databind-2.6.7.1.jar
+jackson-dataformat-yaml-2.6.7.jar
+jackson-jaxrs-base-2.7.8.jar
+jackson-jaxrs-json-provider-2.7.8.jar
+jackson-mapper-asl-1.9.13.jar
+jackson-module-jaxb-annotations-2.6.7.jar
+jackson-module-paranamer-2.7.9.jar
+jackson-module-scala_2.11-2.6.7.1.jar
+janino-3.0.8.jar
+java-xmlbuilder-1.1.jar
+javassist-3.18.1-GA.jar
+javax.annotation-api-1.2.jar
+javax.inject-1.jar
+javax.inject-2.4.0-b34.jar
+javax.servlet-api-3.1.0.jar
+javax.ws.rs-api-2.0.1.jar
+javolution-5.5.1.jar
+jaxb-api-2.2.11.jar
+jcip-annotations-1.0-1.jar
+jcl-over-slf4j-1.7.16.jar
+jdo-api-3.0.1.jar
+jersey-client-2.22.2.jar
+jersey-common-2.22.2.jar
+jersey-container-servlet-2.22.2.jar
+jersey-container-servlet-core-2.22.2.jar
+jersey-guava-2.22.2.jar
+jersey-media-jaxb-2.22.2.jar
+jersey-server-2.22.2.jar
+jets3t-0.9.4.jar
+jetty-webapp-9.3.20.v20170531.jar
+jetty-xml-9.3.20.v20170531.jar
+jline-2.12.1.jar
+joda-time-2.9.3.jar
+jodd-core-3.5.2.jar
+jpam-1.1.jar
+json-smart-2.3.jar
+json4s-ast_2.11-3.5.3.jar
+json4s-core_2.11-3.5.3.jar
+json4s-jackson_2.11-3.5.3.jar
+json4s-scalap_2.11-3.5.3.jar
+jsp-api-2.1.jar
+jsr305-1.3.9.jar
+jta-1.1.jar
+jtransforms-2.4.0.jar
+jul-to-slf4j-1.7.16.jar
+kerb-admin-1.0.1.jar
+kerb-client-1.0.1.jar
+kerb-common-1.0.1.jar
+kerb-core-1.0.1.jar
+kerb-crypto-1.0.1.jar
+kerb-identity-1.0.1.jar
+kerb-server-1.0.1.jar
+kerb-simplekdc-1.0.1.jar
+kerb-util-1.0.1.jar
+kerby-asn1-1.0.1.jar
+kerby-config-1.0.1.jar
+kerby-pkix-1.0.1.jar
+kerby-util-1.0.1.jar
+kerby-xdr-1.0.1.jar
+kryo-shaded-3.0.3.jar
+kubernetes-client-3.0.0.jar
+kubernetes-model-2.0.0.jar
+leveldbjni-all-1.8.jar
+libfb303-0.9.3.jar
+libthrift-0.9.3.jar
+log4j-1.2.17.jar
+logging-interceptor-3.8.1.jar
+lz4-java-1.4.0.jar
+machinist_2.11-0.6.1.jar
+macro-compat_2.11-1.1.1.jar
+mesos-1.4.0-shaded-protobuf.jar
+metrics-core-3.1.5.jar
+metrics-graphite-3.1.5.jar
+metrics-json-3.1.5.jar
+metrics-jvm-3.1.5.jar
+minlog-1.3.0.jar
+mssql-jdbc-6.2.1.jre7.jar
+netty-3.9.9.Final.jar
+netty-all-4.1.17.Final.jar
+nimbus-jose-jwt-4.41.1.jar
+objenesis-2.1.jar
+okhttp-2.7.5.jar
+okhttp-3.8.1.jar
+okio-1.13.0.jar
+opencsv-2.3.jar
+orc-core-1.4.3-nohive.jar
+orc-mapreduce-1.4.3-nohive.jar
+oro-2.0.8.jar
+osgi-resource-locator-1.0.1.jar
+paranamer-2.8.jar
+parquet-column-1.8.2.jar
+parquet-common-1.8.2.jar
+parquet-encoding-1.8.2.jar
+parquet-format-2.3.1.jar
+parquet-hadoop-1.8.2.jar
+parquet-hadoop-bundle-1.6.0.jar
+parquet-jackson-1.8.2.jar
+protobuf-java-2.5.0.jar
+py4j-0.10.6.jar
+pyrolite-4.13.jar
+re2j-1.1.jar
+scala-compiler-2.11.8.jar
+scala-library-2.11.8.jar
+scala-parser-combinators_2.11-1.0.4.jar
+scala-reflect-2.11.8.jar
+scala-xml_2.11-1.0.5.jar
+shapeless_2.11-2.3.2.jar
+slf4j-api-1.7.16.jar
+slf4j-log4j12-1.7.16.jar
+snakeyaml-1.15.jar
+snappy-0.2.jar
+snappy-java-1.1.7.1.jar
+spire-macros_2.11-0.13.0.jar
+spire_2.11-0.13.0.jar
+stax-api-1.0.1.jar
+stax2-api-3.1.4.jar
+stream-2.7.0.jar
+stringtemplate-3.2.1.jar
+super-csv-2.2.0.jar
+token-provider-1.0.1.jar
+univocity-parsers-2.5.9.jar
+validation-api-1.1.0.Final.jar
+woodstox-core-5.0.3.jar
+xbean-asm5-shaded-4.4.jar
+xz-1.0.jar
+zjsonpatch-0.3.0.jar
+zookeeper-3.4.9.jar
+zstd-jni-1.3.2-2.jar
diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh
index 3bf7618e1ea96..0b88a337f5471 100755
--- a/dev/test-dependencies.sh
+++ b/dev/test-dependencies.sh
@@ -34,6 +34,7 @@ MVN="build/mvn"
 HADOOP_PROFILES=(
     hadoop-2.6
     hadoop-2.7
+    hadoop-3
 )
 
 # We'll switch the version to a temp. one, publish POMs using that new version, then switch back to

From 7c93d98aae8d74e0f0606cb03e68b0ac94bde177 Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Thu, 5 Apr 2018 18:52:26 +0100
Subject: [PATCH 6/9] remove hadoop-3 as a profile to do a dependency check on,
 as hadoop 3.1 is still in staging

Change-Id: Id2d5655088b2a8c2bdec43f7d17110a513be3f7c
---
 dev/test-dependencies.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh
index 0b88a337f5471..3bf7618e1ea96 100755
--- a/dev/test-dependencies.sh
+++ b/dev/test-dependencies.sh
@@ -34,7 +34,6 @@ MVN="build/mvn"
 HADOOP_PROFILES=(
     hadoop-2.6
     hadoop-2.7
-    hadoop-3
 )
 
 # We'll switch the version to a temp. one, publish POMs using that new version, then switch back to

From 036d92a0973276d9e583a3e6df58b60c2e5a64ad Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Mon, 9 Apr 2018 13:31:55 +0100
Subject: [PATCH 7/9] Revert "remove hadoop-3 as a profile to do a dependency
 check on, as hadoop 3.1 is still in staging"

This reverts commit 7c93d98aae8d74e0f0606cb03e68b0ac94bde177.
---
 dev/test-dependencies.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh
index 3bf7618e1ea96..0b88a337f5471 100755
--- a/dev/test-dependencies.sh
+++ b/dev/test-dependencies.sh
@@ -34,6 +34,7 @@ MVN="build/mvn"
 HADOOP_PROFILES=(
     hadoop-2.6
     hadoop-2.7
+    hadoop-3
 )
 
 # We'll switch the version to a temp. one, publish POMs using that new version, then switch back to

From 52a8c28c564f669aa2cb2998b471f6085fb0742b Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Mon, 9 Apr 2018 14:13:29 +0100
Subject: [PATCH 8/9] SPARK-23807 Hadoop 3.1.0 is shipping: profile =>
 "hadoop-3.1" and test-dependencies.sh knows about it

Change-Id: Ie4906e2f41e9992e803674dce283f03b4dbab67e
---
 dev/deps/{spark-deps-hadoop-3 => spark-deps-hadoop-3.1} | 0
 dev/test-dependencies.sh                                | 2 +-
 hadoop-cloud/pom.xml                                    | 2 +-
 pom.xml                                                 | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename dev/deps/{spark-deps-hadoop-3 => spark-deps-hadoop-3.1} (100%)

diff --git a/dev/deps/spark-deps-hadoop-3 b/dev/deps/spark-deps-hadoop-3.1
similarity index 100%
rename from dev/deps/spark-deps-hadoop-3
rename to dev/deps/spark-deps-hadoop-3.1
diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh
index 0b88a337f5471..2fbd6b5e98f7f 100755
--- a/dev/test-dependencies.sh
+++ b/dev/test-dependencies.sh
@@ -34,7 +34,7 @@ MVN="build/mvn"
 HADOOP_PROFILES=(
     hadoop-2.6
     hadoop-2.7
-    hadoop-3
+    hadoop-3.1
 )
 
 # We'll switch the version to a temp. one, publish POMs using that new version, then switch back to
diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
index 48187c7a6ff3d..5dee31fdea4bb 100644
--- a/hadoop-cloud/pom.xml
+++ b/hadoop-cloud/pom.xml
@@ -210,7 +210,7 @@
      enables store-specific committers.
     -->
     <profile>
-      <id>hadoop-3</id>
+      <id>hadoop-3.1</id>
       <dependencies>
         <!--
         There's now a hadoop-cloud-storage which transitively pulls in the store JARs,
diff --git a/pom.xml b/pom.xml
index 9dff90ca2ab49..88e77ff874748 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2672,7 +2672,7 @@
     </profile>
 
     <profile>
-      <id>hadoop-3</id>
+      <id>hadoop-3.1</id>
       <properties>
         <hadoop.version>3.1.0</hadoop.version>
         <curator.version>2.12.0</curator.version>

From f6b9dc83d56c20d887166ddba7a7b876a57d65cb Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Thu, 12 Apr 2018 20:22:24 +0100
Subject: [PATCH 9/9] SPARK-23807 unshaded jetty dependency fixup needed for
 Azure wasb://

jetty-util and jetty-util-ajax are forced into the dist/jars directory by
explicitly declaring them in the relevant POMs with the ${hadoop.deps.scope} scope.

Without this they weren't coming in as spark-assembly was seeing jetty-util marked
as provided. It's not needed for the spark-* JARs, which all use the shaded reference,
but it is needed indirectly via hadoop-azure. This change to the poms reinstates it.

Maven has proven surprisingly "fussy" here; the implication is that its "closest declaration wins"
resolution policy doesn't just control versions: it also influences scoping.

Change-Id: I081023cae84236c925fad4e94168f1dac5a8026a
---
 assembly/pom.xml     |  8 ++++++++
 hadoop-cloud/pom.xml | 18 ++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/assembly/pom.xml b/assembly/pom.xml
index a207dae5a74ff..9608c96fd5369 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -254,6 +254,14 @@
           <artifactId>spark-hadoop-cloud_${scala.binary.version}</artifactId>
           <version>${project.version}</version>
         </dependency>
+        <!--
+        Redeclare this dependency to force it into the distribution.
+        -->
+        <dependency>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
       </dependencies>
     </profile>
   </profiles>
diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
index 5dee31fdea4bb..2c39a7df0146e 100644
--- a/hadoop-cloud/pom.xml
+++ b/hadoop-cloud/pom.xml
@@ -240,6 +240,24 @@
             </exclusion>
           </exclusions>
         </dependency>
+        <!--
+        The jetty declarations are made
+        (a) to keep the jetty-util-ajax version in sync with the rest of Spark.
+        (b) to minimise the effects which Spark's jetty shading has on the
+            availability of the jetty JARs for hadoop-azure, which depends
+            on them.
+         -->
+        <dependency>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-util-ajax</artifactId>
+          <version>${jetty.version}</version>
+          <scope>${hadoop.deps.scope}</scope>
+        </dependency>
       </dependencies>
     </profile>