This repository was archived by the owner on Jul 1, 2023. It is now read-only.

Commit ce67f8d

Remove explicit differentiation parameters. They are no longer required! (#33)
We've recently changed the type checker to improve the ergonomics of the `@differentiable` attribute.

* swiftlang/swift#22915: Explicit differentiation parameters are no longer required in a `@differentiable` attribute when the function has arguments that do not conform to `Differentiable`. Non-differentiable parameters are skipped, and the function is differentiated with respect to the remaining parameters.
* swiftlang/swift#22877: On an instance method, when `wrt:` is not specified, `self` is implicitly included as a differentiation parameter.
* swiftlang/swift#22877: When a `@differentiable` requirement is not met, the fix-it now shows the attribute exactly as written in the original declaration rather than in its most verbose, canonical form: for instance, `@differentiable` instead of `@differentiable(wrt: (x))`.

This greatly simplifies libraries and applications that use automatic differentiation. The protocol requirement `Layer.applied(to:in:)` becomes as simple as this:

```swift
@differentiable
func applied(to input: Input, in context: Context) -> Output
```

This PR updates the deep learning APIs to use the simplest form of `@differentiable` possible. Hooray!
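Concretely, the attribute shrinks as shown below. This before/after is excerpted from the README change in this commit; the `wrt:` parameter list is now inferred rather than spelled out.

```swift
// Before: every differentiation parameter had to be listed explicitly.
@differentiable(wrt: (self, input))
func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float>

// After: `context` is skipped because `Context` does not conform to
// `Differentiable`, and `self` is implicitly a differentiation parameter.
@differentiable
func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float>
```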
1 parent 3e8d86e commit ce67f8d

File tree: 4 files changed (+22, -22 lines)

README.md
Sources/DeepLearning/Layer.swift
Tests/DeepLearningTests/SequentialTests.swift
Tests/DeepLearningTests/TrivialModelTests.swift


README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -28,7 +28,7 @@ struct Model: Layer {
     var layer2 = Dense<Float>(inputSize: hiddenSize, outputSize: hiddenSize, activation: relu)
     var layer3 = Dense<Float>(inputSize: hiddenSize, outputSize: 3, activation: identity)
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
         return input.sequenced(in: context, through: layer1, layer2, layer3)
     }
```

Sources/DeepLearning/Layer.swift

Lines changed: 19 additions & 19 deletions
```diff
@@ -63,7 +63,7 @@ public protocol Layer: Differentiable & KeyPathIterable
     /// - context: The contextual informance for the layer application, e.g. the current learning
     ///   phase.
     /// - Returns: The output.
-    @differentiable(wrt: (self, input))
+    @differentiable
     func applied(to input: Input, in context: Context) -> Output
 }
 
@@ -78,7 +78,7 @@ public extension Layer {
     ///
     /// - Parameter input: The input to the layer.
     /// - Returns: The inference output.
-    @differentiable(wrt: (self, input))
+    @differentiable
     func inferring(from input: Input) -> Output {
         let context = Context(learningPhase: .inference)
         return applied(to: input, in: context)
@@ -104,7 +104,7 @@ public extension Layer {
 
 /// Adds helpers for standard feed-forward, sequential models.
 public extension Differentiable {
-    @differentiable(wrt: (self, l1, l2))
+    @differentiable
     func sequenced<L1: Layer, L2: Layer>(
         in context: Context, through l1: L1, _ l2: L2)
         -> L2.Output
@@ -114,7 +114,7 @@ public extension Differentiable {
         return l2.applied(to: o1, in: context)
     }
 
-    @differentiable(wrt: (self, l1, l2, l3))
+    @differentiable
     func sequenced<L1: Layer, L2: Layer, L3: Layer>(
         in context: Context, through l1: L1, _ l2: L2, _ l3: L3)
         -> L3.Output
@@ -126,7 +126,7 @@ public extension Differentiable {
         return l3.applied(to: o2, in: context)
     }
 
-    @differentiable(wrt: (self, l1, l2, l3, l4))
+    @differentiable
     func sequenced<L1: Layer, L2: Layer, L3: Layer, L4: Layer>(
         in context: Context, through l1: L1, _ l2: L2, _ l3: L3, _ l4: L4)
         -> L4.Output
@@ -140,7 +140,7 @@ public extension Differentiable {
         return l4.applied(to: o3, in: context)
     }
 
-    @differentiable(wrt: (self, l1, l2, l3, l4, l5))
+    @differentiable
     func sequenced<L1: Layer, L2: Layer, L3: Layer, L4: Layer, L5: Layer>(
         in context: Context, through l1: L1, _ l2: L2, _ l3: L3, _ l4: L4, _ l5: L5)
         -> L5.Output
@@ -156,7 +156,7 @@ public extension Differentiable {
         return l5.applied(to: o4, in: context)
     }
 
-    @differentiable(wrt: (self, l1, l2, l3, l4, l5, l6))
+    @differentiable
     func sequenced<L1: Layer, L2: Layer, L3: Layer, L4: Layer, L5: Layer, L6: Layer>(
         in context: Context, through l1: L1, _ l2: L2, _ l3: L3, _ l4: L4, _ l5: L5, _ l6: L6)
         -> L6.Output
@@ -196,7 +196,7 @@ public struct Dense<Scalar: TensorFlowFloatingPoint>: Layer {
     public typealias Activation = @differentiable (Tensor<Scalar>) -> Tensor<Scalar>
     @noDerivative public let activation: Activation
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     public func applied(to input: Tensor<Scalar>, in _: Context) -> Tensor<Scalar> {
         return activation(matmul(input, weight) + bias)
     }
@@ -230,7 +230,7 @@ public struct Conv2D<Scalar: TensorFlowFloatingPoint>: Layer {
     @noDerivative public let strides: (Int32, Int32)
     @noDerivative public let padding: Padding
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     public func applied(to input: Tensor<Scalar>, in _: Context) -> Tensor<Scalar> {
         return activation(input.convolved2D(withFilter: filter,
                                             strides: (1, strides.0, strides.1, 1),
@@ -286,7 +286,7 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
     /// The running variance.
     @noDerivative public let runningVariance: Parameter<Scalar>
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     private func applyingTraining(to input: Tensor<Scalar>) -> Tensor<Scalar> {
         let positiveAxis = (input.rank + axis) % input.rank
         let mean = input.mean(alongAxes: [0, positiveAxis])
@@ -298,13 +298,13 @@ public struct BatchNorm<Scalar: TensorFlowFloatingPoint>: Layer {
         return (input - mean) * inv + offset
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     private func applyingInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
         let inv = rsqrt(runningVariance.value + epsilon) * scale
         return (input - runningMean.value) * inv + offset
     }
 
-    @differentiable(wrt: (self, input), vjp: _vjpApplied(to:in:))
+    @differentiable(vjp: _vjpApplied(to:in:))
     public func applied(to input: Tensor<Scalar>, in context: Context) -> Tensor<Scalar> {
         switch context.learningPhase {
         case .training:
@@ -360,7 +360,7 @@ public struct MaxPool2D<Scalar: TensorFlowFloatingPoint>: Layer {
         self.padding = padding
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     public func applied(to input: Tensor<Scalar>, in _: Context) -> Tensor<Scalar> {
         return input.maxPooled(
             kernelSize: poolSize, strides: strides, padding: padding)
@@ -383,7 +383,7 @@ public struct AvgPool2D<Scalar: TensorFlowFloatingPoint>: Layer {
         self.padding = padding
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
    public func applied(to input: Tensor<Scalar>, in _: Context) -> Tensor<Scalar> {
         return input.averagePooled(
             kernelSize: poolSize, strides: strides, padding: padding)
@@ -410,7 +410,7 @@ public struct LayerNorm<Scalar: TensorFlowFloatingPoint>: Layer {
         self.epsilon = epsilon
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     public func applied(to input: Tensor<Scalar>, in _: Context) -> Tensor<Scalar> {
         let mean = input.mean(alongAxes: axis)
         let variance = input.variance(alongAxes: axis)
@@ -439,17 +439,17 @@ public struct Dropout<Scalar: TensorFlowFloatingPoint>: Layer
         self.probability = probability
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     private func applyingTraining(to input: Tensor<Scalar>) -> Tensor<Scalar> {
         return input.droppingOut(probability: probability)
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     private func applyingInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
         return input
     }
 
-    @differentiable(wrt: (self, input), vjp: _vjpApplied(to:in:))
+    @differentiable(vjp: _vjpApplied(to:in:))
     public func applied(to input: Tensor<Scalar>, in context: Context) -> Tensor<Scalar> {
         switch context.learningPhase {
         case .training:
@@ -484,7 +484,7 @@ public struct UpSampling2D<Scalar: TensorFlowFloatingPoint>: Layer {
         self.size = size
     }
 
-    @differentiable(wrt: (self, input))
+    @differentiable
     public func applied(to input: Tensor<Scalar>, in _: Context) -> Tensor<Scalar> {
         let shape = input.shape
         let (batchSize, height, width, channels) = (shape[0], shape[1], shape[2], shape[3])
```
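For reference, call sites are untouched by this commit; only the declarations above lose their `wrt:` lists. Below is a minimal usage sketch against the updated `Layer` API. The import names, the layer sizes, and the zero-filled input are assumptions chosen for illustration; the `Dense` initializer, `Context(learningPhase:)`, `applied(to:in:)`, and `inferring(from:)` all appear in the diff above.

```swift
import TensorFlow
import DeepLearning  // assumed module name for this package

// Build a layer and a placeholder input (values are arbitrary).
let dense = Dense<Float>(inputSize: 4, outputSize: 1, activation: relu)
let input = Tensor<Float>(zeros: [1, 4])

// `applied(to:in:)` is still the differentiable entry point; its declaration
// simply no longer needs an explicit `wrt:` list.
let context = Context(learningPhase: .inference)
let output = dense.applied(to: input, in: context)

// `inferring(from:)` (declared above) constructs the inference context itself.
let sameOutput = dense.inferring(from: input)
```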

Tests/DeepLearningTests/SequentialTests.swift

Lines changed: 1 addition & 1 deletion
```diff
@@ -21,7 +21,7 @@ final class SequentialTests: XCTestCase {
             var dense1 = Dense<Float>(inputSize: 2, outputSize: 4, activation: relu)
             var dense2 = Dense<Float>(inputSize: 4, outputSize: 1, activation: relu)
 
-            @differentiable(wrt: (self, input))
+            @differentiable
             func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
                 return input.sequenced(in: context, through: dense1, dense2)
             }
```

Tests/DeepLearningTests/TrivialModelTests.swift

Lines changed: 1 addition & 1 deletion
```diff
@@ -34,7 +34,7 @@ final class TrivialModelTests: XCTestCase {
                     generator: &Classifier.generator
                 )
             }
-            @differentiable(wrt: (self, input))
+            @differentiable
             func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
                 let h1 = l1.applied(to: input, in: context)
                 return l2.applied(to: h1, in: context)
```
