//! Gradient Descent Module
//!
//! Provides gradient descent step algorithms: plain gradient descent,
//! gradient descent with momentum, and RMSProp.

use num::Float;

use util::GradientDescAlgo;

/// The simplest possible gradient descent algorithm.
///
/// The step is just `gradient * learning_rate`.
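///
/// # Examples
///
/// A minimal usage sketch; marked `ignore` because the import path of the
/// `GradientDescAlgo` trait depends on how this crate is laid out.
///
/// ```ignore
/// use util::GradientDescAlgo;
///
/// let mut gd = GradientDesc;
/// // One step with learning rate 0.1; negate it for descent if needed.
/// let step = gd.calculate(vec![1.0f64, -2.0], 0.1);
/// assert_eq!(step, vec![0.1, -0.2]);
/// ```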
#[derive(Clone, Copy, Debug)]
pub struct GradientDesc;

// It is up to the client to negate the step if needed
impl<F: Float> GradientDescAlgo<F> for GradientDesc {
	fn calculate(&mut self, mut grad: Vec<F>, lr: F) -> Vec<F> {
		for x in &mut grad {
			*x = *x * lr;
		}
		grad
	}
}

/// Basic gradient descent with momentum
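///
/// The returned step is a running velocity: `v = momentum * v + lr * grad`.
///
/// # Examples
///
/// A minimal usage sketch; marked `ignore` because the import path of the
/// `GradientDescAlgo` trait depends on how this crate is laid out.
///
/// ```ignore
/// use util::GradientDescAlgo;
///
/// let mut gd = GradDescMomentum::new(0.9f64);
/// let first = gd.calculate(vec![1.0], 0.1);  // v = 0.1 * 1.0
/// let second = gd.calculate(vec![1.0], 0.1); // v = 0.9 * 0.1 + 0.1 * 1.0
/// assert!((second[0] - 0.19).abs() < 1e-12);
/// ```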
#[derive(Clone, Debug)]
pub struct GradDescMomentum<F: Float> {
	/// Fraction of the previous velocity retained in each step
	momentum: F,
	/// Running velocity, initialized lazily on the first call
	cache: Option<Vec<F>>,
}

impl<F: Float> GradientDescAlgo<F> for GradDescMomentum<F> {
	fn calculate(&mut self, grad: Vec<F>, lr: F) -> Vec<F> {
		if self.cache.is_none() {
			self.cache = Some(vec![F::zero(); grad.len()]);
		}

		let momentum = self.momentum;
		let cache = self.cache.as_mut().unwrap();

		// Update the velocity in place: v = momentum * v + lr * grad
		for (v, g) in cache.iter_mut().zip(grad.into_iter()) {
			*v = momentum * *v + lr * g;
		}

		cache.clone()
	}
}

impl Default for GradDescMomentum<f64> {
	fn default() -> GradDescMomentum<f64> {
		GradDescMomentum {
			momentum: 0.9,
			cache: None
		}
	}
}

impl<F: Float> GradDescMomentum<F> {
	/// Creates a new GradDescMomentum
	pub fn new(momentum: F) -> GradDescMomentum<F> {
		GradDescMomentum {
			momentum: momentum,
			cache: None
		}
	}
}

/// The RMSProp algorithm (Hinton et al. 2012).
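///
/// Keeps a decaying average of squared gradients and divides each gradient
/// component by the square root of that average:
///
/// ```text
/// cache = decay_rate * cache + (1 - decay_rate) * grad^2
/// step  = lr * grad / (sqrt(cache) + epsilon)
/// ```
///
/// # Examples
///
/// A minimal usage sketch; marked `ignore` because the import path of the
/// `GradientDescAlgo` trait depends on how this crate is laid out.
///
/// ```ignore
/// use util::GradientDescAlgo;
///
/// let mut rmsprop = RMSProp::new(0.9f64, 1e-5);
/// let step = rmsprop.calculate(vec![2.0], 0.1);
/// // First step: 0.1 * 2.0 / (sqrt(0.4) + 1e-5)
/// assert!(step[0] > 0.0);
/// ```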
#[derive(Debug, Clone)]
pub struct RMSProp<F: Float> {
	/// Rate at which the running average of squared gradients decays
	decay_rate: F,
	/// Small value used to avoid division by zero
	epsilon: F,
	/// Running average of squared gradients, initialized lazily on the first call
	cache: Option<Vec<F>>,
}

impl<F: Float> GradientDescAlgo<F> for RMSProp<F> {
	fn calculate(&mut self, grad: Vec<F>, lr: F) -> Vec<F> {
		if self.cache.is_none() {
			self.cache = Some(vec![F::zero(); grad.len()]);
		}

		let decay_rate = self.decay_rate;
		let epsilon = self.epsilon;
		let cache = self.cache.as_mut().unwrap();

		// Update the running average of squared gradients:
		// cache = decay_rate * cache + (1 - decay_rate) * grad^2
		for (c, &g) in cache.iter_mut().zip(grad.iter()) {
			*c = decay_rate * *c + g * g * (F::one() - decay_rate);
		}

		// Scale each component by the inverse root-mean-square of its history
		cache.iter()
			.zip(grad.into_iter())
			.map(|(&c, g)| g * lr / (c.sqrt() + epsilon))
			.collect()
	}
}

impl Default for RMSProp<f64> {
	fn default() -> RMSProp<f64> {
		RMSProp {
			decay_rate: 0.90,
			epsilon: 0.00001,
			cache: None
		}
	}
}

impl<F: Float> RMSProp<F> {
	/// Creates a new RMSProp
	pub fn new(decay_rate: F, epsilon: F) -> RMSProp<F> {
		RMSProp {
			decay_rate: decay_rate,
			epsilon: epsilon,
			cache: None
		}
	}
}
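
#[cfg(test)]
mod tests {
	// A small usage sketch for the algorithms above. Assumes the
	// `GradientDescAlgo` trait lives at `util::GradientDescAlgo`, as imported
	// by this module.
	use super::*;
	use util::GradientDescAlgo;

	#[test]
	fn gradient_desc_scales_by_learning_rate() {
		let mut gd = GradientDesc;
		let step = gd.calculate(vec![1.0f64, -2.0, 0.5], 0.1);
		assert!((step[0] - 0.1).abs() < 1e-12);
		assert!((step[1] + 0.2).abs() < 1e-12);
		assert!((step[2] - 0.05).abs() < 1e-12);
	}

	#[test]
	fn momentum_accumulates_velocity() {
		let mut gd = GradDescMomentum::new(0.9f64);
		// First step: v = 0.1 * 1.0; second step: v = 0.9 * 0.1 + 0.1 * 1.0
		let first = gd.calculate(vec![1.0], 0.1);
		let second = gd.calculate(vec![1.0], 0.1);
		assert!((first[0] - 0.1).abs() < 1e-12);
		assert!((second[0] - 0.19).abs() < 1e-12);
	}

	#[test]
	fn rmsprop_first_step_matches_formula() {
		let mut gd = RMSProp::new(0.9f64, 1e-5);
		let step = gd.calculate(vec![2.0], 0.1);
		// cache = (1 - 0.9) * 2.0^2 = 0.4; step = 0.1 * 2.0 / (sqrt(0.4) + 1e-5)
		let expected = 0.1 * 2.0 / (0.4f64.sqrt() + 1e-5);
		assert!((step[0] - expected).abs() < 1e-9);
	}
}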