Coverage for src/fast_minimum_variance/proximal.py: 100%

58 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-02 13:28 +0000

1"""Fast solver for 0.5 ||mat @ x - vec||^2 s. t. {x >= 0, sum(x) = 1}. 

2 

3This module implements proximal gradient descent for constrained linear least squares 

4optimization on the probability simplex. The algorithm is based on iterative projection 

5using the efficient simplex projection from Duchi et al. (2008). 

6 

7The gradient is evaluated matrix-free as mat.T @ (mat @ w), avoiding explicit assembly 

8of the n x n normal matrix mat.T @ mat. The Lipschitz constant is estimated via power 

9iteration, also matrix-free. 

10 

11References: 

12---------- 

13Duchi, J., Shalev-Shwartz, S., Singer, Y., & Chandra, T. (2008). 

14"Efficient Projections onto the l1-Ball for Learning in High Dimensions." 

15Proceedings of the 25th International Conference on Machine Learning (ICML). 

16""" 

17 

18from __future__ import annotations 

19 

20from typing import TYPE_CHECKING, cast 

21 

22import numpy as np 

23from cvx.linalg import power_iteration 

24 

25if TYPE_CHECKING: 

26 from collections.abc import Callable 

27 

28 from numpy.typing import NDArray 

29 

30 

31def proj_simplex( 

32 vec: NDArray[np.floating], 

33 rad: float = 1.0, 

34) -> NDArray[np.floating]: 

35 """Project a vector onto the probability simplex. 

36 

37 This function computes the Euclidean projection of a given vector onto the probability 

38 simplex. The simplex is defined as the set of non-negative vectors that sum to a 

39 given radius, typically 1. The projection ensures that the resulting vector satisfies 

40 these constraints. 

41 

42 The algorithm is based on Duchi et al. (2008) "Efficient Projections onto the 

43 l1-Ball for Learning in High Dimensions". 

44 

45 Parameters 

46 ---------- 

47 vec : NDArray[np.floating] 

48 Input vector that is to be projected onto the simplex. 

49 rad : float, optional 

50 Radius of the simplex. The projected vector will have components summing 

51 to this value. Default is 1.0. 

52 

53 Returns: 

54 ------- 

55 NDArray[np.floating] 

56 The projected vector that lies on the probability simplex. 

57 

58 Raises: 

59 ------ 

60 ValueError 

61 If the input vector is empty. 

62 

63 Examples: 

64 -------- 

65 >>> import numpy as np 

66 >>> vec = np.array([1.0, 2.0, 3.0]) 

67 >>> result = proj_simplex(vec) 

68 >>> bool(np.isclose(result.sum(), 1.0)) 

69 True 

70 >>> bool(np.all(result >= 0)) 

71 True 

72 

73 """ 

74 muu = np.sort(vec)[::-1] 

75 cummeans = 1 / np.arange(1, len(vec) + 1) * (np.cumsum(muu) - rad) 

76 rho = max(np.where(muu > cummeans)[0]) 

77 result: NDArray[np.floating] = np.maximum(vec - cummeans[rho], 0) 

78 return result 

79 

80 

81def _lipschitz( 

82 mat: NDArray[np.floating], 

83 extra_matvec: Callable[[NDArray[np.floating]], NDArray[np.floating]] | None = None, 

84 n_iter: int = 30, 

85 rng: np.random.Generator | None = None, 

86) -> float: 

87 """Estimate lambda_max(mat.T @ mat + extra) via power iteration (matrix-free). 

88 

89 extra_matvec: optional callable v -> extra @ v for a second SPD contribution. 

90 Each iteration costs O(rows * cols) — two matrix-vector products with mat — 

91 and never forms the cols x cols normal matrix. The iteration is delegated to 

92 cvx-linalg's operator-aware power_iteration, applied matrix-free to the normal 

93 operator v -> mat.T @ (mat @ v) (+ extra). 

94 """ 

95 seed = None if rng is None else int(rng.integers(np.iinfo(np.int64).max)) 

96 

97 def normal_matvec(v: NDArray[np.floating]) -> NDArray[np.floating]: 

98 """Apply the normal operator mat.T @ mat (+ extra) to v, matrix-free.""" 

99 w = mat.T @ (mat @ v) 

100 if extra_matvec is not None: 

101 w = w + extra_matvec(v) 

102 return w 

103 

104 matvec = cast("Callable[[NDArray[np.float64]], NDArray[np.float64]]", normal_matvec) 

105 eigenvalue, _ = power_iteration(matvec, n=mat.shape[1], n_iter=n_iter, seed=seed) 

106 return max(float(eigenvalue), 0.0) 

107 

108 

109def fista_gradient( 

110 mat: NDArray[np.floating], 

111 vec: NDArray[np.floating], 

112 *, 

113 extra_grad: Callable[[NDArray[np.floating]], NDArray[np.floating]] | None = None, 

114 eps_rel: float = 1e-6, 

115 max_iter: int = 100000, 

116) -> tuple[NDArray[np.floating], int]: 

117 r"""FISTA (Nesterov-accelerated proximal gradient) on the probability simplex. 

118 

119 Same interface as ``prox_gradient`` but uses the Beck-Teboulle momentum 

120 sequence $t_{k+1} = (1 + \\sqrt{1+4t_k^2})/2$ to achieve $O(1/k^2)$ 

121 convergence for convex objectives (versus $O(1/k)$ for plain gradient 

122 descent). For strongly convex $f$ with condition number $\\kappa$ the 

123 linear convergence rate is $(1 - 1/\\sqrt{\\kappa})^k$, matching CG's 

124 asymptotic iteration count. 

125 

126 The gradient is evaluated at the extrapolated point $y_k$; the simplex 

127 projection is applied to obtain $x_k$; the momentum update then forms 

128 $y_{k+1} = x_k + \\frac{t_k-1}{t_{k+1}}(x_k - x_{k-1})$. 

129 

130 References: 

131 ---------- 

132 Beck, A., & Teboulle, M. (2009). "A Fast Iterative Shrinkage-Thresholding 

133 Algorithm for Linear Inverse Problems." SIAM Journal on Imaging Sciences. 

134 

135 Examples: 

136 -------- 

137 >>> import numpy as np 

138 >>> mat = np.array([[1.0, 0.5], [0.5, 1.0]]) 

139 >>> vec = np.ones(2) 

140 >>> result, _ = fista_gradient(mat, vec) 

141 >>> bool(np.isclose(result.sum(), 1.0)) 

142 True 

143 

144 """ 

145 rng = np.random.default_rng() 

146 lip = _lipschitz(mat, extra_matvec=extra_grad, rng=rng) 

147 step = 1.0 / lip if lip > 1e-15 else 1.0 

148 out_prod = mat.T @ vec 

149 

150 x = proj_simplex(np.asarray(rng.standard_normal(mat.shape[1]))) 

151 y = x.copy() 

152 t = 1.0 

153 

154 for ite in range(1, max_iter + 1): # noqa: B007 

155 grad = mat.T @ (mat @ y) - out_prod 

156 if extra_grad is not None: 

157 grad = grad + extra_grad(y) 

158 x_new = proj_simplex(y - step * grad) 

159 

160 t_new = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t * t)) 

161 y = x_new + ((t - 1.0) / t_new) * (x_new - x) 

162 

163 err = float(np.linalg.norm(x - x_new)) 

164 x = x_new 

165 t = t_new 

166 

167 if err < eps_rel: 

168 break 

169 

170 return x, ite 

171 

172 

173def prox_gradient( 

174 mat: NDArray[np.floating], 

175 vec: NDArray[np.floating], 

176 *, 

177 extra_grad: Callable[[NDArray[np.floating]], NDArray[np.floating]] | None = None, 

178 eps_rel: float = 1e-6, 

179 max_iter: int = 100000, 

180) -> tuple[NDArray[np.floating], int]: 

181 """Perform proximal gradient descent to solve a constrained optimization problem. 

182 

183 Solves the optimization problem: 

184 minimize 0.5 ||mat @ x - vec||^2 + g(x) 

185 subject to x >= 0, sum(x) = 1 

186 

187 where g captures an optional extra gradient term supplied via ``extra_grad``. 

188 The gradient is evaluated matrix-free at each step; the normal matrix 

189 mat.T @ mat is never formed. The Lipschitz constant is estimated once via 

190 power iteration at O(n_power_iter * rows * cols) setup cost. 

191 

192 Parameters 

193 ---------- 

194 mat : NDArray[np.floating] 

195 A matrix of shape (n_samples, n_features). 

196 vec : NDArray[np.floating] 

197 A vector of shape (n_samples,). 

198 extra_grad : callable, optional 

199 v -> additional gradient term (e.g. ``alpha * target @ v`` for 

200 Ledoit-Wolf shrinkage). Must be SPD for convergence guarantees. 

201 When provided, the Lipschitz estimate accounts for this term. 

202 eps_rel : float, optional 

203 Relative step-size change stopping tolerance. Default is 1e-6. 

204 max_iter : int, optional 

205 Maximum number of iterations. Default is 100000. 

206 

207 Returns: 

208 ------- 

209 tuple[NDArray[np.floating], int] 

210 ``(w, n_iters)`` — weight vector of shape (n_features,) and the 

211 number of gradient steps taken. 

212 

213 Examples: 

214 -------- 

215 >>> import numpy as np 

216 >>> mat = np.array([[1.0, 0.5], [0.5, 1.0]]) 

217 >>> vec = np.ones(2) 

218 >>> result, _ = prox_gradient(mat, vec) 

219 >>> bool(np.isclose(result.sum(), 1.0)) 

220 True 

221 

222 """ 

223 rng = np.random.default_rng() 

224 prim_var: NDArray[np.floating] = np.asarray(rng.standard_normal(size=mat.shape[1])) 

225 lip = _lipschitz(mat, extra_matvec=extra_grad, rng=rng) 

226 step = 1.0 / lip if lip > 1e-15 else 1.0 

227 

228 # Precompute mat.T @ vec once; zero for minimum-variance (vec = 0). 

229 out_prod = mat.T @ vec 

230 ite = 0 

231 err_rel = eps_rel + 1 

232 while err_rel > eps_rel and ite < max_iter: 

233 grad = mat.T @ (mat @ prim_var) - out_prod 

234 if extra_grad is not None: 

235 grad = grad + extra_grad(prim_var) 

236 prim_var_new = proj_simplex(prim_var - step * grad) 

237 err_rel = float(np.linalg.norm(prim_var - prim_var_new, 2)) 

238 prim_var = prim_var_new.copy() 

239 ite += 1 

240 return prim_var, ite