Coverage for src/cvx/linalg/covariance/pca.py: 100%

25 statements  

« prev     ^ index     » next       coverage.py v7.15.0, created at 2026-07-03 18:56 +0000

1"""PCA analysis (pure NumPy implementation). 

2 

3This module provides Principal Component Analysis (PCA) for dimensionality 

4reduction of return data. PCA is commonly used to construct factor models 

5for portfolio optimization. 

6 

7Example: 

8 Perform PCA on stock returns: 

9 

10 >>> import numpy as np 

11 >>> from cvx.linalg import pca 

12 >>> np.random.seed(42) 

13 >>> returns = np.random.randn(100, 5) 

14 >>> result = pca(returns, n_components=3) 

15 >>> len(result.explained_variance) 

16 3 

17 >>> result.factors.shape 

18 (100, 3) 

19 >>> result.exposure.shape 

20 (3, 5) 

21 

22""" 

23 

24from __future__ import annotations 

25 

26from collections import namedtuple 

27 

28import numpy as np 

29 

30from ..core.exceptions import InvalidComponentsError 

31from ..core.types import Matrix 

32from ..decomposition.svd import svd 

33 

34PCA = namedtuple( 

35 "PCA", 

36 ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"], 

37) 

38"""Named tuple containing the results of PCA analysis. 

39 

40Attributes: 

41 explained_variance: Explained variance ratio for each component. 

42 Shape (n_components,). 

43 factors: Factor returns (principal components). Shape (n_samples, n_components). 

44 exposure: Factor exposures (loadings). Shape (n_components, n_assets). 

45 cov: Covariance matrix of the factors. Shape (n_components, n_components). 

46 systematic: Returns explained by the factors. Shape (n_samples, n_assets). 

47 idiosyncratic: Residual returns not explained by factors. Shape (n_samples, n_assets). 

48 

49Example: 

50 >>> import numpy as np 

51 >>> from cvx.linalg import pca 

52 >>> np.random.seed(42) 

53 >>> returns = np.random.randn(50, 4) 

54 >>> result = pca(returns, n_components=2) 

55 >>> result.explained_variance.sum() < 1 

56 True 

57 >>> np.allclose(result.systematic + result.idiosyncratic, returns, atol=1e-10) 

58 True 

59 

60""" 

61 

62 

63def pca(returns: Matrix, n_components: int = 10) -> PCA: 

64 """Compute the first n principal components for a return matrix using SVD. 

65 

66 Unlike most functions in this package, ``pca`` is not NaN-aware: *returns* 

67 must contain only finite values. Clean or impute missing data first. 

68 

69 Args: 

70 returns: Array of asset returns with shape (n_samples, n_assets). 

71 Must not contain NaN or infinite values. 

72 n_components: Number of principal components to extract. Must be 

73 between 1 and ``min(n_samples, n_assets)``. Defaults to 10. 

74 

75 Raises: 

76 InvalidComponentsError: If *n_components* is smaller than 1 or larger 

77 than ``min(n_samples, n_assets)``. 

78 

79 Returns: 

80 PCA named tuple containing: 

81 - explained_variance: Ratio of variance explained by each component 

82 - factors: Factor returns (scores) 

83 - exposure: Factor exposures (loadings) 

84 - cov: Factor covariance matrix 

85 - systematic: Returns explained by factors 

86 - idiosyncratic: Residual returns 

87 

88 Example: 

89 >>> import numpy as np 

90 >>> from cvx.linalg import pca 

91 >>> np.random.seed(42) 

92 >>> returns = np.random.randn(100, 10) 

93 >>> result = pca(returns, n_components=3) 

94 >>> bool(result.explained_variance[0] > result.explained_variance[1]) 

95 True 

96 >>> factor_corr = np.corrcoef(result.factors.T) 

97 >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1)) 

98 True 

99 >>> VtV = result.exposure @ result.exposure.T 

100 >>> bool(np.allclose(VtV, np.eye(3), atol=1e-10)) 

101 True 

102 >>> all(result.explained_variance[i] >= result.explained_variance[i+1] 

103 ... for i in range(len(result.explained_variance)-1)) 

104 True 

105 >>> reconstructed = result.factors @ result.exposure 

106 >>> centered_systematic = result.systematic - returns.mean(axis=0) 

107 >>> bool(np.allclose(reconstructed, centered_systematic, atol=1e-10)) 

108 True 

109 

110 """ 

111 max_components = min(returns.shape) 

112 if not 1 <= n_components <= max_components: 

113 raise InvalidComponentsError(n_components, max_components) 

114 

115 x_mean = returns.mean(axis=0) 

116 x_centered = returns - x_mean 

117 

118 u, s_full, vt = svd(x_centered) 

119 

120 u = u[:, :n_components] 

121 s = s_full[:n_components] 

122 vt = vt[:n_components, :] 

123 

124 factors: Matrix = u * s 

125 exposure: Matrix = vt 

126 explained_variance: Matrix = (s**2) / np.sum(s_full**2) 

127 cov: Matrix = np.atleast_2d(np.cov(factors.T)) 

128 systematic: Matrix = factors @ vt + x_mean 

129 idiosyncratic: Matrix = x_centered - factors @ vt 

130 

131 return PCA( 

132 explained_variance=explained_variance, 

133 factors=factors, 

134 exposure=exposure, 

135 cov=cov, 

136 systematic=systematic, 

137 idiosyncratic=idiosyncratic, 

138 )