Coverage for src/cvx/linalg/covariance/pca.py: 100%

1"""PCA analysis (pure NumPy implementation).

3This module provides Principal Component Analysis (PCA) for dimensionality

4reduction of return data. PCA is commonly used to construct factor models

5for portfolio optimization.

7Example:

8 Perform PCA on stock returns:

10 >>> import numpy as np

11 >>> from cvx.linalg import pca

12 >>> np.random.seed(42)

13 >>> returns = np.random.randn(100, 5)

14 >>> result = pca(returns, n_components=3)

15 >>> len(result.explained_variance)

16 3

17 >>> result.factors.shape

18 (100, 3)

19 >>> result.exposure.shape

20 (3, 5)

22"""

24from __future__ import annotations

26from collections import namedtuple

28import numpy as np

30from ..core.exceptions import InvalidComponentsError

31from ..core.types import Matrix

32from ..decomposition.svd import svd

34PCA = namedtuple(

35 "PCA",

36 ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],

37)

38"""Named tuple containing the results of PCA analysis.

40Attributes:

41 explained_variance: Explained variance ratio for each component.

42 Shape (n_components,).

43 factors: Factor returns (principal components). Shape (n_samples, n_components).

44 exposure: Factor exposures (loadings). Shape (n_components, n_assets).

45 cov: Covariance matrix of the factors. Shape (n_components, n_components).

46 systematic: Returns explained by the factors. Shape (n_samples, n_assets).

47 idiosyncratic: Residual returns not explained by factors. Shape (n_samples, n_assets).

49Example:

50 >>> import numpy as np

51 >>> from cvx.linalg import pca

52 >>> np.random.seed(42)

53 >>> returns = np.random.randn(50, 4)

54 >>> result = pca(returns, n_components=2)

55 >>> result.explained_variance.sum() < 1

56 True

57 >>> np.allclose(result.systematic + result.idiosyncratic, returns, atol=1e-10)

58 True

60"""

63def pca(returns: Matrix, n_components: int = 10) -> PCA:

64 """Compute the first n principal components for a return matrix using SVD.

66 Unlike most functions in this package, ``pca`` is not NaN-aware: *returns*

67 must contain only finite values. Clean or impute missing data first.

69 Args:

70 returns: Array of asset returns with shape (n_samples, n_assets).

71 Must not contain NaN or infinite values.

72 n_components: Number of principal components to extract. Must be

73 between 1 and ``min(n_samples, n_assets)``. Defaults to 10.

75 Raises:

76 InvalidComponentsError: If *n_components* is smaller than 1 or larger

77 than ``min(n_samples, n_assets)``.

79 Returns:

80 PCA named tuple containing:

81 - explained_variance: Ratio of variance explained by each component

82 - factors: Factor returns (scores)

83 - exposure: Factor exposures (loadings)

84 - cov: Factor covariance matrix

85 - systematic: Returns explained by factors

86 - idiosyncratic: Residual returns

88 Example:

89 >>> import numpy as np

90 >>> from cvx.linalg import pca

91 >>> np.random.seed(42)

92 >>> returns = np.random.randn(100, 10)

93 >>> result = pca(returns, n_components=3)

94 >>> bool(result.explained_variance[0] > result.explained_variance[1])

95 True

96 >>> factor_corr = np.corrcoef(result.factors.T)

97 >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))

98 True

99 >>> VtV = result.exposure @ result.exposure.T

100 >>> bool(np.allclose(VtV, np.eye(3), atol=1e-10))

101 True

102 >>> all(result.explained_variance[i] >= result.explained_variance[i+1]

103 ... for i in range(len(result.explained_variance)-1))

104 True

105 >>> reconstructed = result.factors @ result.exposure

106 >>> centered_systematic = result.systematic - returns.mean(axis=0)

107 >>> bool(np.allclose(reconstructed, centered_systematic, atol=1e-10))

108 True

109

110 """

111 max_components = min(returns.shape)

112 if not 1 <= n_components <= max_components:

113 raise InvalidComponentsError(n_components, max_components)

114

115 x_mean = returns.mean(axis=0)

116 x_centered = returns - x_mean

117

118 u, s_full, vt = svd(x_centered)

119

120 u = u[:, :n_components]

121 s = s_full[:n_components]

122 vt = vt[:n_components, :]

123

124 factors: Matrix = u * s

125 exposure: Matrix = vt

126 explained_variance: Matrix = (s**2) / np.sum(s_full**2)

127 cov: Matrix = np.atleast_2d(np.cov(factors.T))

128 systematic: Matrix = factors @ vt + x_mean

129 idiosyncratic: Matrix = x_centered - factors @ vt

130

131 return PCA(

132 explained_variance=explained_variance,

133 factors=factors,

134 exposure=exposure,

135 cov=cov,

136 systematic=systematic,

137 idiosyncratic=idiosyncratic,

138 )