Coverage for src/cvx/linalg/covariance/pca.py: 100%
25 statements
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-03 18:56 +0000
« prev ^ index » next coverage.py v7.15.0, created at 2026-07-03 18:56 +0000
1"""PCA analysis (pure NumPy implementation).
3This module provides Principal Component Analysis (PCA) for dimensionality
4reduction of return data. PCA is commonly used to construct factor models
5for portfolio optimization.
7Example:
8 Perform PCA on stock returns:
10 >>> import numpy as np
11 >>> from cvx.linalg import pca
12 >>> np.random.seed(42)
13 >>> returns = np.random.randn(100, 5)
14 >>> result = pca(returns, n_components=3)
15 >>> len(result.explained_variance)
16 3
17 >>> result.factors.shape
18 (100, 3)
19 >>> result.exposure.shape
20 (3, 5)
22"""
24from __future__ import annotations
26from collections import namedtuple
28import numpy as np
30from ..core.exceptions import InvalidComponentsError
31from ..core.types import Matrix
32from ..decomposition.svd import svd
34PCA = namedtuple(
35 "PCA",
36 ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
37)
38"""Named tuple containing the results of PCA analysis.
40Attributes:
41 explained_variance: Explained variance ratio for each component.
42 Shape (n_components,).
43 factors: Factor returns (principal components). Shape (n_samples, n_components).
44 exposure: Factor exposures (loadings). Shape (n_components, n_assets).
45 cov: Covariance matrix of the factors. Shape (n_components, n_components).
46 systematic: Returns explained by the factors. Shape (n_samples, n_assets).
47 idiosyncratic: Residual returns not explained by factors. Shape (n_samples, n_assets).
49Example:
50 >>> import numpy as np
51 >>> from cvx.linalg import pca
52 >>> np.random.seed(42)
53 >>> returns = np.random.randn(50, 4)
54 >>> result = pca(returns, n_components=2)
55 >>> result.explained_variance.sum() < 1
56 True
57 >>> np.allclose(result.systematic + result.idiosyncratic, returns, atol=1e-10)
58 True
60"""
63def pca(returns: Matrix, n_components: int = 10) -> PCA:
64 """Compute the first n principal components for a return matrix using SVD.
66 Unlike most functions in this package, ``pca`` is not NaN-aware: *returns*
67 must contain only finite values. Clean or impute missing data first.
69 Args:
70 returns: Array of asset returns with shape (n_samples, n_assets).
71 Must not contain NaN or infinite values.
72 n_components: Number of principal components to extract. Must be
73 between 1 and ``min(n_samples, n_assets)``. Defaults to 10.
75 Raises:
76 InvalidComponentsError: If *n_components* is smaller than 1 or larger
77 than ``min(n_samples, n_assets)``.
79 Returns:
80 PCA named tuple containing:
81 - explained_variance: Ratio of variance explained by each component
82 - factors: Factor returns (scores)
83 - exposure: Factor exposures (loadings)
84 - cov: Factor covariance matrix
85 - systematic: Returns explained by factors
86 - idiosyncratic: Residual returns
88 Example:
89 >>> import numpy as np
90 >>> from cvx.linalg import pca
91 >>> np.random.seed(42)
92 >>> returns = np.random.randn(100, 10)
93 >>> result = pca(returns, n_components=3)
94 >>> bool(result.explained_variance[0] > result.explained_variance[1])
95 True
96 >>> factor_corr = np.corrcoef(result.factors.T)
97 >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))
98 True
99 >>> VtV = result.exposure @ result.exposure.T
100 >>> bool(np.allclose(VtV, np.eye(3), atol=1e-10))
101 True
102 >>> all(result.explained_variance[i] >= result.explained_variance[i+1]
103 ... for i in range(len(result.explained_variance)-1))
104 True
105 >>> reconstructed = result.factors @ result.exposure
106 >>> centered_systematic = result.systematic - returns.mean(axis=0)
107 >>> bool(np.allclose(reconstructed, centered_systematic, atol=1e-10))
108 True
110 """
111 max_components = min(returns.shape)
112 if not 1 <= n_components <= max_components:
113 raise InvalidComponentsError(n_components, max_components)
115 x_mean = returns.mean(axis=0)
116 x_centered = returns - x_mean
118 u, s_full, vt = svd(x_centered)
120 u = u[:, :n_components]
121 s = s_full[:n_components]
122 vt = vt[:n_components, :]
124 factors: Matrix = u * s
125 exposure: Matrix = vt
126 explained_variance: Matrix = (s**2) / np.sum(s_full**2)
127 cov: Matrix = np.atleast_2d(np.cov(factors.T))
128 systematic: Matrix = factors @ vt + x_mean
129 idiosyncratic: Matrix = x_centered - factors @ vt
131 return PCA(
132 explained_variance=explained_variance,
133 factors=factors,
134 exposure=exposure,
135 cov=cov,
136 systematic=systematic,
137 idiosyncratic=idiosyncratic,
138 )