Coverage for src / basanos / analytics / _stats.py: 100%
279 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-03-19 05:23 +0000
1"""Statistical metrics and ratios for financial returns.
3This module defines the Stats class which operates on a Data instance to
4compute per-asset statistics like skew, kurtosis, volatility, Sharpe,
5VaR/CVaR, and more.
6"""
8import dataclasses
9from collections.abc import Callable, Iterable
10from datetime import timedelta
11from functools import wraps
12from typing import cast
14import numpy as np
15import polars as pl
16from scipy.stats import norm
def _drawdown_series(series: pl.Series) -> pl.Series:
    """Derive the drawdown fraction series from additive returns.

    The input is interpreted as additive daily returns; a normalised NAV
    curve starting at 1.0 is built via a cumulative sum. Drawdown at each
    point is the fractional distance below the running NAV maximum (the
    high-water mark).

    Args:
        series: A Polars Series of additive returns (profit / AUM).

    Returns:
        A Polars Float64 Series with values in [0, 1]; 0 means the NAV is
        at an all-time high, 0.2 means it is 20 % below its previous peak.

    Examples:
        >>> import polars as pl
        >>> s = pl.Series([0.0, -0.1, 0.2])
        >>> [round(x, 10) for x in _drawdown_series(s).to_list()]
        [0.0, 0.1, 0.0]
    """
    nav_curve = series.cast(pl.Float64).cum_sum() + 1.0
    peak = nav_curve.cum_max()
    # A NAV of exactly 0 would make the drawdown fraction undefined. The
    # NAV starts at 1.0, so that only happens after catastrophic cumulative
    # losses; flooring at 1e-10 avoids a ZeroDivisionError without
    # affecting normal data.
    denominator = peak.clip(lower_bound=1e-10)
    return ((peak - nav_curve) / denominator).clip(lower_bound=0.0)
50def _to_float(value: object) -> float:
51 """Safely convert a Polars aggregation result to float.
53 Examples:
54 >>> _to_float(2.0)
55 2.0
56 >>> _to_float(None)
57 0.0
58 """
59 if value is None:
60 return 0.0
61 if isinstance(value, timedelta):
62 return value.total_seconds()
63 return float(cast(float, value))
66def _to_float_or_none(value: object) -> float | None:
67 """Safely convert a Polars aggregation result to float or None."""
68 if value is None:
69 return None
70 if isinstance(value, timedelta):
71 return value.total_seconds()
72 return float(cast(float, value))
@dataclasses.dataclass(frozen=True)
class Stats:
    """Statistical analysis tools for financial returns data.

    This class provides a comprehensive set of methods for calculating
    various financial metrics and statistics on returns data, including:

    - Basic statistics (mean, skew, kurtosis)
    - Risk metrics (volatility, value-at-risk, drawdown)
    - Performance ratios (Sharpe, information ratio)
    - Win/loss metrics (win rate, profit factor, payoff ratio)

    The class is designed to work with the _Data class and operates on
    Polars DataFrames for efficient computation.

    Attributes:
        data: Polars DataFrame of returns — optionally a ``date`` column
            plus one numeric column per asset.

    Examples:
        >>> import polars as pl
        >>> from datetime import date
        >>> data = pl.DataFrame({
        ...     "date": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 3)],
        ...     "returns": [0.01, -0.02, 0.03],
        ... })
        >>> stats = Stats(data=data)
        >>> stats.assets
        ['returns']
    """

    # Returns table; validated (type and non-emptiness) in __post_init__.
    data: pl.DataFrame
107 def __post_init__(self) -> None:
108 """Validate the input data frame after initialization.
110 Ensures that `data` is a Polars DataFrame and contains at least one
111 row. Raises TypeError or ValueError otherwise.
112 """
113 if not isinstance(self.data, pl.DataFrame):
114 raise TypeError
115 if self.data.height == 0:
116 raise ValueError
118 @property
119 def assets(self) -> list[str]:
120 """List of asset column names (numeric columns excluding 'date')."""
121 return [c for c in self.data.columns if c != "date" and self.data[c].dtype.is_numeric()]
123 @staticmethod
124 def _mean_positive_expr(series: pl.Series) -> float:
125 """Return the mean of strictly positive values, or 0.0 if none exist."""
126 result = series.filter(series > 0).mean()
127 return _to_float(result)
129 @staticmethod
130 def _mean_negative_expr(series: pl.Series) -> float:
131 """Return the mean of strictly negative values, or 0.0 if none exist."""
132 result = series.filter(series < 0).mean()
133 return _to_float(result)
135 @staticmethod
136 def columnwise_stat(func: Callable[..., float | int | None]) -> Callable[..., dict[str, float | int | None]]:
137 """Apply a column-wise statistical function to all numeric columns.
139 Args:
140 func (Callable): The function to decorate.
142 Returns:
143 Callable: The decorated function.
145 """
147 @wraps(func)
148 def wrapper(self: "Stats", *args: object, **kwargs: object) -> dict[str, float | int | None]:
149 """Apply the wrapped stat function to each asset column and return results as a dict."""
150 return {asset: func(self, self.data[asset], *args, **kwargs) for asset in self.assets}
152 return wrapper
154 @columnwise_stat
155 def skew(self, series: pl.Series) -> float | None:
156 """Calculate skewness (asymmetry) for each numeric column.
158 Args:
159 series (pl.Series): The series to calculate skewness for.
161 Returns:
162 float: The skewness value.
164 """
165 return _to_float_or_none(series.skew(bias=False))
    @columnwise_stat
    def kurtosis(self, series: pl.Series) -> float | None:
        """Excess kurtosis of returns (Fisher definition, biased estimator).

        Nulls are dropped first so missing values are ignored. The biased
        (population) moment estimator is used, which — unlike the unbiased
        one — is defined for samples with fewer than 4 observations;
        Polars returns None when the result is still undefined.

        Args:
            series (pl.Series): Return series for a single asset.

        Returns:
            float | None: Excess kurtosis, or None when undefined.
        """
        # Drop nulls to match test expectations (ignore missing values)
        s = series.drop_nulls()
        # Biased estimator (Fisher=True by default in Polars); chosen so
        # short samples still yield a value.
        return _to_float_or_none(s.kurtosis(bias=True))
182 @columnwise_stat
183 def avg_return(self, series: pl.Series) -> float:
184 """Calculate average return per non-zero, non-null value.
186 Args:
187 series (pl.Series): The series to calculate average return for.
189 Returns:
190 float: The average return value.
192 """
193 result = series.filter(series.is_not_null() & (series != 0)).mean()
194 return _to_float(result)
196 @columnwise_stat
197 def avg_win(self, series: pl.Series) -> float:
198 """Calculate the average winning return/trade for an asset.
200 Args:
201 series (pl.Series): The series to calculate average win for.
203 Returns:
204 float: The average winning return.
206 """
207 return self._mean_positive_expr(series)
209 @columnwise_stat
210 def avg_loss(self, series: pl.Series) -> float:
211 """Calculate the average loss return/trade for a period.
213 Args:
214 series (pl.Series): The series to calculate average loss for.
216 Returns:
217 float: The average loss return.
219 """
220 return self._mean_negative_expr(series)
    @columnwise_stat
    def volatility(self, series: pl.Series, periods: int | float | None = None, annualize: bool = True) -> float:
        """Calculate the volatility (standard deviation) of returns.

        Annualised by multiplying by sqrt(periods) when ``annualize`` is
        True.

        Args:
            series (pl.Series): The series to calculate volatility for.
            periods (int | float | None): Number of periods per year.
                Defaults to ``periods_per_year``, estimated from the data's
                timestamp spacing.
            annualize (bool, optional): Whether to annualize the result.
                Defaults to True.

        Returns:
            float: The volatility value.

        Raises:
            TypeError: If the resolved ``periods`` value is not numeric.
        """
        raw_periods = periods or self.periods_per_year
        # Ensure it's numeric
        if not isinstance(raw_periods, int | float):
            raise TypeError
        factor = np.sqrt(raw_periods) if annualize else 1.0
        return _to_float(series.std()) * factor
247 @columnwise_stat
248 def value_at_risk(self, series: pl.Series, sigma: float = 1.0, alpha: float = 0.05) -> float:
249 """Calculate the daily value-at-risk.
251 Uses variance-covariance calculation with confidence level.
253 Args:
254 series (pl.Series): The series to calculate value at risk for.
255 alpha (float, optional): Confidence level. Defaults to 0.05.
256 sigma (float, optional): Standard deviation multiplier. Defaults to 1.0.
258 Returns:
259 float: The value at risk.
261 """
262 mu = _to_float(series.mean())
263 sigma *= _to_float(series.std())
265 return float(norm.ppf(alpha, mu, sigma))
267 @columnwise_stat
268 def conditional_value_at_risk(self, series: pl.Series, sigma: float = 1.0, alpha: float = 0.05) -> float:
269 """Calculate the conditional value-at-risk.
271 Also known as CVaR or expected shortfall, calculated for each numeric column.
273 Args:
274 series (pl.Series): The series to calculate conditional value at risk for.
275 alpha (float, optional): Confidence level. Defaults to 0.05.
276 sigma (float, optional): Standard deviation multiplier. Defaults to 1.0.
278 Returns:
279 float: The conditional value at risk.
281 """
282 mu = _to_float(series.mean())
283 sigma *= _to_float(series.std())
285 var = norm.ppf(alpha, mu, sigma)
287 # Compute mean of returns less than or equal to VaR.
288 # Return NaN when no empirical observations fall below the parametric
289 # VaR threshold (empty filter), rather than the misleading 0.0 that
290 # _to_float(None) would otherwise produce.
291 mask = cast(Iterable[bool], series < var)
292 filtered = series.filter(mask)
293 if filtered.is_empty():
294 return float("nan")
295 return _to_float(filtered.mean())
297 @columnwise_stat
298 def best(self, series: pl.Series) -> float | None:
299 """Find the maximum return per column (best period).
301 Args:
302 series (pl.Series): The series to find the best return for.
304 Returns:
305 float: The maximum return value.
307 """
308 return _to_float_or_none(series.max())
310 @columnwise_stat
311 def worst(self, series: pl.Series) -> float | None:
312 """Find the minimum return per column (worst period).
314 Args:
315 series (pl.Series): The series to find the worst return for.
317 Returns:
318 float: The minimum return value.
320 """
321 return _to_float_or_none(series.min())
323 @columnwise_stat
324 def win_rate(self, series: pl.Series) -> float:
325 """Calculate the win rate (fraction of profitable periods).
327 Counts the proportion of non-null periods where the return is strictly
328 positive.
330 Args:
331 series (pl.Series): The series to calculate win rate for.
333 Returns:
334 float: Win rate in [0, 1], or NaN when the series contains no
335 non-null observations.
337 """
338 non_null = series.drop_nulls()
339 if non_null.is_empty():
340 return float("nan")
341 n_positive = int((non_null > 0).sum())
342 return n_positive / len(non_null)
344 @columnwise_stat
345 def profit_factor(self, series: pl.Series) -> float:
346 """Calculate the profit factor (gross wins / absolute gross losses).
348 A profit factor greater than 1.0 indicates the strategy produces more
349 gross profit than gross loss. Returns ``inf`` when there are no losing
350 periods, ``0.0`` when there are no winning periods, and ``nan`` when
351 there are neither wins nor losses (and no losses).
353 Args:
354 series (pl.Series): The series to calculate profit factor for.
356 Returns:
357 float: The profit factor.
359 """
360 gross_wins = _to_float(series.filter(series > 0).sum())
361 gross_losses = abs(_to_float(series.filter(series < 0).sum()))
362 if gross_losses == 0.0:
363 return float("inf") if gross_wins > 0 else float("nan")
364 return gross_wins / gross_losses
366 @columnwise_stat
367 def payoff_ratio(self, series: pl.Series) -> float:
368 """Calculate the payoff ratio (average win / absolute average loss).
370 Separates edge type — a high payoff ratio implies the strategy wins
371 infrequently but with large magnitude; a low payoff ratio implies
372 frequent small wins. Returns ``nan`` when either the average win or
373 the average loss is zero (no profitable / no losing periods).
375 Args:
376 series (pl.Series): The series to calculate payoff ratio for.
378 Returns:
379 float: The payoff ratio.
381 """
382 avg_w = self._mean_positive_expr(series)
383 avg_l = self._mean_negative_expr(series)
384 if avg_l == 0.0:
385 return float("nan")
386 return avg_w / abs(avg_l)
388 def monthly_win_rate(self) -> dict[str, float]:
389 """Calculate the monthly win rate (fraction of profitable months).
391 Groups the daily returns data by calendar month, computes the
392 compounded return for each month, then returns the fraction of months
393 that had a positive compounded return.
395 Requires a ``date`` column in ``self.data``. When no ``date`` column
396 is present, each asset entry is ``nan``.
398 Returns:
399 dict[str, float]: Monthly win rate in [0, 1] per asset.
401 """
402 if "date" not in self.data.columns:
403 return {asset: float("nan") for asset in self.assets}
405 result: dict[str, float] = {}
406 for asset in self.assets:
407 df = (
408 self.data.select(["date", asset])
409 .drop_nulls()
410 .with_columns(
411 [
412 pl.col("date").dt.year().alias("_year"),
413 pl.col("date").dt.month().alias("_month"),
414 ]
415 )
416 )
417 monthly = (
418 df.group_by(["_year", "_month"])
419 .agg((pl.col(asset) + 1.0).product().alias("gross"))
420 .with_columns((pl.col("gross") - 1.0).alias("monthly_return"))
421 )
422 n_total = len(monthly)
423 if n_total == 0:
424 result[asset] = float("nan")
425 else:
426 n_positive = int((monthly["monthly_return"] > 0).sum())
427 result[asset] = n_positive / n_total
428 return result
430 def worst_n_periods(self, n: int = 5) -> dict[str, list[float | None]]:
431 """Return the N worst return periods per asset.
433 Sorts each asset's returns in ascending order and returns the first
434 ``n`` values. If the series has fewer than ``n`` non-null
435 observations the list is padded with ``None`` on the right.
437 Args:
438 n (int, optional): Number of worst periods to return. Defaults to 5.
440 Returns:
441 dict[str, list[float | None]]: Sorted worst returns per asset.
443 """
444 result: dict[str, list[float | None]] = {}
445 for asset in self.assets:
446 series = self.data[asset].drop_nulls()
447 worst: list[float | None] = series.sort(descending=False).head(n).to_list()
448 while len(worst) < n:
449 worst.append(None)
450 result[asset] = worst
451 return result
453 def up_capture(self, benchmark: pl.Series) -> dict[str, float]:
454 """Calculate the up-market capture ratio relative to a benchmark.
456 Measures the fraction of the benchmark's upside that the strategy
457 captures. Uses geometric means over benchmark up-periods
458 (benchmark > 0). A value greater than 1.0 means the strategy
459 outperformed the benchmark in rising markets.
461 Args:
462 benchmark (pl.Series): Benchmark return series aligned row-by-row
463 with ``self.data``.
465 Returns:
466 dict[str, float]: Up capture ratio per asset.
468 """
469 result: dict[str, float] = {}
470 up_mask = benchmark > 0
471 bench_up = benchmark.filter(up_mask).drop_nulls()
472 if bench_up.is_empty():
473 return {asset: float("nan") for asset in self.assets}
475 bench_geom = float((bench_up + 1.0).product()) ** (1.0 / len(bench_up)) - 1.0
476 if bench_geom == 0.0:
477 return {asset: float("nan") for asset in self.assets}
479 for asset in self.assets:
480 strat_up = self.data[asset].filter(up_mask).drop_nulls()
481 if strat_up.is_empty():
482 result[asset] = float("nan")
483 else:
484 strat_geom = float((strat_up + 1.0).product()) ** (1.0 / len(strat_up)) - 1.0
485 result[asset] = strat_geom / bench_geom
486 return result
488 def down_capture(self, benchmark: pl.Series) -> dict[str, float]:
489 """Calculate the down-market capture ratio relative to a benchmark.
491 Measures the fraction of the benchmark's downside that the strategy
492 captures. Uses geometric means over benchmark down-periods
493 (benchmark < 0). A value less than 1.0 means the strategy lost less
494 than the benchmark in falling markets (a desirable property).
496 Args:
497 benchmark (pl.Series): Benchmark return series aligned row-by-row
498 with ``self.data``.
500 Returns:
501 dict[str, float]: Down capture ratio per asset.
503 """
504 result: dict[str, float] = {}
505 down_mask = benchmark < 0
506 bench_down = benchmark.filter(down_mask).drop_nulls()
507 if bench_down.is_empty():
508 return {asset: float("nan") for asset in self.assets}
510 bench_geom = float((bench_down + 1.0).product()) ** (1.0 / len(bench_down)) - 1.0
511 if bench_geom == 0.0:
512 return {asset: float("nan") for asset in self.assets}
514 for asset in self.assets:
515 strat_down = self.data[asset].filter(down_mask).drop_nulls()
516 if strat_down.is_empty():
517 result[asset] = float("nan")
518 else:
519 strat_geom = float((strat_down + 1.0).product()) ** (1.0 / len(strat_down)) - 1.0
520 result[asset] = strat_geom / bench_geom
521 return result
523 @columnwise_stat
524 def sharpe(self, series: pl.Series, periods: int | float | None = None) -> float:
525 """Calculate the Sharpe ratio of asset returns.
527 Args:
528 series (pl.Series): The series to calculate Sharpe ratio for.
529 periods (int, optional): Number of periods per year. Defaults to 252.
531 Returns:
532 float: The Sharpe ratio value.
534 """
535 periods = periods or self.periods_per_year
537 mean_val = _to_float(series.mean())
538 divisor = _to_float(series.std(ddof=1))
540 # Treat as zero-variance if divisor is zero or indistinguishable from
541 # floating-point noise (i.e. smaller than 10x machine epsilon x |mean|).
542 _eps = np.finfo(np.float64).eps
543 if divisor <= _eps * max(abs(mean_val), _eps) * 10:
544 return float("nan")
546 res = mean_val / divisor
547 factor = periods or 1
548 return float(res * np.sqrt(factor))
550 @columnwise_stat
551 def max_drawdown(self, series: pl.Series) -> float:
552 """Maximum drawdown as a fraction of the high-water mark.
554 Computes the largest peak-to-trough decline in the cumulative additive
555 NAV (starting at 1.0) expressed as a percentage of the peak.
557 Args:
558 series (pl.Series): Series of additive daily returns.
560 Returns:
561 float: Maximum drawdown in the range [0, 1].
563 """
564 return _to_float(_drawdown_series(series).max())
566 @columnwise_stat
567 def avg_drawdown(self, series: pl.Series) -> float:
568 """Average drawdown across all underwater periods.
570 Computes the mean drawdown percentage for every observation where the
571 portfolio is below its previous peak. Returns 0.0 if there are no
572 underwater periods.
574 Args:
575 series (pl.Series): Series of additive daily returns.
577 Returns:
578 float: Mean drawdown in the range [0, 1].
580 """
581 dd = _drawdown_series(series)
582 in_dd = dd.filter(dd > 0)
583 if in_dd.is_empty():
584 return 0.0
585 return _to_float(in_dd.mean())
    def max_drawdown_duration(self) -> dict[str, float | int | None]:
        """Maximum drawdown duration in calendar days (or periods) per asset.

        Identifies consecutive runs of observations where the portfolio NAV is
        below its high-water mark and returns the length of the longest such
        run.

        When a ``date`` column is present the duration is expressed as the
        number of calendar days spanned by the run (inclusive of both
        endpoints). When no ``date`` column exists each row counts as one
        period, so the result is a count of consecutive underwater periods.

        Returns:
            dict[str, float | int | None]: Mapping from asset name to maximum
                drawdown duration. Returns 0 when there are no underwater
                periods.
        """
        has_date = "date" in self.data.columns
        result: dict[str, float | int | None] = {}
        for asset in self.assets:
            series = self.data[asset]
            # Additive NAV starting at 1.0; "underwater" means strictly
            # below the running maximum (high-water mark).
            nav = 1.0 + series.cast(pl.Float64).cum_sum()
            hwm = nav.cum_max()
            in_dd = nav < hwm
            if not in_dd.any():
                result[asset] = 0
                continue
            if has_date:
                frame = pl.DataFrame({"date": self.data["date"], "in_dd": in_dd})
            else:
                # Synthetic integer index so the same run-length logic works
                # without real dates.
                frame = pl.DataFrame({"date": pl.Series(list(range(len(series))), dtype=pl.Int64), "in_dd": in_dd})
            # rle_id assigns one id per consecutive run of equal values,
            # letting us group each underwater stretch separately.
            frame = frame.with_columns(pl.col("in_dd").rle_id().alias("run_id"))
            dd_runs = (
                frame.filter(pl.col("in_dd"))
                .group_by("run_id")
                .agg(
                    [
                        pl.col("date").min().alias("start"),
                        pl.col("date").max().alias("end"),
                    ]
                )
            )
            if has_date:
                # Inclusive calendar-day span of each run.
                dd_runs = dd_runs.with_columns(
                    ((pl.col("end") - pl.col("start")).dt.total_days() + 1).alias("duration")
                )
            else:
                # Inclusive row-count span of each run.
                dd_runs = dd_runs.with_columns((pl.col("end") - pl.col("start") + 1).alias("duration"))
            result[asset] = int(_to_float(dd_runs["duration"].max()))
        return result
646 @columnwise_stat
647 def calmar(self, series: pl.Series, periods: int | float | None = None) -> float:
648 """Calmar ratio (annualized return divided by maximum drawdown).
650 A standard complement to the Sharpe ratio for trend-following and
651 momentum strategies. Returns ``nan`` when the maximum drawdown is
652 zero (no drawdown observed).
654 Args:
655 series (pl.Series): Series of additive daily returns.
656 periods (int | float | None): Annualisation factor (observations
657 per year). Defaults to ``periods_per_year``.
659 Returns:
660 float: Calmar ratio, or ``nan`` if max drawdown is zero.
662 """
663 raw_periods = periods or self.periods_per_year
664 max_dd = _to_float(_drawdown_series(series).max())
665 if max_dd <= 0:
666 return float("nan")
667 ann_return = _to_float(series.mean()) * raw_periods
668 return ann_return / max_dd
670 @columnwise_stat
671 def recovery_factor(self, series: pl.Series) -> float:
672 """Recovery factor (total return divided by maximum drawdown).
674 A robustness signal for systematic strategies: values well above 1
675 indicate that cumulative profits are large relative to the worst
676 historical loss. Returns ``nan`` when the maximum drawdown is zero.
678 Args:
679 series (pl.Series): Series of additive daily returns.
681 Returns:
682 float: Recovery factor, or ``nan`` if max drawdown is zero.
684 """
685 max_dd = _to_float(_drawdown_series(series).max())
686 if max_dd <= 0:
687 return float("nan")
688 total_return = _to_float(series.sum())
689 return total_return / max_dd
691 def rolling_sharpe(self, window: int = 63, periods: int | float | None = None) -> pl.DataFrame:
692 """Compute rolling annualised Sharpe ratio over a sliding window.
694 Args:
695 window: Number of periods in the rolling window. Defaults to 63.
696 periods: Number of periods per year for annualisation. Defaults to
697 ``periods_per_year``.
699 Returns:
700 pl.DataFrame: A DataFrame with the date column (when present) and
701 one column per asset. The first ``window - 1`` rows will be
702 null.
704 Raises:
705 ValueError: If ``window`` is not a positive integer.
707 """
708 if not isinstance(window, int) or window <= 0:
709 raise ValueError
711 scale = np.sqrt(periods or self.periods_per_year)
713 exprs = [
714 (
715 pl.col(asset).rolling_mean(window_size=window) / pl.col(asset).rolling_std(window_size=window) * scale
716 ).alias(asset)
717 for asset in self.assets
718 ]
720 cols: list[str | pl.Expr] = (["date"] if "date" in self.data.columns else []) + exprs
721 return self.data.select(cols)
723 def rolling_volatility(
724 self, window: int = 63, periods: int | float | None = None, annualize: bool = True
725 ) -> pl.DataFrame:
726 """Compute rolling volatility over a sliding window.
728 Args:
729 window: Number of periods in the rolling window. Defaults to 63.
730 periods: Number of periods per year for annualisation. Defaults to
731 ``periods_per_year``.
732 annualize: Whether to annualise the result by multiplying by
733 ``sqrt(periods)``. Defaults to True.
735 Returns:
736 pl.DataFrame: A DataFrame with the date column (when present) and
737 one column per asset. The first ``window - 1`` rows will be
738 null.
740 Raises:
741 ValueError: If ``window`` is not a positive integer.
742 TypeError: If ``periods`` is not numeric.
744 """
745 if not isinstance(window, int) or window <= 0:
746 raise ValueError
748 raw_periods = periods or self.periods_per_year
749 if not isinstance(raw_periods, int | float):
750 raise TypeError
752 factor = np.sqrt(raw_periods) if annualize else 1.0
754 exprs = [(pl.col(asset).rolling_std(window_size=window) * factor).alias(asset) for asset in self.assets]
756 cols: list[str | pl.Expr] = (["date"] if "date" in self.data.columns else []) + exprs
757 return self.data.select(cols)
759 def annual_breakdown(self) -> pl.DataFrame:
760 """Return summary statistics broken down by calendar year.
762 Groups the data by calendar year using the ``date`` column, computes
763 a full :py:meth:`summary` for each year, and stacks the results into
764 a single DataFrame with an additional ``year`` column.
766 Returns:
767 pl.DataFrame: A DataFrame with columns ``year``, ``metric``, and
768 one column per asset, sorted by ``year``.
770 Raises:
771 ValueError: If the DataFrame has no ``date`` column.
773 """
774 if "date" not in self.data.columns:
775 raise ValueError
777 years = self.data["date"].dt.year().unique().sort().to_list()
779 frames: list[pl.DataFrame] = []
780 for year in years:
781 year_data = self.data.filter(self.data["date"].dt.year() == year)
782 if year_data.height < 2:
783 continue
784 year_summary = Stats(year_data).summary()
785 year_summary = year_summary.with_columns(pl.lit(year).alias("year"))
786 frames.append(year_summary)
788 if not frames:
789 # Build empty DataFrame with expected schema
790 schema = {"year": pl.Int32, "metric": pl.String, **dict.fromkeys(self.assets, pl.Float64)}
791 return pl.DataFrame(schema=schema)
793 result = pl.concat(frames)
794 # Move 'year' to front
795 ordered = ["year", "metric", *[c for c in result.columns if c not in ("year", "metric")]]
796 return result.select(ordered)
798 def summary(self) -> pl.DataFrame:
799 """Return a DataFrame summarising all statistics for each asset.
801 Each row corresponds to one statistical metric; each column (beyond
802 the ``metric`` column) corresponds to one asset in the portfolio.
804 Returns:
805 pl.DataFrame: A DataFrame with a ``metric`` column followed by one
806 column per asset, containing the computed statistic values.
808 """
809 metrics: dict[str, dict[str, float | int | None] | dict[str, float | int]] = {
810 "avg_return": self.avg_return(),
811 "avg_win": self.avg_win(),
812 "avg_loss": self.avg_loss(),
813 "win_rate": self.win_rate(),
814 "profit_factor": self.profit_factor(),
815 "payoff_ratio": self.payoff_ratio(),
816 "monthly_win_rate": self.monthly_win_rate(),
817 "best": self.best(),
818 "worst": self.worst(),
819 "volatility": self.volatility(),
820 "sharpe": self.sharpe(),
821 "skew": self.skew(),
822 "kurtosis": self.kurtosis(),
823 "value_at_risk": self.value_at_risk(),
824 "conditional_value_at_risk": self.conditional_value_at_risk(),
825 "max_drawdown": self.max_drawdown(),
826 "avg_drawdown": self.avg_drawdown(),
827 "max_drawdown_duration": self.max_drawdown_duration(),
828 "calmar": self.calmar(),
829 "recovery_factor": self.recovery_factor(),
830 }
832 rows: list[dict[str, object]] = [
833 {"metric": name, **{asset: values[asset] for asset in self.assets}} for name, values in metrics.items()
834 ]
836 return pl.DataFrame(rows)
838 @property
839 def periods_per_year(self) -> float:
840 """Estimate the number of periods per year from timestamp spacing.
842 Computes the average spacing (in seconds) between consecutive timestamps using
843 plain Python datetimes to avoid ambiguity around Polars Duration arithmetic,
844 then returns 365 * 24 * 3600 divided by that spacing.
846 Returns:
847 float: Estimated number of observations per calendar year.
848 """
849 # Extract datetime values as Python objects (assuming a single datetime column)
850 col_name = self.data.columns[0]
851 dates = self.data[col_name]
853 # Index is guaranteed to have at least two rows by __post_init__,
854 # so we can compute gaps directly after sorting.
855 dates = dates.sort()
856 # Compute successive differences in seconds
857 gaps = dates.diff().drop_nulls()
859 mean_diff = gaps.mean()
861 # Convert Duration (timedelta) to seconds
862 if isinstance(mean_diff, timedelta):
863 seconds = mean_diff.total_seconds()
864 elif mean_diff is not None:
865 seconds = _to_float(mean_diff)
866 else:
867 # Fallback to daily if mean_diff is None
868 seconds = 86400.0
870 return (365.0 * 24.0 * 60.0 * 60.0) / seconds