Coverage for src / jquantstats / data.py: 99%
183 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-07 14:28 +0000
1"""Financial returns data container and manipulation utilities."""
3from __future__ import annotations
5import dataclasses
6import warnings
7from collections.abc import Iterator
8from datetime import date, datetime, timedelta
9from typing import TYPE_CHECKING, Literal, cast
11import narwhals as nw
12import polars as pl
14from ._types import NativeFrame, NativeFrameOrScalar
15from .exceptions import NullsInReturnsError
17if TYPE_CHECKING:
18 from ._plots import DataPlots
19 from ._reports import Reports
20 from ._stats import Stats
21 from ._utils import DataUtils
def _to_polars(df: NativeFrame) -> pl.DataFrame:
    """Coerce a narwhals-compatible frame into an eager polars DataFrame.

    Frames that are already polars are returned unchanged (no copy is made);
    anything else is routed through narwhals' native-frame adapter and then
    materialised as polars.
    """
    already_polars = isinstance(df, pl.DataFrame)
    if already_polars:
        return df
    wrapped = nw.from_native(df, eager_only=True)
    return wrapped.to_polars()
def _apply_null_strategy(
    dframe: pl.DataFrame,
    date_col: str,
    frame_name: str,
    null_strategy: Literal["raise", "drop", "forward_fill"] | None,
) -> pl.DataFrame:
    """Check for nulls in *dframe* and apply *null_strategy*.

    Parameters
    ----------
    dframe : pl.DataFrame
        DataFrame to inspect. The date column is excluded from the null scan.
    date_col : str
        Name of the column to treat as the date index (excluded from null check).
    frame_name : str
        Descriptive name used in the error message (e.g. ``"returns"``).
    null_strategy : {"raise", "drop", "forward_fill"} | None
        How to handle null values:

        - ``None`` — leave nulls as-is (current default behaviour; nulls will
          propagate through calculations).
        - ``"raise"`` — raise :exc:`~jquantstats.exceptions.NullsInReturnsError`
          if any null is found.
        - ``"drop"`` — drop every row that contains at least one null value.
        - ``"forward_fill"`` — fill each null with the most recent non-null
          value in the same column.

    Returns:
    -------
    pl.DataFrame
        The original DataFrame (``None`` / ``"raise"``), a filtered DataFrame
        (``"drop"``), or a filled DataFrame (``"forward_fill"``).

    Raises:
    ------
    NullsInReturnsError
        When *null_strategy* is ``"raise"`` and nulls are present.

    """
    if null_strategy is None:
        return dframe

    value_cols = [c for c in dframe.columns if c != date_col]
    # null_count() yields exactly one count per selected column, so the row
    # tuple always has len(value_cols) entries.
    null_counts = dframe.select(value_cols).null_count().row(0)
    # strict=True: the two sequences are derived from the same selection and
    # must match in length — fail loudly instead of silently truncating if a
    # future refactor ever breaks that invariant (was strict=False).
    cols_with_nulls = [col for col, count in zip(value_cols, null_counts, strict=True) if count > 0]

    if not cols_with_nulls:
        return dframe

    if null_strategy == "raise":
        raise NullsInReturnsError(frame_name, cols_with_nulls)
    if null_strategy == "drop":
        return dframe.drop_nulls(subset=value_cols)
    # forward_fill: fill each null with the most recent non-null value per column
    return dframe.with_columns([pl.col(c).forward_fill() for c in value_cols])
def _subtract_risk_free(dframe: pl.DataFrame, rf: float | pl.DataFrame, date_col: str) -> pl.DataFrame:
    """Subtract the risk-free rate from all numeric columns in the DataFrame.

    Parameters
    ----------
    dframe : pl.DataFrame
        DataFrame containing returns data with a date column
        and one or more numeric columns representing asset returns.

    rf : float | pl.DataFrame
        Risk-free rate to subtract from returns.

        - If float: A constant risk-free rate applied to all dates.
        - If pl.DataFrame: A DataFrame with a date column and a second column
          containing time-varying risk-free rates.

    date_col : str
        Name of the date column in both DataFrames for joining
        when rf is a DataFrame.

    Returns:
    -------
    pl.DataFrame
        DataFrame with the risk-free rate subtracted from all numeric columns,
        preserving the original column names.

    Raises:
    ------
    TypeError
        When *rf* is neither a float nor a pl.DataFrame (note: a plain ``int``
        is deliberately rejected as well).

    """
    if isinstance(rf, float):
        # Constant rate: broadcast it across every date in the input frame.
        rf_dframe = dframe.select([pl.col(date_col), pl.lit(rf).alias("rf")])
    elif isinstance(rf, pl.DataFrame):
        # Time-varying rate: the second column holds the rate values.
        # Evaluate the rename condition once (previously checked twice).
        rate_col = rf.columns[1]
        if rate_col != "rf":
            warnings.warn(
                f"Risk-free rate column '{rate_col}' has been renamed to 'rf' for internal alignment.",
                stacklevel=3,
            )
            rf_dframe = rf.rename({rate_col: "rf"})
        else:
            rf_dframe = rf
    else:
        raise TypeError("rf must be a float or DataFrame")  # noqa: TRY003

    # Inner join keeps only dates present in both frames, aligning the rate
    # with each return observation.
    dframe = dframe.join(rf_dframe, on=date_col, how="inner")
    return dframe.select(
        [pl.col(date_col)]
        + [(pl.col(col) - pl.col("rf")).alias(col) for col in dframe.columns if col not in {date_col, "rf"}]
    )
@dataclasses.dataclass(frozen=True, slots=True)
class Data:
    """A container for financial returns data and an optional benchmark.

    This class provides methods for analyzing and manipulating financial returns data,
    including converting returns to prices, calculating drawdowns, and resampling data
    to different time periods. It also provides access to statistical metrics through
    the stats property and visualization through the plots property.

    Attributes:
        returns (pl.DataFrame): DataFrame containing returns data with assets as columns.
        benchmark (pl.DataFrame, optional): DataFrame containing benchmark returns data.
            Defaults to None.
        index (pl.DataFrame): DataFrame containing the date index for the returns data.

    """

    # Asset returns, one column per asset; row-aligned with ``index``.
    returns: pl.DataFrame
    # Single-column frame holding the date (or integer) index.
    index: pl.DataFrame
    # Optional benchmark returns, row-aligned with ``index``.
    benchmark: pl.DataFrame | None = None

    def __post_init__(self) -> None:
        """Validate the Data object after initialization."""
        # You need at least two points
        if self.index.shape[0] < 2:
            raise ValueError("Index must contain at least two timestamps.")  # noqa: TRY003

        # Check index is monotonically increasing
        datetime_col = self.index[self.index.columns[0]]
        if not datetime_col.is_sorted():
            raise ValueError("Index must be monotonically increasing.")  # noqa: TRY003

        # Check row count matches returns
        if self.returns.shape[0] != self.index.shape[0]:
            raise ValueError("Returns and index must have the same number of rows.")  # noqa: TRY003

        # Check row count matches benchmark (if provided)
        if self.benchmark is not None and self.benchmark.shape[0] != self.index.shape[0]:
            raise ValueError("Benchmark and index must have the same number of rows.")  # noqa: TRY003

    @classmethod
    def from_returns(
        cls,
        returns: NativeFrame,
        rf: NativeFrameOrScalar = 0.0,
        benchmark: NativeFrame | None = None,
        date_col: str = "Date",
        null_strategy: Literal["raise", "drop", "forward_fill"] | None = None,
    ) -> Data:
        """Create a Data object from returns and optional benchmark.

        Parameters
        ----------
        returns : NativeFrame
            Financial returns data. First column should be the date column,
            remaining columns are asset returns.

        rf : float | NativeFrame, optional
            Risk-free rate. Default is 0.0 (no risk-free rate adjustment).

            - If float: Constant risk-free rate applied to all dates.
            - If NativeFrame: Time-varying risk-free rate with dates matching returns.

        benchmark : NativeFrame | None, optional
            Benchmark returns. Default is None (no benchmark).
            First column should be the date column, remaining columns are benchmark returns.

        date_col : str, optional
            Name of the date column in the DataFrames. Default is "Date".

        null_strategy : {"raise", "drop", "forward_fill"} | None, optional
            How to handle ``null`` (missing) values in *returns* and *benchmark*.
            Default is ``None`` (nulls are left as-is and will propagate through
            calculations, matching the current Polars behaviour).

            - ``None`` — no null checking; nulls propagate through all
              downstream calculations. This matches Polars' default semantics.
            - ``"raise"`` — raise :exc:`~jquantstats.exceptions.NullsInReturnsError`
              if any null is found. Use this to be notified of missing data
              and clean it yourself before construction.
            - ``"drop"`` — silently drop every row that contains at least one null.
              Mirrors the pandas/QuantStats silent-drop behaviour.
            - ``"forward_fill"`` — fill each null with the most recent non-null value
              in the same column.

            .. note::
                This parameter affects only Polars ``null`` values (i.e. ``None`` /
                missing entries). IEEE-754 ``NaN`` values (``float("nan")``) are not
                nulls in Polars and are **not** affected — they continue to propagate
                through calculations as per IEEE-754 semantics.

        Returns:
        -------
        Data
            Object containing excess returns and benchmark (if any), with methods for
            analysis and visualization through the ``stats`` and ``plots`` properties.

        Raises:
        ------
        NullsInReturnsError
            If *null_strategy* is ``"raise"`` and the data contains null values.
        ValueError
            If there are no overlapping dates between returns and benchmark.

        Examples:
        --------
        Basic usage:

        ```python
        from jquantstats import Data
        import polars as pl

        returns = pl.DataFrame({
            "Date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "Asset1": [0.01, -0.02, 0.03]
        }).with_columns(pl.col("Date").str.to_date())

        data = Data.from_returns(returns=returns)
        ```

        With benchmark and risk-free rate:

        ```python
        benchmark = pl.DataFrame({
            "Date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "Market": [0.005, -0.01, 0.02]
        }).with_columns(pl.col("Date").str.to_date())

        data = Data.from_returns(returns=returns, benchmark=benchmark, rf=0.0002)
        ```

        Handling nulls automatically:

        ```python
        returns_with_nulls = pl.DataFrame({
            "Date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "Asset1": [0.01, None, 0.03]
        }).with_columns(pl.col("Date").str.to_date())

        # Drop rows with nulls (mirrors pandas/QuantStats behaviour)
        data = Data.from_returns(returns=returns_with_nulls, null_strategy="drop")

        # Or forward-fill nulls
        data = Data.from_returns(returns=returns_with_nulls, null_strategy="forward_fill")
        ```

        """
        # Normalise any narwhals-compatible input to polars up front.
        returns_pl = _to_polars(returns)
        benchmark_pl = _to_polars(benchmark) if benchmark is not None else None
        rf_converted: float | pl.DataFrame
        if isinstance(rf, pl.DataFrame) or (not isinstance(rf, float) and not isinstance(rf, int)):
            rf_converted = _to_polars(rf)
        else:
            rf_converted = rf  # int is not float/DataFrame: _subtract_risk_free raises TypeError

        # Apply the null policy BEFORE aligning dates, so "drop" removes
        # null rows from each frame independently.
        returns_pl = _apply_null_strategy(returns_pl, date_col, "returns", null_strategy)
        if benchmark_pl is not None:
            benchmark_pl = _apply_null_strategy(benchmark_pl, date_col, "benchmark", null_strategy)

        if benchmark_pl is not None:
            # Restrict both frames to their common dates (inner join on the date column).
            joined_dates = returns_pl.join(benchmark_pl, on=date_col, how="inner").select(date_col)
            if joined_dates.is_empty():
                raise ValueError("No overlapping dates between returns and benchmark.")  # noqa: TRY003
            returns_pl = returns_pl.join(joined_dates, on=date_col, how="inner")
            benchmark_pl = benchmark_pl.join(joined_dates, on=date_col, how="inner")

        index = returns_pl.select(date_col)
        # Store excess returns (risk-free rate already subtracted).
        excess_returns = _subtract_risk_free(returns_pl, rf_converted, date_col).drop(date_col)
        excess_benchmark = (
            _subtract_risk_free(benchmark_pl, rf_converted, date_col).drop(date_col)
            if benchmark_pl is not None
            else None
        )

        return cls(returns=excess_returns, benchmark=excess_benchmark, index=index)

    @classmethod
    def from_prices(
        cls,
        prices: NativeFrame,
        rf: NativeFrameOrScalar = 0.0,
        benchmark: NativeFrame | None = None,
        date_col: str = "Date",
        null_strategy: Literal["raise", "drop", "forward_fill"] | None = None,
    ) -> Data:
        """Create a Data object from prices and optional benchmark.

        Converts price levels to returns via percentage change and delegates
        to :meth:`from_returns`. The first row of each asset is dropped
        because no prior price is available to compute a return.

        Parameters
        ----------
        prices : NativeFrame
            Price-level data. First column should be the date column;
            remaining columns are asset prices.

        rf : float | NativeFrame, optional
            Risk-free rate. Forwarded unchanged to :meth:`from_returns`.
            Default is 0.0 (no risk-free rate adjustment).

        benchmark : NativeFrame | None, optional
            Benchmark prices. Converted to returns in the same way as
            ``prices`` before being forwarded to :meth:`from_returns`.
            Default is None (no benchmark).

        date_col : str, optional
            Name of the date column in the DataFrames. Default is ``"Date"``.

        null_strategy : {"raise", "drop", "forward_fill"} | None, optional
            How to handle ``null`` (missing) values after converting prices to
            returns. Forwarded unchanged to :meth:`from_returns`.
            Default is ``None`` (nulls propagate through calculations).

            - ``None`` — no null checking; nulls propagate.
            - ``"raise"`` — raise :exc:`~jquantstats.exceptions.NullsInReturnsError`
              if any null is found in the derived returns.
            - ``"drop"`` — silently drop every row that contains at least one null.
            - ``"forward_fill"`` — fill each null with the most recent non-null value.

            .. note::
                Prices that contain nulls will produce null returns via
                ``pct_change()``. If you expect missing price entries, pass
                ``null_strategy="drop"`` or ``null_strategy="forward_fill"``.

        Returns:
        -------
        Data
            Object containing excess returns derived from the supplied prices,
            with methods for analysis and visualization through the ``stats``
            and ``plots`` properties.

        Examples:
        --------
        ```python
        from jquantstats import Data
        import polars as pl

        prices = pl.DataFrame({
            "Date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "Asset1": [100.0, 101.0, 99.0]
        }).with_columns(pl.col("Date").str.to_date())

        data = Data.from_prices(prices=prices)
        ```

        """
        prices_pl = _to_polars(prices)
        asset_cols = [c for c in prices_pl.columns if c != date_col]
        # pct_change leaves the first row null; slice(1) drops it.
        returns_pl = prices_pl.with_columns([pl.col(c).pct_change().alias(c) for c in asset_cols]).slice(1)

        benchmark_returns: NativeFrame | None = None
        if benchmark is not None:
            benchmark_pl = _to_polars(benchmark)
            bench_cols = [c for c in benchmark_pl.columns if c != date_col]
            benchmark_returns = benchmark_pl.with_columns([pl.col(c).pct_change().alias(c) for c in bench_cols]).slice(
                1
            )

        return cls.from_returns(
            returns=returns_pl,
            rf=rf,
            benchmark=benchmark_returns,
            date_col=date_col,
            null_strategy=null_strategy,
        )

    def __repr__(self) -> str:
        """Return a string representation of the Data object."""
        rows = len(self.index)
        date_cols = self.date_col
        if date_cols:
            date_column = date_cols[0]
            start = self.index[date_column].min()
            end = self.index[date_column].max()
            return f"Data(assets={self.assets}, rows={rows}, start={start}, end={end})"
        return f"Data(assets={self.assets}, rows={rows})"  # pragma: no cover # __post_init__ requires ≥1 index column

    @property
    def plots(self) -> DataPlots:
        """Provides access to visualization methods for the financial data.

        Returns:
            DataPlots: An instance of the DataPlots class initialized with this data.

        """
        # Imported lazily (mirrors the TYPE_CHECKING guard at module top),
        # presumably to avoid an import cycle at module load — see _plots.
        from ._plots import DataPlots

        return DataPlots(self)

    @property
    def stats(self) -> Stats:
        """Provides access to statistical analysis methods for the financial data.

        Returns:
            Stats: An instance of the Stats class initialized with this data.

        """
        # Lazy import, same rationale as ``plots``.
        from ._stats import Stats

        return Stats(self)

    @property
    def reports(self) -> Reports:
        """Provides access to reporting methods for the financial data.

        Returns:
            Reports: An instance of the Reports class initialized with this data.

        """
        # Lazy import, same rationale as ``plots``.
        from ._reports import Reports

        return Reports(self)

    @property
    def utils(self) -> DataUtils:
        """Provides access to utility transforms and conversions for the financial data.

        Returns:
            DataUtils: An instance of the DataUtils class initialized with this data.

        """
        # Lazy import, same rationale as ``plots``.
        from ._utils import DataUtils

        return DataUtils(self)

    @property
    def date_col(self) -> list[str]:
        """Return the column names of the index DataFrame.

        Returns:
            list[str]: List of column names in the index DataFrame, typically containing
                the date column name.

        """
        return list(self.index.columns)

    @property
    def assets(self) -> list[str]:
        """Return the combined list of asset column names from returns and benchmark.

        Returns:
            list[str]: List of all asset column names from both returns and benchmark
                (if available).

        """
        if self.benchmark is not None:
            return list(self.returns.columns) + list(self.benchmark.columns)
        return list(self.returns.columns)

    @property
    def all(self) -> pl.DataFrame:
        """Combine index, returns, and benchmark data into a single DataFrame.

        This property provides a convenient way to access all data in a single DataFrame,
        which is useful for analysis and visualization.

        Returns:
            pl.DataFrame: A DataFrame containing the index, all returns data, and benchmark data
                (if available) combined horizontally.

        """
        # Horizontal concat is safe because __post_init__ guarantees all
        # frames have the same row count.
        if self.benchmark is None:
            return pl.concat([self.index, self.returns], how="horizontal")
        else:
            return pl.concat([self.index, self.returns, self.benchmark], how="horizontal")

    def resample(self, every: str = "1mo") -> Data:
        """Resamples returns and benchmark to a different frequency using Polars.

        Args:
            every (str, optional): Resampling frequency (e.g., '1mo', '1y'). Defaults to '1mo'.

        Returns:
            Data: Resampled data.

        """

        def resample_frame(dframe: pl.DataFrame) -> pl.DataFrame:
            """Resample a single DataFrame to the target frequency using compound returns."""
            dframe = self.index.hstack(dframe)  # Add the date column for resampling
            # Compound the simple returns within each bucket: prod(1 + r) - 1.
            # NOTE(review): assumes a temporal index — group_by_dynamic needs
            # a date/datetime index column; confirm integer-indexed Data is
            # not expected to hit this path.
            return dframe.group_by_dynamic(
                index_column=self.index.columns[0], every=every, period=every, closed="right", label="right"
            ).agg(
                [
                    ((pl.col(col) + 1.0).product() - 1.0).alias(col)
                    for col in dframe.columns
                    if col != self.index.columns[0]
                ]
            )

        resampled_returns = resample_frame(self.returns)
        resampled_benchmark = resample_frame(self.benchmark) if self.benchmark is not None else None
        # The grouped returns frame carries the new (bucketed) index.
        resampled_index = resampled_returns.select(self.index.columns[0])

        return Data(
            returns=resampled_returns.drop(self.index.columns[0]),
            benchmark=resampled_benchmark.drop(self.index.columns[0]) if resampled_benchmark is not None else None,
            index=resampled_index,
        )

    def describe(self) -> pl.DataFrame:
        """Return a tidy summary of shape, date range and asset names.

        Returns:
        -------
        pl.DataFrame
            One row per asset with columns: asset, start, end, rows, has_benchmark.

        """
        date_column = self.date_col[0]
        start = self.index[date_column].min()
        end = self.index[date_column].max()
        rows = len(self.index)
        # Broadcast the scalar summary values to one row per returns column.
        return pl.DataFrame(
            {
                "asset": self.returns.columns,
                "start": [start] * len(self.returns.columns),
                "end": [end] * len(self.returns.columns),
                "rows": [rows] * len(self.returns.columns),
                "has_benchmark": [self.benchmark is not None] * len(self.returns.columns),
            }
        )

    def copy(self) -> Data:
        """Create a deep copy of the Data object.

        Returns:
            Data: A new Data object with copies of the returns and benchmark.

        """
        if self.benchmark is not None:
            return Data(returns=self.returns.clone(), benchmark=self.benchmark.clone(), index=self.index.clone())
        return Data(returns=self.returns.clone(), index=self.index.clone())

    def head(self, n: int = 5) -> Data:
        """Return the first n rows of the combined returns and benchmark data.

        Args:
            n (int, optional): Number of rows to return. Defaults to 5.

        Returns:
            Data: A new Data object containing the first n rows of the combined data.

        """
        # NOTE(review): __post_init__ requires ≥2 rows, so n < 2 raises ValueError.
        benchmark_head = self.benchmark.head(n) if self.benchmark is not None else None
        return Data(returns=self.returns.head(n), benchmark=benchmark_head, index=self.index.head(n))

    def tail(self, n: int = 5) -> Data:
        """Return the last n rows of the combined returns and benchmark data.

        Args:
            n (int, optional): Number of rows to return. Defaults to 5.

        Returns:
            Data: A new Data object containing the last n rows of the combined data.

        """
        # NOTE(review): __post_init__ requires ≥2 rows, so n < 2 raises ValueError.
        benchmark_tail = self.benchmark.tail(n) if self.benchmark is not None else None
        return Data(returns=self.returns.tail(n), benchmark=benchmark_tail, index=self.index.tail(n))

    def truncate(
        self,
        start: date | datetime | str | int | None = None,
        end: date | datetime | str | int | None = None,
    ) -> Data:
        """Return a new Data object truncated to the inclusive [start, end] range.

        When the index is temporal (Date/Datetime), truncation is performed by
        comparing the date column against ``start`` and ``end`` values.

        When the index is integer-based, row slicing is used instead, and
        ``start`` and ``end`` must be non-negative integers. Passing
        non-integer bounds to an integer-indexed Data raises :exc:`TypeError`.

        Args:
            start: Optional lower bound (inclusive). A date/datetime value
                when the index is temporal; a non-negative :class:`int` row
                index when the data has no temporal index.
            end: Optional upper bound (inclusive). Same type rules as
                ``start``.

        Returns:
            Data: A new Data object filtered to the specified range.

        Raises:
            TypeError: When the index is not temporal and a non-integer bound
                is supplied.

        """
        date_column = self.index.columns[0]
        is_temporal = self.index[date_column].dtype.is_temporal()

        if is_temporal:
            # Build a boolean mask from the (optional) bounds, then apply the
            # same mask to index, returns, and benchmark to keep them aligned.
            cond = pl.lit(True)
            if start is not None:
                cond = cond & (pl.col(date_column) >= pl.lit(start))
            if end is not None:
                cond = cond & (pl.col(date_column) <= pl.lit(end))
            mask = self.index.select(cond.alias("mask"))["mask"]
            new_index = self.index.filter(mask)
            new_returns = self.returns.filter(mask)
            new_benchmark = self.benchmark.filter(mask) if self.benchmark is not None else None
        else:
            # NOTE(review): isinstance(x, int) also accepts bool — confirm intended.
            if start is not None and not isinstance(start, int):
                raise TypeError(f"start must be an integer, got {type(start).__name__}.")  # noqa: TRY003
            if end is not None and not isinstance(end, int):
                raise TypeError(f"end must be an integer, got {type(end).__name__}.")  # noqa: TRY003
            row_start = start if start is not None else 0
            # ``end`` is inclusive, hence the +1 when converting to a slice bound.
            row_end = end + 1 if end is not None else self.index.height
            length = max(0, row_end - row_start)
            new_index = self.index.slice(row_start, length)
            new_returns = self.returns.slice(row_start, length)
            new_benchmark = self.benchmark.slice(row_start, length) if self.benchmark is not None else None

        return Data(returns=new_returns, benchmark=new_benchmark, index=new_index)

    @property
    def _periods_per_year(self) -> float:
        """Estimate the number of periods per year based on average frequency in the index.

        For temporal (Date/Datetime) indices, computes the mean gap between observations
        and converts to an annualised period count (e.g. ~252 for daily, ~52 for weekly).

        For integer indices (date-free portfolios), falls back to 252 trading days per year
        because integer diffs have no time meaning.
        """
        datetime_col = self.index[self.index.columns[0]]

        if not datetime_col.dtype.is_temporal():
            return 252.0

        sorted_dt = datetime_col.sort()
        diffs = sorted_dt.diff().drop_nulls()
        mean_diff = diffs.mean()

        if isinstance(mean_diff, timedelta):
            seconds = mean_diff.total_seconds()
        else:  # pragma: no cover # Polars always returns timedelta for temporal diff
            seconds = cast(float, mean_diff) if mean_diff is not None else 1.0

        # Calendar seconds per year divided by the mean observation gap:
        # trading-daily data (mean gap ≈ 1.45 calendar days) lands near 252.
        return (365 * 24 * 60 * 60) / seconds

    def items(self) -> Iterator[tuple[str, pl.Series]]:
        """Iterate over all assets and their corresponding data series.

        This method provides a convenient way to iterate over all assets in the data,
        yielding each asset name and its corresponding data series.

        Yields:
            tuple[str, pl.Series]: A tuple containing the asset name and its data series.

        """
        # Materialise the combined frame once, then yield column views from it.
        matrix = self.all

        for col in self.assets:
            yield col, matrix.get_column(col)