Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`to_hdf` by avoiding unnecessary reopenings of the HDF5 file, which speeds up adding data to files with a very large number of groups. (:issue:`58248`)
- Performance improvement in ``DataFrame.plot(kind="line")``: very wide DataFrames (more than 200 columns) are now rendered with a single :class:`matplotlib.collections.LineCollection` instead of one ``Line2D`` per column, reducing draw time by roughly 5x on a 2000-column frame. (:issue:`61532`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
- Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`)
Expand Down
123 changes: 88 additions & 35 deletions pandas/plotting/_matplotlib/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@
Series,
)

import itertools

from matplotlib.collections import LineCollection


def holds_integer(column: Index) -> bool:
    """
    Report whether ``column`` is inferred to hold integer labels.

    Returns True when ``column.inferred_type`` is either ``"integer"`` or
    ``"mixed-integer"``, and False for every other inferred type.
    """
    inferred = column.inferred_type
    return inferred == "integer" or inferred == "mixed-integer"
Expand Down Expand Up @@ -1549,66 +1553,115 @@ def __init__(self, data, **kwargs) -> None:
self.data = self.data.fillna(value=0)

def _make_plot(self, fig: Figure) -> None:
"""
Draw a DataFrame line plot. For very wide frames (> 200 columns) that are
*not* time-series and have no stacking or error bars, all columns are
rendered with a single LineCollection for a large speed-up while keeping
public behaviour identical to the original per-column path.

GH#61764
"""
# decide once whether we can use the LineCollection fast draw
threshold = 200
use_collection = (
not self._is_ts_plot()
and not self.stacked
and not com.any_not_none(*self.errors.values())
and len(self.data.columns) > threshold
)

# choose ts-plot helper vs. regular helper
if self._is_ts_plot():
data = maybe_convert_index(self._get_ax(0), self.data)

x = data.index # dummy, not used
x = data.index # dummy; _ts_plot ignores it
plotf = self._ts_plot
it = data.items()
else:
x = self._get_xticks()
# error: Incompatible types in assignment (expression has type
# "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has
# type "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]")
plotf = self._plot # type: ignore[assignment]
# error: Incompatible types in assignment (expression has type
# "Iterator[tuple[Hashable, ndarray[Any, Any]]]", variable has
# type "Iterable[tuple[Hashable, Series]]")
it = self._iter_data(data=self.data) # type: ignore[assignment]

# shared state
stacking_id = self._get_stacking_id()
is_errorbar = com.any_not_none(*self.errors.values())

colors = self._get_colors()
segments: list[np.ndarray] = [] # vertices for LineCollection

# unified per-column loop
for i, (label, y) in enumerate(it):
ax = self._get_ax(i)
ax = self._get_ax(i if not use_collection else 0)

kwds = self.kwds.copy()
if self.color is not None:
kwds["color"] = self.color

style, kwds = self._apply_style_colors(
colors,
kwds,
i,
# error: Argument 4 to "_apply_style_colors" of "MPLPlot" has
# incompatible type "Hashable"; expected "str"
label, # type: ignore[arg-type]
)
kwds.update(self._get_errorbars(label=label, index=i))

label_str = self._mark_right_label(pprint_thing(label), index=i)
kwds["label"] = label_str

if use_collection:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm still not generally fond of having a different code path if some condition is met, especially since the condition requires a magic number threshold

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a reasonable concern. is there a downside to always using LineCollection?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke @jbrockmendel, if we want to completely get rid of the path split and the magic threshold number, I have to patch a few things:

  1. pandas/plotting/_matplotlib/core.py (LinePlot._make_plot)
    • Remove the current threshold (use_collection) condition.
    • Always render DataFrame line plots using a single LineCollection.
    • Add tiny proxy Line2D objects (invisible) to keep legends working as usual.
    • Stacked plots and error-bar plots remain unchanged (they use separate code paths already).

  2. pandas/plotting/_matplotlib/tools.py
    • Adjust get_all_lines to return segments from any existing LineCollection.
    (Needed for existing tests and autoscaling.)
    • Adjust get_xlim similarly to compute limits directly from the LineCollection vertices.

  3. pandas/tests/plotting/common.py and plotting tests
    • Update tests to handle the new structure. Instead of direct access like ax.lines[...], tests will use a helper function aware of the new single-collection setup.

  4. Documentation and Release Notes
    • Clearly note in docs/whatsnew that ax.lines will be empty for DataFrame line plots.
    • Users accessing line data directly should switch to pandas.plotting.get_all_lines(ax) or check ax.collections[0].

(No changes for Series plots or other plot types like scatter, area, bar, etc.)

Advantages:
• One simple and predictable rendering path for all DataFrame line plots.
• Significant speed-up for large DataFrames, negligible overhead for small DataFrames.
• Lower memory use (single artist instead of many) and easier future maintenance.

Potential Downsides (but manageable):
• Users relying on ax.lines[i] directly must adapt (addressed clearly in docs and deprecation shim).
• Interactive plots using “picker” callbacks may need minor code updates.
• A small batch of tests will need straightforward adjustments.

If you're comfortable with this, I can start an implementation. Are there any additional concerns I should keep in mind before coding?

Happy to iterate!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@EvMossan the unfortunate situation is that there aren't any maintainers with expertise in matplotlib, so the idea of reviewing everything you described is daunting. Is there a minimal version?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel, I’ve tried every variant I can think of, but I still can’t get a single-path implementation that both preserves the ~5× speed-up and passes the full test suite. At this point I’m stuck and would welcome any ideas or guidance.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and passes the full test suite

How many/bad are the failures we're looking at? e.g. no one really cares about ax.lines[0] or whatever as long as the graphs look right.

The ideas that come to mind are 1) convince @mroeschke to be OK with multiple code paths, 2) ask a matplotlib maintainer for help, 3) spend a lot of time on this myself, 4) decide the affected tests are OK to change.

I'm hoping that 4 is viable. Keep in mind that if we go that route, you're tacitly volunteering to have me ping you next time an issue comes up in this part of the code/tests.

# collect vertices; defer drawing
segments.append(np.column_stack((x, y)))

# tiny proxy only if legend is requested
if self.legend:
proxy = mpl.lines.Line2D(
[],
[],
color=kwds.get("color"),
linewidth=kwds.get(
"linewidth", mpl.rcParams["lines.linewidth"]
),
linestyle=kwds.get("linestyle", "-"),
marker=kwds.get("marker"),
)
self._append_legend_handles_labels(proxy, label_str)
else:
newlines = plotf(
ax,
x,
y,
style=style,
column_num=i,
stacking_id=stacking_id,
is_errorbar=is_errorbar,
**kwds,
)
self._append_legend_handles_labels(newlines[0], label_str)

errors = self._get_errorbars(label=label, index=i)
kwds = dict(kwds, **errors)
# reset x-limits for true ts plots
if self._is_ts_plot():
lines = get_all_lines(ax)
left, right = get_xlim(lines)
ax.set_xlim(left, right)

label = pprint_thing(label)
label = self._mark_right_label(label, index=i)
kwds["label"] = label

newlines = plotf(
ax,
x,
y,
style=style,
column_num=i,
stacking_id=stacking_id,
is_errorbar=is_errorbar,
**kwds,
# single draw call for fast path
if use_collection and segments:
if self.legend:
lc_colors = [
cast(mpl.lines.Line2D, h).get_color() # mypy: h is Line2D
for h in self.legend_handles
]
else:
# no legend - repeat default colour cycle
base = mpl.rcParams["axes.prop_cycle"].by_key()["color"]
lc_colors = list(itertools.islice(itertools.cycle(base), len(segments)))

lc = LineCollection(
segments,
colors=lc_colors,
linewidths=self.kwds.get("linewidth", mpl.rcParams["lines.linewidth"]),
)
self._append_legend_handles_labels(newlines[0], label)

if self._is_ts_plot():
# reset of xlim should be used for ts data
# TODO: GH28021, should find a way to change view limit on xaxis
lines = get_all_lines(ax)
left, right = get_xlim(lines)
ax.set_xlim(left, right)
ax0 = self._get_ax(0)
ax0.add_collection(lc)
ax0.margins(0.05)

# error: Signature of "_plot" incompatible with supertype "MPLPlot"
@classmethod
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/plotting/frame/test_linecollection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
Ensure wide DataFrame.line plots use a single LineCollection
instead of one Line2D per column (GH #61764).
"""

import numpy as np
import pytest

import pandas as pd

# Skip this entire module if matplotlib is not installed
mpl = pytest.importorskip("matplotlib")
plt = pytest.importorskip("matplotlib.pyplot")
from matplotlib.collections import LineCollection


def test_linecollection_used_for_wide_dataframe():
    """
    A 201-column frame plotted without a legend is drawn as exactly one
    LineCollection and adds no Line2D artists to the axes.
    """
    rng = np.random.default_rng(0)
    # 10 rows x 201 columns of cumulative sums of standard-normal draws
    df = pd.DataFrame(rng.standard_normal((10, 201)).cumsum(axis=0))

    ax = df.plot(legend=False)
    try:
        # exactly one LineCollection, and no Line2D artists
        line_collections = [
            artist for artist in ax.collections if isinstance(artist, LineCollection)
        ]
        assert len(line_collections) == 1
        assert len(ax.lines) == 0
    finally:
        # close the figure even when an assertion fails, so it does not
        # stay open and leak into subsequent tests
        plt.close(ax.figure)
Loading