From 21a3b2f90b824ea9d7d5cad3a1048c765b898aed Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 28 Feb 2022 14:27:00 -0800 Subject: [PATCH] DOC: Remove computation.rst in favor of better docstrings (#46170) * DOC: Remove computation.rst in favor of better docstrings: * Remove other ref --- doc/source/user_guide/computation.rst | 212 -------------------------- doc/source/user_guide/index.rst | 1 - doc/source/user_guide/window.rst | 14 +- doc/source/whatsnew/v0.6.0.rst | 2 +- doc/source/whatsnew/v0.6.1.rst | 4 +- doc/source/whatsnew/v0.8.0.rst | 2 +- pandas/core/frame.py | 40 ++++- pandas/core/generic.py | 14 +- pandas/core/series.py | 10 +- 9 files changed, 74 insertions(+), 225 deletions(-) delete mode 100644 doc/source/user_guide/computation.rst diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst deleted file mode 100644 index 6007129e96ba0..0000000000000 --- a/doc/source/user_guide/computation.rst +++ /dev/null @@ -1,212 +0,0 @@ -.. _computation: - -{{ header }} - -Computational tools -=================== - - -Statistical functions ---------------------- - -.. _computation.pct_change: - -Percent change -~~~~~~~~~~~~~~ - -``Series`` and ``DataFrame`` have a method -:meth:`~DataFrame.pct_change` to compute the percent change over a given number -of periods (using ``fill_method`` to fill NA/null values *before* computing -the percent change). - -.. ipython:: python - - ser = pd.Series(np.random.randn(8)) - - ser.pct_change() - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(10, 4)) - - df.pct_change(periods=3) - -.. _computation.covariance: - -Covariance -~~~~~~~~~~ - -:meth:`Series.cov` can be used to compute covariance between series -(excluding missing values). - -.. 
ipython:: python - - s1 = pd.Series(np.random.randn(1000)) - s2 = pd.Series(np.random.randn(1000)) - s1.cov(s2) - -Analogously, :meth:`DataFrame.cov` to compute pairwise covariances among the -series in the DataFrame, also excluding NA/null values. - -.. _computation.covariance.caveats: - -.. note:: - - Assuming the missing data are missing at random this results in an estimate - for the covariance matrix which is unbiased. However, for many applications - this estimate may not be acceptable because the estimated covariance matrix - is not guaranteed to be positive semi-definite. This could lead to - estimated correlations having absolute values which are greater than one, - and/or a non-invertible covariance matrix. See `Estimation of covariance - matrices <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`_ - for more details. - -.. ipython:: python - - frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) - frame.cov() - -``DataFrame.cov`` also supports an optional ``min_periods`` keyword that -specifies the required minimum number of observations for each column pair -in order to have a valid result. - -.. ipython:: python - - frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) - frame.loc[frame.index[:5], "a"] = np.nan - frame.loc[frame.index[5:10], "b"] = np.nan - - frame.cov() - - frame.cov(min_periods=12) - - -.. _computation.correlation: - -Correlation -~~~~~~~~~~~ - -Correlation may be computed using the :meth:`~DataFrame.corr` method. -Using the ``method`` parameter, several methods for computing correlations are -provided: - -.. csv-table:: - :header: "Method name", "Description" - :widths: 20, 80 - - ``pearson (default)``, Standard correlation coefficient - ``kendall``, Kendall Tau correlation coefficient - ``spearman``, Spearman rank correlation coefficient - -.. \rho = \cov(x, y) / \sigma_x \sigma_y - -All of these are currently computed using pairwise complete observations. 
-Wikipedia has articles covering the above correlation coefficients: - -* `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ -* `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ -* `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ - -.. note:: - - Please see the :ref:`caveats <computation.covariance.caveats>` associated - with this method of calculating correlation matrices in the - :ref:`covariance section <computation.covariance>`. - -.. ipython:: python - - frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) - frame.iloc[::2] = np.nan - - # Series with Series - frame["a"].corr(frame["b"]) - frame["a"].corr(frame["b"], method="spearman") - - # Pairwise correlation of DataFrame columns - frame.corr() - -Note that non-numeric columns will be automatically excluded from the -correlation calculation. - -Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: - -.. ipython:: python - - frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) - frame.loc[frame.index[:5], "a"] = np.nan - frame.loc[frame.index[5:10], "b"] = np.nan - - frame.corr() - - frame.corr(min_periods=12) - - -The ``method`` argument can also be a callable for a generic correlation -calculation. In this case, it should be a single function -that produces a single value from two ndarray inputs. Suppose we wanted to -compute the correlation based on histogram intersection: - -.. ipython:: python - - # histogram intersection - def histogram_intersection(a, b): - return np.minimum(np.true_divide(a, a.sum()), np.true_divide(b, b.sum())).sum() - - - frame.corr(method=histogram_intersection) - -A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to -compute the correlation between like-labeled Series contained in different -DataFrame objects. - -.. 
ipython:: python - - index = ["a", "b", "c", "d", "e"] - columns = ["one", "two", "three", "four"] - df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) - df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) - df1.corrwith(df2) - df2.corrwith(df1, axis=1) - -.. _computation.ranking: - -Data ranking -~~~~~~~~~~~~ - -The :meth:`~Series.rank` method produces a data ranking with ties being -assigned the mean of the ranks (by default) for the group: - -.. ipython:: python - - s = pd.Series(np.random.randn(5), index=list("abcde")) - s["d"] = s["b"] # so there's a tie - s.rank() - -:meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows -(``axis=0``) or the columns (``axis=1``). ``NaN`` values are excluded from the -ranking. - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(10, 6)) - df[4] = df[2][:5] # some ties - df - df.rank(1) - -``rank`` optionally takes a parameter ``ascending`` which by default is true; -when false, data is reverse-ranked, with larger values assigned a smaller rank. - -``rank`` supports different tie-breaking methods, specified with the ``method`` -parameter: - - - ``average`` : average rank of tied group - - ``min`` : lowest rank in the group - - ``max`` : highest rank in the group - - ``first`` : ranks assigned in the order they appear in the array - -.. _computation.windowing: - -Windowing functions -~~~~~~~~~~~~~~~~~~~ - -See :ref:`the window operations user guide ` for an overview of windowing functions. 
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 59c9a9afb7f95..a6392706eb7a3 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -76,7 +76,6 @@ Guides boolean visualization style - computation groupby window timeseries diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index d1244f62cc1e4..f8c1f89be5d41 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -427,10 +427,16 @@ can even be omitted: .. note:: Missing values are ignored and each entry is computed using the pairwise - complete observations. Please see the :ref:`covariance section - <computation.covariance>` for :ref:`caveats - <computation.covariance.caveats>` associated with this method of - calculating covariance and correlation matrices. + complete observations. + + Assuming the missing data are missing at random this results in an estimate + for the covariance matrix which is unbiased. However, for many applications + this estimate may not be acceptable because the estimated covariance matrix + is not guaranteed to be positive semi-definite. This could lead to + estimated correlations having absolute values which are greater than one, + and/or a non-invertible covariance matrix. See `Estimation of covariance + matrices <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`_ + for more details. .. 
ipython:: python diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 19e2e85c09a87..5ddcd5d90e65c 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -24,7 +24,7 @@ New features - :ref:`Added ` multiple levels to groupby (:issue:`103`) - :ref:`Allow ` multiple columns in ``by`` argument of ``DataFrame.sort_index`` (:issue:`92`, :issue:`362`) - :ref:`Added ` fast ``get_value`` and ``put_value`` methods to DataFrame (:issue:`360`) -- :ref:`Added ` ``cov`` instance methods to Series and DataFrame (:issue:`194`, :issue:`362`) +- Added ``cov`` instance methods to Series and DataFrame (:issue:`194`, :issue:`362`) - :ref:`Added ` ``kind='bar'`` option to ``DataFrame.plot`` (:issue:`348`) - :ref:`Added ` ``idxmin`` and ``idxmax`` to Series and DataFrame (:issue:`286`) - :ref:`Added ` ``read_clipboard`` function to parse DataFrame from clipboard (:issue:`300`) diff --git a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst index 4e72a630ad9f1..58a7d1ee13278 100644 --- a/doc/source/whatsnew/v0.6.1.rst +++ b/doc/source/whatsnew/v0.6.1.rst @@ -7,7 +7,7 @@ Version 0.6.1 (December 13, 2011) New features ~~~~~~~~~~~~ - Can append single rows (as Series) to a DataFrame -- Add Spearman and Kendall rank :ref:`correlation ` +- Add Spearman and Kendall rank correlation options to Series.corr and DataFrame.corr (:issue:`428`) - :ref:`Added ` ``get_value`` and ``set_value`` methods to Series, DataFrame, and Panel for very low-overhead access (>2x faster in many @@ -19,7 +19,7 @@ New features - Implement new :ref:`SparseArray ` and ``SparseList`` data structures. 
SparseSeries now derives from SparseArray (:issue:`463`) - :ref:`Better console printing options ` (:issue:`453`) -- Implement fast :ref:`data ranking <computation.ranking>` for Series and +- Implement fast data ranking for Series and DataFrame, fast versions of scipy.stats.rankdata (:issue:`428`) - Implement ``DataFrame.from_items`` alternate constructor (:issue:`444`) diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 490175914cef1..ce02525a69ace 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -145,7 +145,7 @@ Other new features - Add :ref:`'kde' ` plot option for density plots - Support for converting DataFrame to R data.frame through rpy2 - Improved support for complex numbers in Series and DataFrame -- Add :ref:`pct_change <computation.pct_change>` method to all data structures +- Add ``pct_change`` method to all data structures - Add max_colwidth configuration option for DataFrame console output - :ref:`Interpolate ` Series values using index values - Can select multiple columns from GroupBy diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cf1988808bbb0..9d17827d55951 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9592,6 +9592,14 @@ def corr( DataFrame or Series. Series.corr : Compute the correlation between two Series. + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. + + * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ + * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ + * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ + Examples -------- >>> def histogram_intersection(a, b): @@ -9603,7 +9611,14 @@ def corr( dogs cats dogs 1.0 0.3 cats 0.3 1.0 - """ + + >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], + ... 
columns=['dogs', 'cats']) + >>> df.corr(min_periods=3) + dogs cats + dogs 1.0 NaN + cats NaN 1.0 + """ # noqa:E501 numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() @@ -9797,7 +9812,28 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie See Also -------- DataFrame.corr : Compute pairwise correlation of columns. - """ + + Examples + -------- + >>> index = ["a", "b", "c", "d", "e"] + >>> columns = ["one", "two", "three", "four"] + >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) + >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) + >>> df1.corrwith(df2) + one 1.0 + two 1.0 + three 1.0 + four 1.0 + dtype: float64 + + >>> df2.corrwith(df1, axis=1) + a 1.0 + b 1.0 + c 1.0 + d 1.0 + e NaN + dtype: float64 + """ # noqa:E501 axis = self._get_axis_number(axis) this = self._get_numeric_data() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9a9697b201b43..fc682b848b054 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8522,6 +8522,18 @@ def rank( 3 spider 8.0 4 snake NaN + Ties are assigned the mean of the ranks (by default) for the group. + + >>> s = pd.Series(range(5), index=list("abcde")) + >>> s["d"] = s["b"] + >>> s.rank() + a 1.0 + b 2.5 + c 4.0 + d 2.5 + e 5.0 + dtype: float64 + The following example shows how the method behaves with the above parameters: @@ -10251,7 +10263,7 @@ def pct_change( periods : int, default 1 Periods to shift for forming percent change. fill_method : str, default 'pad' - How to handle NAs before computing percent changes. + How to handle NAs **before** computing percent changes. limit : int, default None The number of consecutive NAs to fill before stopping. 
freq : DateOffset, timedelta, or str, optional diff --git a/pandas/core/series.py b/pandas/core/series.py index e565e124ac7f9..78f353ff8c70c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2566,6 +2566,14 @@ def corr(self, other, method="pearson", min_periods=None) -> float: DataFrame.corrwith : Compute pairwise correlation with another DataFrame or Series. + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. + + * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ + * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ + * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ + Examples -------- >>> def histogram_intersection(a, b): @@ -2575,7 +2583,7 @@ def corr(self, other, method="pearson", min_periods=None) -> float: >>> s2 = pd.Series([.3, .6, .0, .1]) >>> s1.corr(s2, method=histogram_intersection) 0.3 - """ + """ # noqa:E501 this, other = self.align(other, join="inner", copy=False) if len(this) == 0: return np.nan