From 3f2fd8d53da13a0278a7540edc6ffa68baab74be Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 11:12:22 -0500 Subject: [PATCH 01/11] Use sumprod() to speed-up and improve accuracy of correlation() --- Lib/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 7d5d750193a5ab..69dc33d931b4ff 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,7 +134,7 @@ from fractions import Fraction from decimal import Decimal -from itertools import count, groupby, repeat +from itertools import count, groupby, repeat, tee from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from functools import reduce @@ -1076,9 +1076,9 @@ def correlation(x, y, /, *, method='linear'): y = _rank(y, start=start) xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((d := xi - xbar) * d for xi in x) - syy = fsum((d := yi - ybar) * d for yi in y) + sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) + sxx = sumprod(*tee(xi - xbar for xi in x)) + syy = sumprod(*tee(yi - ybar for yi in y)) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: From 3241045212b341ca3396eff48b5db1ddce75efed Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 11:30:00 -0500 Subject: [PATCH 02/11] Only compute differences once --- Lib/statistics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 69dc33d931b4ff..8340829c7fc8f5 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,7 +134,7 @@ from fractions import Fraction from decimal import Decimal -from itertools import count, groupby, repeat, tee +from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from functools import reduce @@ -1076,9 +1076,11 @@ def correlation(x, y, /, *, method='linear'): y = _rank(y, start=start) xbar = fsum(x) / n ybar = fsum(y) / n - sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) - sxx = sumprod(*tee(xi - xbar for xi in x)) - syy = sumprod(*tee(yi - ybar for yi in y)) + centered_x = [xi - xbar for xi in x] + centered_y = [yi - ybar for yi in y] + sxy = sumprod(centered_x, centered_y) + sxx = sumprod(centered_x, centered_x) + syy = sumprod(centered_y, centered_y) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: From 52c75847b952c91c948435969fc3dad79567e55a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 11:40:36 -0500 Subject: [PATCH 03/11] Use sumprod() in covariance(). --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8340829c7fc8f5..bf182f0234cb93 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1036,7 +1036,7 @@ def covariance(x, y, /): raise StatisticsError('covariance requires at least two data points') xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) + sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) return sxy / (n - 1) From 16b2745301a6212dd9eb0cce646255fc35bccb9d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:02:41 -0500 Subject: [PATCH 04/11] Use sumprod() in linear_regression(). --- Lib/statistics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index bf182f0234cb93..f153cbd4453676 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1134,13 +1134,15 @@ def linear_regression(x, y, /, *, proportional=False): if n < 2: raise StatisticsError('linear regression requires at least two data points') if proportional: - sxy = fsum(xi * yi for xi, yi in zip(x, y)) - sxx = fsum(xi * xi for xi in x) + sxy = sumprod(x, y) + sxx = sumprod(x, x) else: xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((d := xi - xbar) * d for xi in x) + centered_x = [xi - xbar for xi in x] + centered_y = (yi - ybar for yi in y) + sxy = sumprod(centered_x, centered_y) + sxx = sumprod(centered_x, centered_x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: From b9eeee536d939d0b6c2165fb0ac92ce1a412a718 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:18:15 -0500 Subject: [PATCH 05/11] Can skip centering if xbar or ybar is already zero --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index f153cbd4453676..29746e3d8db9c3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1076,8 +1076,8 @@ def correlation(x, y, /, *, method='linear'): y = _rank(y, start=start) xbar = fsum(x) / n ybar = fsum(y) / n - centered_x = [xi - xbar for xi in x] - centered_y = [yi - ybar for yi in y] + centered_x = [xi - xbar for xi in x] if xbar else x + centered_y = [yi - ybar for yi in y] if ybar else y sxy = sumprod(centered_x, centered_y) sxx = sumprod(centered_x, centered_x) syy = sumprod(centered_y, centered_y) From c51e520b7c8f9f8e14ca6b2537de46f8e42cada2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:22:04 -0500 Subject: [PATCH 06/11] Factor-out common logic in linear_regression --- Lib/statistics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 29746e3d8db9c3..cfd0819b4e5f51 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1074,13 +1074,14 @@ def correlation(x, y, /, *, method='linear'): start = (n - 1) / -2 # Center rankings around zero x = _rank(x, start=start) y = _rank(y, start=start) - xbar = fsum(x) / n - ybar = fsum(y) / n - centered_x = [xi - xbar for xi in x] if xbar else x - centered_y = [yi - ybar for yi in y] if ybar else y - sxy = sumprod(centered_x, centered_y) - sxx = sumprod(centered_x, centered_x) - syy = sumprod(centered_y, centered_y) + else: + xbar = fsum(x) / n + ybar = fsum(y) / n + x = [xi - xbar for xi in x] + y = [yi - ybar for yi in y] + sxy = sumprod(x, y) + sxx = sumprod(x, x) + syy = sumprod(y, y) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: From 39be9f8ec8f576ddb61eb4b5fa5b7366edab5443 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:25:22 -0500 Subject: [PATCH 07/11] Factor-out common logic in linear_regression --- Lib/statistics.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index cfd0819b4e5f51..2571d5dd02cf3b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1134,16 +1134,13 @@ def linear_regression(x, y, /, *, proportional=False): raise StatisticsError('linear regression requires that both inputs have same number of data points') if n < 2: raise StatisticsError('linear regression requires at least two data points') - if proportional: - sxy = sumprod(x, y) - sxx = sumprod(x, x) - else: + if not proportional: xbar = fsum(x) / n ybar = fsum(y) / n - centered_x = [xi - xbar for xi in x] - centered_y = (yi - ybar for yi in y) - sxy = sumprod(centered_x, centered_y) - sxx = sumprod(centered_x, centered_x) + x = [xi - xbar for xi in x] + y = [yi - ybar for yi in y] + sxy = sumprod(x, y) + sxx = sumprod(x, x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: From a4486157c189a61d67cec6a25463bb60cb8f26d8 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:28:23 -0500 Subject: [PATCH 08/11] Use generator for y array in linear_regression() --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 2571d5dd02cf3b..9cb76f5562387b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1137,8 +1137,8 @@ def linear_regression(x, y, /, *, proportional=False): if not proportional: xbar = fsum(x) / n ybar = fsum(y) / n - x = [xi - xbar for xi in x] - y = [yi - ybar for yi in y] + x = [xi - xbar for xi in x] # List because used three times below + y = (yi - ybar for yi in y) # Generator because only used once below sxy = sumprod(x, y) sxx = sumprod(x, x) try: From c9a4e4d39acb2dd607b3ddffa3d539f772f5ab86 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 15:48:28 -0500 Subject: [PATCH 09/11] Coerce result to a float --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 9cb76f5562387b..6bd214bbfe2ff5 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1139,7 +1139,7 @@ def linear_regression(x, y, /, *, proportional=False): ybar = fsum(y) / n x = [xi - xbar for xi in x] # List because used three times below y = (yi - ybar for yi in y) # Generator because only used once below - sxy = sumprod(x, y) + sxy = sumprod(x, y) + 0.0 # Add zero to coerce result to a float sxx = sumprod(x, x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) From 5fe1be5898a0e649be8bb4c8127c05ef5d642b30 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 17:42:18 -0500 Subject: [PATCH 10/11] Add test for the result type being a float. --- Lib/test/test_statistics.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 31a3cb6b53a6f2..f0fa6454b1f91a 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1,4 +1,4 @@ -"""Test suite for statistics module, including helper NumericTestCase and +x = """Test suite for statistics module, including helper NumericTestCase and approx_equal function. """ @@ -2610,6 +2610,16 @@ def test_proportional(self): self.assertAlmostEqual(slope, 20 + 1/150) self.assertEqual(intercept, 0.0) + def test_float_output(self): + x = [Fraction(2, 3), Fraction(3, 4)] + y = [Fraction(4, 5), Fraction(5, 6)] + slope, intercept = statistics.linear_regression(x, y) + self.assertTrue(isinstance(slope, float)) + self.assertTrue(isinstance(intercept, float)) + slope, intercept = statistics.linear_regression(x, y, proportional=True) + self.assertTrue(isinstance(slope, float)) + self.assertTrue(isinstance(intercept, float)) + class TestNormalDist: # General note on precision: The pdf(), cdf(), and overlap() methods From 45ffe7e100011b92c0fb073cb4ca6b71f9b52de9 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 18:27:20 -0500 Subject: [PATCH 11/11] Add blurb --- .../next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst diff --git a/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst b/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst new file mode 100644 index 00000000000000..3de09f86754f3e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst @@ -0,0 +1,2 @@ +Optimized fmean(), correlation(), covariance(), and linear_regression() +using the new math.sumprod() function. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy