Delay import of sklearn and ijson (#40)

xadupre · web-flow · commit 9753f32b958c · 2024-07-15T11:30:53.000+02:00
* delay import of sklearn and ijson

* ruff

* remove rstcheck

* remove circleci

* complex: drop the numpy.float_ and numpy.complex_ aliases from numpy_types()

* fix documentation
diff --git a/.circleci/config.yml b/.circleci/config.yml
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
@@ -77,7 +77,7 @@ jobs:
             grep ERROR doc.txt
             exit 1
           fi
-          if [[ $(grep WARNING doc.txt) ]]; then
+          if [[ $(grep WARNING doc.txt | grep -v 'std:term:y') ]]; then
             echo "Documentation produces warnings."
             grep WARNING doc.txt
             exit 1
diff --git a/.github/workflows/rstcheck.yml b/.github/workflows/rstcheck.yml
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -24,7 +24,7 @@ jobs:
   - script: pip install -r requirements-dev.txt
     displayName: 'Install Requirements dev'
   - script: |
-      ruff .
+      ruff check .
     displayName: 'Ruff'
   - script: |
       black --diff .
@@ -76,11 +76,8 @@ jobs:
   - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn
     displayName: 'Install scikit-learn nightly'
   - script: |
-      ruff .
+      ruff check .
     displayName: 'Ruff'
-  - script: |
-      rstcheck -r ./_doc ./pandas_streaming
-    displayName: 'rstcheck'
   - script: |
       black --diff .
     displayName: 'Black'
@@ -117,11 +114,8 @@ jobs:
   - script: pip install -r requirements-dev.txt
     displayName: 'Install Requirements dev'
   - script: |
-      ruff .
+      ruff check .
     displayName: 'Ruff'
-  - script: |
-      rstcheck -r ./_doc ./pandas_streaming
-    displayName: 'rstcheck'
   - script: |
       black --diff .
     displayName: 'Black'
diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py
@@ -2,7 +2,6 @@
 from logging import getLogger
 import pandas
 import numpy
-from sklearn.model_selection import train_test_split
 from .dataframe_helpers import dataframe_shuffle
 
 logger = getLogger("pandas-streaming")
@@ -61,6 +60,8 @@ def train_test_split_weights(
             raise ValueError(
                 f"test_size={test_size} or train_size={train_size} cannot be null (1)."
             )
+        from sklearn.model_selection import train_test_split
+
         return train_test_split(
             df, test_size=test_size, train_size=train_size, random_state=random_state
         )
diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py
@@ -640,10 +640,10 @@ def _reservoir_sampling(
                 if len(indices) < n:
                     indices.append((i, ir))
                 else:
-                    x = nrandom.random()  # pylint: disable=E1101
+                    x = nrandom.random()
                     if x * n < (seen - n):
                         k = nrandom.randint(0, len(indices) - 1)
-                        indices[k] = (i, ir)  # pylint: disable=E1126
+                        indices[k] = (i, ir)
         indices = set(indices)
 
         def reservoir_iterate(sdf, indices, chunksize):
diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py
@@ -25,11 +25,9 @@ def numpy_types():
         numpy.uint16,
         numpy.uint32,
         numpy.uint64,
-        numpy.float_,
         numpy.float16,
         numpy.float32,
         numpy.float64,
-        numpy.complex_,
         numpy.complex64,
         numpy.complex128,
     ]
@@ -155,13 +153,13 @@ def hash_floatl(c):
     }  # pylint: disable=R1721
     for c in cols:
         t = coltype[c]
-        if t == int:
+        if t == int:  # noqa: E721
             df[c] = df[c].apply(hash_intl)
         elif t == numpy.int64:
             df[c] = df[c].apply(lambda x: numpy.int64(hash_intl(x)))
-        elif t == float:
+        elif t == float:  # noqa: E721
             df[c] = df[c].apply(hash_floatl)
-        elif t == object:
+        elif t == object:  # noqa: E721
             df[c] = df[c].apply(hash_strl)
         else:
             raise NotImplementedError(  # pragma: no cover
diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py
@@ -5,7 +5,6 @@
     from ujson import dumps
 except ImportError:  # pragma: no cover
     from json import dumps
-import ijson
 
 
 class JsonPerRowsStream:
@@ -257,6 +256,8 @@ def enumerate_json_items(
     else:
         if hasattr(filename, "seek"):
             filename.seek(0)
+        import ijson
+
         parser = ijson.parse(filename)
         current = None
         curkey = None
diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py
@@ -45,7 +45,7 @@ def sklearn_train_test_split(
         )
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=ImportWarning)
-        from sklearn.model_selection import train_test_split  # pylint: disable=C0415
+        from sklearn.model_selection import train_test_split
 
     opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"]
     split_ops = {}
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,17 +1,3 @@
-[tool.rstcheck]
-report_level = "INFO"
-ignore_directives = [
-    "autoclass",
-    "autofunction",
-    "automodule",
-    "exreflist",
-    "gdot",
-    "image-sg",
-    "pr",
-    "runpython",
-]
-ignore_roles = ["epkg"]
-
 [tool.ruff]
 
 # Exclude a variety of commonly ignored directories.
@@ -25,11 +11,11 @@ exclude = [
 # Same as Black.
 line-length = 88
 
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
 max-complexity = 10
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 "_doc/examples/plot_first_example.py" = ["E402", "F811"]
 "_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"]
 "pandas_streaming/data/__init__.py" = ["F401"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -14,7 +14,6 @@ pycodestyle
 pylint>=2.14.0
 pytest
 pytest-cov
-rstcheck[sphinx,toml]
 ruff
 scikit-learn
 scipy

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def sklearn_train_test_split(`
`45`	`45`	`)`
`46`	`46`	`with warnings.catch_warnings():`
`47`	`47`	`warnings.filterwarnings("ignore", category=ImportWarning)`
`48`		`- from sklearn.model_selection import train_test_split # pylint: disable=C0415`
	`48`	`+ from sklearn.model_selection import train_test_split`
`49`	`49`
`50`	`50`	`opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"]`
`51`	`51`	`split_ops = {}`