Fix for DataFrameClient issue - DatetimeIndex dates do not seem to be processed correctly (issue influxdata#479) (influxdata#495)

patrickhoebeke · xginn8 · commit bf232a7aef9e · 2017-11-25T11:31:19.000-05:00
* [FIX] : compatibility with new versions of pandas: pd.tseries.period.PeriodIndex has been moved to pd.PeriodIndex since at least pandas 0.18.1; pd.tseries.period.DatetimeIndex has been moved to pd.DatetimeIndex since at least pandas 0.18.1 * [FIX] : Fixes influxdata#479 : DatetimeIndex not correctly converted to Unix Epoch (e.g. on (some?) Windows machines) * [FIX] : new fix for influxdata#479 : DatetimeIndex not correctly converted to Unix Epoch (e.g. on (some?) Windows machines) * [ENH] : added feature : DataFrame.write_points : NaNs and None values allowed in input DataFrame (corresponding entries are removed from the list of points to push to Influx) * [FIX] : error in unittest dataframe_client test_write_points_from_dataframe_with_all_none
diff --git a/influxdb/_dataframe_client.py b/influxdb/_dataframe_client.py
@@ -10,6 +10,7 @@
 from collections import defaultdict
 
 import pandas as pd
+import numpy as np
 
 from .client import InfluxDBClient
 from .line_protocol import _escape_tag
@@ -257,7 +258,7 @@ def _convert_dataframe_to_json(dataframe,
             {'measurement': measurement,
              'tags': dict(list(tag.items()) + list(tags.items())),
              'fields': rec,
-             'time': int(ts.value / precision_factor)}
+             'time': np.int64(ts.value / precision_factor)}
             for ts, tag, rec in zip(dataframe.index,
                                     dataframe[tag_columns].to_dict('record'),
                                     dataframe[field_columns].to_dict('record'))
@@ -274,6 +275,10 @@ def _convert_dataframe_to_lines(self,
                                     time_precision=None,
                                     numeric_precision=None):
 
+        dataframe = dataframe.dropna(how='all').copy()
+        if len(dataframe) == 0:
+            return []
+
         if not isinstance(dataframe, pd.DataFrame):
             raise TypeError('Must be DataFrame, but type was: {0}.'
                             .format(type(dataframe)))
@@ -319,11 +324,11 @@ def _convert_dataframe_to_lines(self,
 
         # Make array of timestamp ints
         if isinstance(dataframe.index, pd.PeriodIndex):
-            time = ((dataframe.index.to_timestamp().values.astype(int) /
-                     precision_factor).astype(int).astype(str))
+            time = ((dataframe.index.to_timestamp().values.astype(np.int64) /
+                     precision_factor).astype(np.int64).astype(str))
         else:
-            time = ((pd.to_datetime(dataframe.index).values.astype(int) /
-                     precision_factor).astype(int).astype(str))
+            time = ((pd.to_datetime(dataframe.index).values.astype(np.int64) /
+                     precision_factor).astype(np.int64).astype(str))
 
         # If tag columns exist, make an array of formatted tag keys and values
         if tag_columns:
@@ -357,12 +362,16 @@ def _convert_dataframe_to_lines(self,
 
         # Make an array of formatted field keys and values
         field_df = dataframe[field_columns]
+
         field_df = self._stringify_dataframe(field_df,
                                              numeric_precision,
                                              datatype='field')
-        field_df = (field_df.columns.values + '=').tolist() + field_df
-        field_df[field_df.columns[1:]] = ',' + field_df[field_df.columns[1:]]
-        fields = field_df.sum(axis=1)
+
+        def format_line(line):
+            line = line[~line.isnull()]  # drop None entries
+            return ",".join((line.index + '=' + line.values))
+
+        fields = field_df.apply(format_line, axis=1)
         del field_df
 
         # Generate line protocol string
@@ -371,6 +380,13 @@ def _convert_dataframe_to_lines(self,
 
     @staticmethod
     def _stringify_dataframe(dframe, numeric_precision, datatype='field'):
+
+        # Prevent modification of input dataframe
+        dframe = dframe.copy()
+
+        # Keep the positions where Null values are found
+        mask_null = dframe.isnull().values
+
         # Find int and string columns for field-type data
         int_columns = dframe.select_dtypes(include=['integer']).columns
         string_columns = dframe.select_dtypes(include=['object']).columns
@@ -414,6 +430,8 @@ def _stringify_dataframe(dframe, numeric_precision, datatype='field'):
             dframe = dframe.apply(_escape_pandas_series)
 
         dframe.columns = dframe.columns.astype(str)
+
+        dframe = dframe.where(~mask_null, None)
         return dframe
 
     def _datetime_to_epoch(self, datetime, time_precision='s'):
diff --git a/influxdb/tests/dataframe_client_test.py b/influxdb/tests/dataframe_client_test.py
@@ -59,6 +59,81 @@ def test_write_points_from_dataframe(self):
             cli.write_points(dataframe, 'foo', tags=None)
             self.assertEqual(m.last_request.body, expected)
 
+    def test_write_points_from_dataframe_with_none(self):
+        """Test write points from df in TestDataFrameClient object."""
+        now = pd.Timestamp('1970-01-01 00:00+00:00')
+        dataframe = pd.DataFrame(data=[["1", None, 1.0], ["2", 2.0, 2.0]],
+                                 index=[now, now + timedelta(hours=1)],
+                                 columns=["column_one", "column_two",
+                                          "column_three"])
+        expected = (
+            b"foo column_one=\"1\",column_three=1.0 0\n"
+            b"foo column_one=\"2\",column_two=2.0,column_three=2.0 "
+            b"3600000000000\n"
+        )
+
+        with requests_mock.Mocker() as m:
+            m.register_uri(requests_mock.POST,
+                           "http://localhost:8086/write",
+                           status_code=204)
+
+            cli = DataFrameClient(database='db')
+
+            cli.write_points(dataframe, 'foo')
+            self.assertEqual(m.last_request.body, expected)
+
+            cli.write_points(dataframe, 'foo', tags=None)
+            self.assertEqual(m.last_request.body, expected)
+
+    def test_write_points_from_dataframe_with_line_of_none(self):
+        """Test write points from df in TestDataFrameClient object."""
+        now = pd.Timestamp('1970-01-01 00:00+00:00')
+        dataframe = pd.DataFrame(data=[[None, None, None], ["2", 2.0, 2.0]],
+                                 index=[now, now + timedelta(hours=1)],
+                                 columns=["column_one", "column_two",
+                                          "column_three"])
+        expected = (
+            b"foo column_one=\"2\",column_two=2.0,column_three=2.0 "
+            b"3600000000000\n"
+        )
+
+        with requests_mock.Mocker() as m:
+            m.register_uri(requests_mock.POST,
+                           "http://localhost:8086/write",
+                           status_code=204)
+
+            cli = DataFrameClient(database='db')
+
+            cli.write_points(dataframe, 'foo')
+            self.assertEqual(m.last_request.body, expected)
+
+            cli.write_points(dataframe, 'foo', tags=None)
+            self.assertEqual(m.last_request.body, expected)
+
+    def test_write_points_from_dataframe_with_all_none(self):
+        """Test write points from df in TestDataFrameClient object."""
+        now = pd.Timestamp('1970-01-01 00:00+00:00')
+        dataframe = pd.DataFrame(data=[[None, None, None], [None, None, None]],
+                                 index=[now, now + timedelta(hours=1)],
+                                 columns=["column_one", "column_two",
+                                          "column_three"])
+        expected = (
+            b"\n"
+        )
+
+        with requests_mock.Mocker() as m:
+            m.register_uri(requests_mock.POST,
+                           "http://localhost:8086/write",
+                           status_code=204)
+
+            cli = DataFrameClient(database='db')
+
+            cli.write_points(dataframe, 'foo')
+            self.assertEqual(m.last_request.body, expected)
+
+            cli.write_points(dataframe, 'foo', tags=None)
+            self.assertEqual(m.last_request.body, expected)
+
     def test_write_points_from_dataframe_in_batches(self):
         """Test write points in batch from df in TestDataFrameClient object."""
         now = pd.Timestamp('1970-01-01 00:00+00:00')