Add LRU eviction with 1gb memory limit for PandasData #392

Merged: 15 commits, Mar 12, 2024
74 changes: 42 additions & 32 deletions lumibot/backtesting/polygon_backtesting.py
@@ -1,6 +1,6 @@
import logging
import traceback
from collections import defaultdict
from collections import defaultdict, OrderedDict
from datetime import date, timedelta

from polygon import RESTClient
@@ -20,6 +20,10 @@ class PolygonDataBacktesting(PandasData):
Backtesting implementation of Polygon
"""

# Size limit for the pandas_data and _data_store (dicts of Pandas DataFrames) in bytes.
# Set to None to disable the limit.
MAX_STORAGE_BYTES = None

def __init__(
self,
datetime_start,
@@ -37,7 +41,17 @@ def __init__(
# RESTClient API for Polygon.io polygon-api-client
self.polygon_client = RESTClient(self._api_key)

def update_pandas_data(self, asset, quote, length, timestep, start_dt=None):
@staticmethod
def _enforce_storage_limit(pandas_data: OrderedDict):
storage_used = sum(data.df.memory_usage().sum() for data in pandas_data.values())
logging.info(f"{storage_used = :,} bytes for {len(pandas_data)} items")
while storage_used > PolygonDataBacktesting.MAX_STORAGE_BYTES:
k, d = pandas_data.popitem(last=False)
mu = d.df.memory_usage().sum()
storage_used -= mu
logging.info(f"Storage limit exceeded. Evicted LRU data: {k} used {mu:,} bytes")

def _update_pandas_data(self, asset, quote, length, timestep, start_dt=None, update_data_store=False):
"""
Get asset data and update the self.pandas_data dictionary.

@@ -51,11 +65,12 @@ def update_pandas_data(self, asset, quote, length, timestep, start_dt=None):
The number of data points to get.
timestep : str
The timestep to use. For example, "1minute" or "1hour" or "1day".

Returns
-------
dict
A dictionary with the keys being the asset and the values being the PandasData objects.
start_dt : datetime
The start datetime to use. If None, the current self.start_datetime will be used.
update_data_store : bool
If True, the data will also be added to the self._data_store dictionary.
That update will not include the adjustments made by PandasData.load_data.
See https://github.com/Lumiwealth/lumibot/issues/391 and its PR for further discussion.
"""
search_asset = asset
asset_separated = asset
@@ -84,22 +99,22 @@ def update_pandas_data(self, asset, quote, length, timestep, start_dt=None):
if data_timestep == ts_unit:
# Check if we have enough data (5 days is the buffer we subtracted from the start datetime)
if (data_start_datetime - start_datetime) < START_BUFFER:
return None
return

# Always try to get the lowest timestep possible because we can always resample
# If day is requested then make sure we at least have data that's less than a day
if ts_unit == "day":
if data_timestep == "minute":
# Check if we have enough data (5 days is the buffer we subtracted from the start datetime)
if (data_start_datetime - start_datetime) < START_BUFFER:
return None
return
else:
# We don't have enough data, so we need to get more (but in minutes)
ts_unit = "minute"
elif data_timestep == "hour":
# Check if we have enough data (5 days is the buffer we subtracted from the start datetime)
if (data_start_datetime - start_datetime) < START_BUFFER:
return None
return
else:
# We don't have enough data, so we need to get more (but in hours)
ts_unit = "hour"
@@ -109,7 +124,7 @@ def update_pandas_data(self, asset, quote, length, timestep, start_dt=None):
if data_timestep == "minute":
# Check if we have enough data (5 days is the buffer we subtracted from the start datetime)
if (data_start_datetime - start_datetime) < START_BUFFER:
return None
return
else:
# We don't have enough data, so we need to get more (but in minutes)
ts_unit = "minute"
@@ -177,15 +192,21 @@ def update_pandas_data(self, asset, quote, length, timestep, start_dt=None):
logging.error(traceback.format_exc())
raise Exception("Error getting data from Polygon") from e

if df is None:
return None
if (df is None) or df.empty:
return

pandas_data = []
data = Data(asset_separated, df, timestep=ts_unit, quote=quote_asset)
pandas_data.append(data)
pandas_data_updated = self._set_pandas_data_keys(pandas_data)

return pandas_data_updated

Contributor:
Why is pandas_data_updated no longer being returned? Are you sure this won't break anything?

Collaborator Author:
The only references to update_pandas_data in the library are in this file, and it looked to be an internal method to me. The purpose of the method seems to be the side effect of updating PandasData.pandas_data and (sometimes) PandasData._data_store. If update_pandas_data is part of the API, I can certainly put the return values back in.

My question is: why do we have both _data_store and pandas_data in the superclass PandasData? They hold nearly the same data, but there is some kind of staging effect (i.e. out-of-sync behavior) whose entry point is PandasData.load_data. For big backtests that doubling of RAM usage is a significant cost, and I don't know what it is for (although, as I mentioned elsewhere, I haven't read the docs...). If the two copies are needed and the selective updating has a purpose, then I'll include that when adding a docstring to update_data_store.

Contributor:
OK, that makes sense; in that case I think you're right to rearrange it like that.

As for your question about _data_store vs pandas_data: I believe one of them is used for inputting data at the very beginning, while the other is used as the actual data store. I think you're right that we do not need both and would probably be better off removing one of them.

Collaborator Author:
Hmmm, yes. After looking at the code some more: even though the name is update_pandas_data, it seems the actual purpose is to update _data_store.

AFAICT pandas_data is only used by load_data. It could be that the idea is to make get_X calls in order to populate pandas_data before load_data, but that doesn't seem to make a lot of sense because the only place that updates both is PolygonBacktesting.get_last_price.

How about I rename update_pandas_data to update_data_store and drop the option to update pandas_data? That way, if someone tries to upgrade and this breaks them, they'll know.

Collaborator Author (@jimwhite, Mar 12, 2024):
Oh, never mind. I had it backwards.

The overrides PolygonBacktesting._pull_source_symbol_bars and PolygonBacktesting.get_historical_prices_between_dates call update_pandas_data and don't update _data_store. So they rely on _Strategy.init calling PandasData.load_data to do some fixing up of dates, or something related to options ("expiries"):

self.broker.data_source.load_data()

def load_data(self):

It seems as though those adjustments done by PandasData.load_data are only made when the backtest begins and won't happen for symbols that aren't preloaded into pandas_data. Is that correct? If so, symbols that aren't known before the backtest begins can't be used, or, if they are used, they won't get whatever adjustments load_data makes.

I've only tested this change on the existing lumibot tests and on my own backtest, which is stocks-only and uses Strategy.get_historical_prices() and get_position().

Contributor:
In order to do no harm, I've made the default value of PolygonDataBacktesting.MAX_STORAGE_BYTES None. I'm concerned that I don't understand what's happening with the PolygonDataBacktesting use of pandas_data, _data_store, and load_data. This change shouldn't have any effect (apart from the rename of update_pandas_data to _update_pandas_data, which I expect isn't used by apps) unless the user enables it by assigning a suitable value to MAX_STORAGE_BYTES.
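
For illustration only, a minimal sketch of how a user could opt in to the eviction behavior. The attribute and helper names come from this diff; the import path is an assumption, and the 1 GB figure simply echoes the PR title as an example value:

from lumibot.backtesting import PolygonDataBacktesting

# Enable the cap; the default of None leaves eviction disabled.
PolygonDataBacktesting.MAX_STORAGE_BYTES = 1_000_000_000  # roughly 1 GB

# Once pandas_data / _data_store grow past this size, _enforce_storage_limit pops
# the oldest entries (OrderedDict.popitem(last=False)) until usage is back under the cap.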

Contributor:
(Quotes the Collaborator Author's earlier comment about _pull_source_symbol_bars, get_historical_prices_between_dates, and the load_data adjustments for symbols not preloaded into pandas_data.)

Yes, that sounds correct. If my memory serves right, pandas_data is only used on the initial load, whereas the data store is used on an ongoing basis. I sent you a link to my calendar for us to speak (https://calendly.com/lumi-rob/30min); this seems sufficiently complicated to justify a Zoom call.

pandas_data_update = self._set_pandas_data_keys([data])

# Add the keys to the self.pandas_data dictionary
self.pandas_data.update(pandas_data_update)
if PolygonDataBacktesting.MAX_STORAGE_BYTES:
self._enforce_storage_limit(self.pandas_data)
if update_data_store:
# TODO: Why do we have both self.pandas_data and self._data_store?
self._data_store.update(pandas_data_update)
if PolygonDataBacktesting.MAX_STORAGE_BYTES:
self._enforce_storage_limit(self._data_store)

def _pull_source_symbol_bars(
self,
@@ -202,11 +223,7 @@ def _pull_source_symbol_bars(
start_dt, ts_unit = self.get_start_datetime_and_ts_unit(length, timestep, current_dt, start_buffer=START_BUFFER)

# Get data from Polygon
pandas_data_update = self.update_pandas_data(asset, quote, length, timestep, start_dt)

if pandas_data_update is not None:
# Add the keys to the self.pandas_data dictionary
self.pandas_data.update(pandas_data_update)
self._update_pandas_data(asset, quote, length, timestep, start_dt)

return super()._pull_source_symbol_bars(
asset, length, timestep, timeshift, quote, exchange, include_after_hours
@@ -223,10 +240,7 @@ def get_historical_prices_between_dates(
start_date=None,
end_date=None,
):
pandas_data_update = self.update_pandas_data(asset, quote, 1, timestep)
if pandas_data_update is not None:
# Add the keys to the self.pandas_data dictionary
self.pandas_data.update(pandas_data_update)
self._update_pandas_data(asset, quote, 1, timestep)

response = super()._pull_source_symbol_bars_between_dates(
asset, timestep, quote, exchange, include_after_hours, start_date, end_date
@@ -241,11 +255,7 @@ def get_last_price(self, asset, timestep="minute", quote=None, exchange=None, **kwargs):
def get_last_price(self, asset, timestep="minute", quote=None, exchange=None, **kwargs):
try:
dt = self.get_datetime()
pandas_data_update = self.update_pandas_data(asset, quote, 1, timestep, dt)
if pandas_data_update is not None:
# Add the keys to the self.pandas_data dictionary
self.pandas_data.update(pandas_data_update)
self._data_store.update(pandas_data_update)
self._update_pandas_data(asset, quote, 1, timestep, dt, update_data_store=True)
except Exception as e:
print(f"Error get_last_price from Polygon: {e}")

9 changes: 5 additions & 4 deletions lumibot/data_sources/pandas_data.py
@@ -1,5 +1,5 @@
import logging
from collections import defaultdict
from collections import defaultdict, OrderedDict
from datetime import date, timedelta

import pandas as pd
@@ -24,15 +24,16 @@ def __init__(self, *args, pandas_data=None, auto_adjust=True, **kwargs):
self.name = "pandas"
self.pandas_data = self._set_pandas_data_keys(pandas_data)
self.auto_adjust = auto_adjust
self._data_store = {}
self._data_store = OrderedDict()
self._date_index = None
self._date_supply = None
self._timestep = "minute"
self._expiries_exist = False

@staticmethod
def _set_pandas_data_keys(pandas_data):
new_pandas_data = {}
# OrderedDict tracks the LRU dataframes for when it comes time to do evictions.
new_pandas_data = OrderedDict()

def _get_new_pandas_data_key(data):
# Always save the asset as a tuple of Asset and quote
@@ -61,7 +62,7 @@ def _get_new_pandas_data_key(data):
new_pandas_data[key] = data

return new_pandas_data

def load_data(self):
self._data_store = self.pandas_data
self._expiries_exist = (
5 changes: 5 additions & 0 deletions lumibot/entities/data.py
@@ -268,6 +268,11 @@ def trim_data(self, df, date_start, date_end, trading_hours_start, trading_hours
)
return df

# ./lumibot/build/__editable__.lumibot-3.1.14-py3-none-any/lumibot/entities/data.py:280:
# FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version.
# Call result.infer_objects(copy=False) instead.
# To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
Comment on lines +271 to +274
Contributor:
Category: Code Design Improvements (priority 7)

The warning comment about the deprecation of downcasting object dtype arrays on .fillna, .ffill, and .bfill is important and should be addressed. The code should be updated to use the recommended method to avoid breakage when the behavior changes in a future version of pandas.
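
For reference, a hedged sketch of the pattern the warning itself recommends; the DataFrame here is a stand-in, since the actual call site in data.py is not shown in this hunk:

import pandas as pd

# Stand-in frame with object dtype, the case that triggers the FutureWarning on ffill/bfill/fillna.
df = pd.DataFrame({"close": [101.5, None, 102.0]}, dtype=object)

# Explicitly infer better dtypes after filling instead of relying on silent downcasting.
filled = df.ffill().infer_objects(copy=False)

# Or opt in to the future behavior globally (recent pandas 2.x releases).
pd.set_option("future.no_silent_downcasting", True)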


def repair_times_and_fill(self, idx):
# Trim the global index so that it is within the local data.
idx = idx[(idx >= self.datetime_start) & (idx <= self.datetime_end)]
5 changes: 4 additions & 1 deletion lumibot/tools/helpers.py
@@ -142,7 +142,10 @@ def to_datetime_aware(dt_in):
"""Convert naive time to datetime aware on default timezone."""
if not dt_in:
return dt_in
elif isinstance(dt_in, dt.datetime) and (dt_in.tzinfo is None or dt_in.tzinfo.utcoffset(dt_in) is None):
elif isinstance(dt_in, dt.datetime) and (dt_in.tzinfo is None):
return LUMIBOT_DEFAULT_PYTZ.localize(dt_in)
elif isinstance(dt_in, dt.datetime) and (dt_in.tzinfo.utcoffset(dt_in) is None):
# TODO: This will fail because an exception is thrown if tzinfo is not None.
return LUMIBOT_DEFAULT_PYTZ.localize(dt_in)
Comment on lines +147 to 149
Contributor:
Category: Bug Risk (priority 7)

The code attempts to localize a datetime object that already has a timezone but no UTC offset. However, the comment indicates that this will fail because an exception is thrown if the datetime object's tzinfo attribute is not None. This could lead to unexpected behavior or errors. It would be better to handle this case properly, either by converting the datetime object to a naive datetime before localizing it, or by handling the exception in some way.
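
One possible shape of that fix, as a sketch only: the default timezone value below is an assumption for illustration (the real one comes from lumibot's configuration), and this is not the change adopted in the PR.

import datetime as dt
import pytz

LUMIBOT_DEFAULT_PYTZ = pytz.timezone("America/New_York")  # assumed default for illustration

def to_datetime_aware(dt_in):
    """Convert a naive datetime to an aware one in the default timezone."""
    if not dt_in:
        return dt_in
    if isinstance(dt_in, dt.datetime):
        if dt_in.tzinfo is None:
            return LUMIBOT_DEFAULT_PYTZ.localize(dt_in)
        if dt_in.tzinfo.utcoffset(dt_in) is None:
            # tzinfo is present but yields no UTC offset; strip it first so that
            # pytz's localize does not raise ValueError on a non-naive datetime.
            return LUMIBOT_DEFAULT_PYTZ.localize(dt_in.replace(tzinfo=None))
    return dt_in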

else:
return dt_in
2 changes: 1 addition & 1 deletion lumibot/tools/polygon_helper.py
@@ -36,7 +36,7 @@ def get_price_data_from_polygon(
cached in the LUMIBOT_CACHE_FOLDER/polygon folder so that it can be reused later and we don't have to query
Polygon.io every time we run a backtest.

If the Polygon respone has missing bars for a date, the missing bars will be added as empty (all NaN) rows
If the Polygon response has missing bars for a date, the missing bars will be added as empty (all NaN) rows
to the cache file to avoid querying Polygon for the same missing bars in the future. Note that means if
a request is for a future time then we won't make a request to Polygon for it later when that data might
be available. That should result in an error rather than missing data from Polygon, but just in case a