
Commit 751c12c

FFM-8300 Target metrics request batching (#76)
* FFM-8300 Begin refactoring to use batches for analytics
* FFM-8300 Add additional loop for batching
* FFM-8300 Check unique targets in all batches
* FFM-8300 Fix bug with final batch
* FFM-8300 Fix finally bug
* FFM-8300 Temp set max number of batches to 200
* FFM-8300 Remove commented code
* FFM-8300 Remove commented code
* FFM-8300 Start batching requests
* FFM-8300 Start batching requests
* FFM-8300 Start batching requests
* FFM-8300 Use futures to send requests
* FFM-8300 Use futures to send requests
* FFM-8300 Start from second element
* FFM-8300 Fix exception if no other batches
* FFM-8300 Fix exception if no other batches
* FFM-8300 SDK Codes
* FFM-8300 Fix bug in unpacking dict
* FFM-8300 Exception handling for metrics request
* FFM-8300 Comment
* FFM-8300 Comment
* FFM-8300 Remove whitespace from message
* FFM-8300 Move metrics success message to top level
* FFM-8300 Comment
* FFM-8300 Comment
* FFM-8300 Don't let metrics window be below 60 seconds
* FFM-8300 Update docs
* FFM-8300 Don't let metrics window be below 60 seconds FFM-8300 Fixup
* FFM-8300 Log unique success codes
* FFM-8300 1.2.0 release prep
* FFM-8300 1.2.0 release prep
* FFM-8300 Change eval to debug
* FFM-8300 merge main
1 parent af29325 commit 751c12c

File tree

10 files changed: +161 -58 lines changed

docs/further_reading.md

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ You can pass the configuration in as options when the SDK client is created.
                   with_events_url("https://events.ff.harness.io/api/1.0"),
                   with_stream_enabled(True),
                   with_analytics_enabled(True),
-                  Config(pull_interval=60))
+                  config=Config(pull_interval=60))
 ```
 
 | Name | Config Option | Description | default |
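
For context, the corrected docs snippet passes Config via the config keyword argument instead of positionally. A minimal sketch of what the full client setup could look like with that fix, assuming the CfClient class and the with_* / Config import paths shown elsewhere in the SDK docs (they are not confirmed by this diff):

# Sketch only: CfClient and the import locations below are assumptions
# taken from the surrounding docs, not part of this diff.
from featureflags.client import CfClient
from featureflags.config import (Config, with_base_url, with_events_url,
                                 with_stream_enabled, with_analytics_enabled)

client = CfClient("your-server-sdk-key",  # hypothetical placeholder key
                  with_base_url("https://config.ff.harness.io/api/1.0"),
                  with_events_url("https://events.ff.harness.io/api/1.0"),
                  with_stream_enabled(True),
                  with_analytics_enabled(True),
                  config=Config(pull_interval=60))

The only change in the diff is the config= keyword, which makes sure the Config object is picked up as the client's configuration.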

featureflags/.DS_Store

-6 KB
Binary file not shown.

featureflags/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 
 __author__ = """Enver Bisevac"""
 __email__ = "enver.bisevac@harness.io"
-__version__ = '1.1.16'
+__version__ = '1.2.0'

featureflags/analytics.py

Lines changed: 133 additions & 48 deletions

@@ -1,8 +1,10 @@
 import time
 from threading import Lock, Thread
 from typing import Dict, List, Union
+import concurrent.futures
 
 import attr
+import httpx
 
 from featureflags.models.metrics_data_metrics_type import \
     MetricsDataMetricsType
@@ -20,7 +22,8 @@
 from .models.unset import Unset
 from .sdk_logging_codes import info_metrics_thread_started, \
     info_metrics_success, warn_post_metrics_failed, \
-    info_metrics_thread_existed, info_metrics_target_exceeded
+    info_metrics_thread_existed, info_metrics_target_exceeded, \
+    warn_post_metrics_target_batch_failed, info_metrics_target_batch_success
 from .util import log
 
 FF_METRIC_TYPE = 'FFMETRICS'
@@ -62,7 +65,10 @@ def __init__(self, config: Config, client: AuthenticatedClient,
         self._client = client
         self._environment = environment
         self._data: Dict[str, AnalyticsEvent] = {}
-        self._target_data: Dict[str, MetricTargetData] = {}
+        self._target_data_batches: List[Dict[str, MetricTargetData]] = [{}]
+        self._max_number_of_batches = 200
+        self._max_batch_size = 1000
+        self._current_batch_index = 0
         self.max_target_data_exceeded = False
 
         self._running = False
@@ -89,37 +95,45 @@ def enqueue(self, target: Target, identifier: str,
                 event.count = 1
                 self._data[unique_evaluation_key] = event
 
-            # Store unique targets. If the target already exists
-            # just ignore it.
+            # Check if we're on our final batch - if we are, and we've
+            # exceeded the max batch size just return early.
+            if len(self._target_data_batches) >= self._max_number_of_batches:
+                if len(self._target_data_batches[
+                        self._current_batch_index]) >= \
+                        self._max_batch_size:
+                    if not self.max_target_data_exceeded:
+                        self.max_target_data_exceeded = True
+                        info_metrics_target_exceeded()
+                    return
+
             if event.target is not None and not event.target.anonymous:
                 unique_target_key = self.get_target_key(event)
-                if unique_target_key not in self._target_data:
-                    # Temporary workaround for FFM-8231 - limit max size of
-                    # target
-                    # metrics to 50k, which ff-server can process in around
-                    # 18 seconds. This possibly prevent some targets from
-                    # getting
-                    # registered and showing in the UI, but in theory, they
-                    # should get registered eventually on subsequent
-                    # evaluations.
-                    # We want to eventually use a batching solution
-                    # to avoid this.
-                    max_target_size = 50000
-                    if len(self._target_data) >= max_target_size:
-                        # Only log the info code once per interval
-                        if not self.max_target_data_exceeded:
-                            info_metrics_target_exceeded()
-                            self.max_target_data_exceeded = True
+
+                # Store unique targets. If the target already exists
+                # in any of the batches, don't continue processing it
+                for batch in self._target_data_batches:
+                    if unique_target_key in batch:
                         return
-                    target_name = event.target.name
-                    # If the target has no name use the identifier
-                    if not target_name:
-                        target_name = event.target.identifier
-                    self._target_data[unique_target_key] = MetricTargetData(
+
+                # If we've exceeded the max batch size for the current
+                # batch, then create a new batch and start using it.
+                if len(self._target_data_batches[
+                        self._current_batch_index]) >= self._max_batch_size:
+                    self._target_data_batches.append({})
+                    self._current_batch_index += 1
+
+                target_name = event.target.name
+                # If the target has no name use the identifier
+                if not target_name:
+                    target_name = event.target.identifier
+                self._target_data_batches[
+                    self._current_batch_index][unique_target_key] = \
+                    MetricTargetData(
                         identifier=event.target.identifier,
                         name=target_name,
                         attributes=event.target.attributes
                     )
+
         finally:
             self._lock.release()
 
@@ -177,35 +191,106 @@ def _send_data(self) -> None:
                     attributes=metric_attributes
                 )
                 metrics_data.append(md)
-            for _, unique_target in self._target_data.items():
-                target_attributes: List[KeyValue] = []
-                if not isinstance(unique_target.attributes, Unset):
-                    for key, value in unique_target.attributes.items():
-                        # Attribute values need to be sent as string to
-                        # ff-server so convert all values to strings.
-                        target_attributes.append(KeyValue(key, str(value)))
-                td = TargetData(
-                    identifier=unique_target.identifier,
-                    name=unique_target.name,
-                    attributes=target_attributes
-                )
-                target_data.append(td)
+            for _, unique_target in self._target_data_batches[0].items():
+                self.process_target(target_data, unique_target)
+
+            target_data_batches: List[List[TargetData]] = []
+            target_data_batch_index = 0
+            # We've already accounted for the first batch, so start processing
+            # from the second batch onwards
+            for batch in self._target_data_batches[1:]:
+                target_data_batches.append([])
+                for _, unique_target in batch.items():
+                    self.process_target(
+                        target_data_batches[target_data_batch_index],
+                        unique_target)
+                target_data_batch_index += 1
+
+
         finally:
             self._data = {}
-            self._target_data = {}
+            self._target_data_batches = [{}]
+            self._current_batch_index = 0
             self.max_target_data_exceeded = False
             self._lock.release()
 
         body: Metrics = Metrics(target_data=target_data,
                                 metrics_data=metrics_data)
-        response = post_metrics(client=self._client,
-                                environment=self._environment, json_body=body)
-        log.debug('Metrics server returns: %d', response.status_code)
-        if response.status_code >= 400:
-            warn_post_metrics_failed(response.status_code)
-            return
-        info_metrics_success()
-        return
+        try:
+            response = post_metrics(client=self._client,
+                                    environment=self._environment,
+                                    json_body=body)
+
+            log.debug('Metrics server returns: %d', response.status_code)
+            if response.status_code >= 400:
+                warn_post_metrics_failed(response.status_code)
+                return
+            if len(target_data_batches) > 0:
+                log.info('Sending %s target batches to metrics',
+                         len(target_data_batches))
+                unique_responses_codes = {}
+
+                # Process batches concurrently
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    futures = []
+                    for batch in target_data_batches:
+                        # Staggering requests over 0.02 seconds mean that we
+                        # will send 200 requests every four seconds, so that
+                        # the backend isn't hit too hard.
+                        time.sleep(0.02)
+                        future = executor.submit(
+                            self.process_target_data_batch,
+                            batch)
+                        futures.append(future)
+
+                    # Wait for all batches to complete
+                    concurrent.futures.wait(futures)

+                    # Get unique status codes
+                    for future in futures:
+                        status_code = future.result()
+                        if status_code in unique_responses_codes:
+                            unique_responses_codes[status_code] += 1
+                        else:
+                            unique_responses_codes[status_code] = 1
+
+                    # Log any error codes
+                    for unique_code, count in unique_responses_codes.items():
+                        if response.status_code >= 400:
+                            warn_post_metrics_target_batch_failed(
+                                f'{count} batches received code {unique_code}')
+                        info_metrics_target_batch_success(
+                            f'{count} batches successful')
+
+
+            info_metrics_success()
+        except httpx.RequestError as ex:
+            warn_post_metrics_failed(ex)
+
+
+    def process_target_data_batch(self, target_data_batch):
+        batch_request_body: Metrics = Metrics(
+            target_data=target_data_batch, metrics_data=[]
+        )
+        response = post_metrics(
+            client=self._client, environment=self._environment,
+            json_body=batch_request_body
+        )
+        return response.status_code
+
+    def process_target(self, target_data, unique_target):
+        target_attributes: List[KeyValue] = []
+        if not isinstance(unique_target.attributes, Unset):
+            for key, value in unique_target.attributes.items():
+                # Attribute values need to be sent as string to
+                # ff-server so convert all values to strings.
+                target_attributes.append(KeyValue(key, str(value)))
+        td = TargetData(
+            identifier=unique_target.identifier,
+            name=unique_target.name,
+            attributes=target_attributes
+        )
+        target_data.append(td)
 
     def close(self) -> None:
         self._running = False
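
To make the new enqueue() bookkeeping easier to follow, below is a stripped-down, self-contained sketch of the batch accumulation this diff introduces. The limits mirror the diff (_max_batch_size = 1000, _max_number_of_batches = 200); the lock, the AnalyticsEvent/MetricTargetData types and the SDK log codes are omitted, so this is an illustration of the technique rather than the SDK's actual class:

from typing import Dict, List

MAX_BATCH_SIZE = 1000        # unique targets per metrics request
MAX_NUMBER_OF_BATCHES = 200  # hard cap: 200 * 1000 = 200,000 targets


class TargetBatcher:
    """Accumulates unique targets into fixed-size batches."""

    def __init__(self) -> None:
        self.batches: List[Dict[str, dict]] = [{}]
        self.current = 0
        self.exceeded = False

    def add(self, key: str, target: dict) -> None:
        # Once the final batch is full, drop new targets (logged once upstream).
        if (len(self.batches) >= MAX_NUMBER_OF_BATCHES
                and len(self.batches[self.current]) >= MAX_BATCH_SIZE):
            self.exceeded = True
            return
        # A target already stored in any batch is not stored again.
        if any(key in batch for batch in self.batches):
            return
        # Roll over to a fresh batch when the current one is full.
        if len(self.batches[self.current]) >= MAX_BATCH_SIZE:
            self.batches.append({})
            self.current += 1
        self.batches[self.current][key] = target


batcher = TargetBatcher()
for i in range(2500):
    batcher.add(f"target-{i}", {"identifier": f"target-{i}"})
print([len(b) for b in batcher.batches])  # [1000, 1000, 500]

With 200 batches of 1,000 targets each, the service now caps unique targets at 200,000 per interval, replacing the flat 50,000-target workaround that the removed FFM-8231 comment described.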

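On the send side, _send_data() keeps the first batch in the main metrics payload and fans the remaining batches out concurrently. A minimal standalone sketch of that fan-out pattern follows, with a stub in place of post_metrics() and with the per-code logging assumed to branch on each aggregated status code:

import concurrent.futures
import time
from collections import Counter
from typing import Dict, List


def send_batch(batch: List[dict]) -> int:
    # Stand-in for post_metrics(...): posts one batch of up to 1,000 targets
    # and returns the HTTP status code.
    return 200


def send_batches(target_data_batches: List[List[dict]]) -> Dict[int, int]:
    futures = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for batch in target_data_batches:
            # Stagger submissions by 0.02 s so 200 batches are spread over
            # roughly four seconds instead of hitting the backend at once.
            time.sleep(0.02)
            futures.append(executor.submit(send_batch, batch))
        # Wait for every batch request to finish.
        concurrent.futures.wait(futures)

    # Count how many batches returned each status code, then log per code.
    codes = Counter(future.result() for future in futures)
    for code, count in codes.items():
        if code >= 400:
            print(f"{count} batches received code {code}")
        else:
            print(f"{count} batches successful")
    return dict(codes)


if __name__ == "__main__":
    send_batches([[{"identifier": f"target-{i}"}] for i in range(5)])

Note that the 0.02 s sleep throttles only submission; the executor's worker threads still overlap the requests themselves, which is what keeps 200 batches inside a few seconds.
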
featureflags/config.py

Lines changed: 8 additions & 1 deletion

@@ -5,6 +5,7 @@
 
 from .interface import Cache
 from .lru_cache import LRUCache
+from .util import log
 
 BASE_URL = "https://config.ff.harness.io/api/1.0"
 EVENTS_URL = "https://events.ff.harness.io/api/1.0"
@@ -32,7 +33,13 @@ def __init__(
         self.events_url = events_url
         self.pull_interval = pull_interval
         self.persist_interval = persist_interval
-        self.events_sync_interval = events_sync_interval
+        if events_sync_interval < EVENTS_SYNC_INTERVAL:
+            log.warning("Metrics events sync interval cannot be lower than "
+                        "60 seconds. Default of 60 seconds will be used")
+            self.events_sync_interval = EVENTS_SYNC_INTERVAL
+        else:
+            self.events_sync_interval = events_sync_interval
+
         self.cache = cache
         if self.cache is None:
             self.cache = LRUCache()
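
As a usage note, the new floor means any configured metrics interval below EVENTS_SYNC_INTERVAL (60 seconds, per the warning text) is replaced with the default. A hedged example, assuming Config's other parameters all have defaults as the docs snippet suggests:

from featureflags.config import Config

config = Config(events_sync_interval=30)
# Logs: "Metrics events sync interval cannot be lower than 60 seconds.
# Default of 60 seconds will be used"
print(config.events_sync_interval)  # 60

config = Config(events_sync_interval=120)
print(config.events_sync_interval)  # 120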

featureflags/evaluations/variation.py

Lines changed: 5 additions & 5 deletions

@@ -23,7 +23,7 @@ def bool(self, target: Target, flag_identifier: str,
              default: bool = False) -> bool:
         if self.value:
             result = self.value.lower() == "true"
-            log.info(
+            log.debug(
                 "SDKCODE:6000: Evaluated bool variation successfully:"
                 "%s", {"result": result, "flag identifier": flag_identifier,
                        "target": target})
@@ -38,7 +38,7 @@ def string(self, target: Target, flag_identifier: str,
                default: str) -> str:
         if self.value:
             result = self.value
-            log.info(
+            log.debug(
                 "SDKCODE:6000: Evaluated string variation successfully:"
                 "%s", {"result": result, "flag identifier": flag_identifier,
                        "target": target})
@@ -53,7 +53,7 @@ def number(self, target: Target, flag_identifier: str,
                default: float) -> float:
         if self.value:
             result = float(self.value)
-            log.info(
+            log.debug(
                 "SDKCODE:6000: Evaluated number variation successfully:"
                 "%s", {"result": result, "flag identifier": flag_identifier,
                        "target": target})
@@ -68,7 +68,7 @@ def int(self, target: Target, flag_identifier: str,
             default: int) -> int:
         if self.value:
             result = int(self.value)
-            log.info(
+            log.debug(
                 "SDKCODE:6000: Evaluated number variation successfully:"
                 "%s", {"result": result, "flag identifier": flag_identifier,
                        "target": target})
@@ -83,7 +83,7 @@ def json(self, target: Target, flag_identifier: str,
              default: dict) -> dict:
         if self.value:
             result = json.loads(self.value)
-            log.info(
+            log.debug(
                 "SDKCODE:6000: Evaluated json variation successfully:"
                 "%s", {"result": result, "flag identifier": flag_identifier,
                        "target": target})

featureflags/sdk_logging_codes.py

Lines changed: 10 additions & 0 deletions

@@ -36,6 +36,8 @@ def get_sdk_code_message(key):
         7003: "Metrics posted successfully",
         7004: "Target metrics exceeded max size, remaining targets for this "
               "analytics interval will not be sent",
+        7005: "Target metrics batches succeeded:",
+        7006: "Target metrics batch/batches failed:",
     }
     if key in sdk_codes:
         return sdk_codes[key]
@@ -105,6 +107,10 @@ def info_metrics_target_exceeded():
     log.info(sdk_err_msg(7004))
 
 
+def info_metrics_target_batch_success(message):
+    log.info(sdk_err_msg(7005, message))
+
+
 def info_metrics_thread_existed():
     log.info(sdk_err_msg(7001))
 
@@ -143,6 +149,10 @@ def warn_post_metrics_failed(reason):
     log.warning(sdk_err_msg(7002, reason))
 
 
+def warn_post_metrics_target_batch_failed(message):
+    log.warning(sdk_err_msg(7006, message))
+
+
 def warn_default_variation_served(flag, target, default):
     log.warning(sdk_err_msg(6001,
                             f"flag={flag}, "

setup.cfg

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.1.16
+current_version = 1.2.0
 commit = True
 tag = True
 

setup.py

Lines changed: 1 addition & 1 deletion

@@ -57,6 +57,6 @@
     test_suite="tests",
    tests_require=test_requirements,
     url="https://github.com/harness/ff-python-server-sdk",
-    version='1.1.16',
+    version='1.2.0',
     zip_safe=False,
 )

tests/unit/test_sdk_logging_codes.py

Lines changed: 1 addition & 0 deletions

@@ -26,4 +26,5 @@ def test_logs_dont_raise_exception():
     sdk_codes.warn_stream_disconnected("example reason")
     sdk_codes.warn_stream_retrying(5)
     sdk_codes.warn_post_metrics_failed("example reason")
+    sdk_codes.warn_post_metrics_target_batch_failed("example reason")
     sdk_codes.warn_default_variation_served("identifier", target, "default")
