Skip to content

Commit a659e33

Browse files
committed
Add errors metadata in agent debug info with granular sdk client error metrics
Also add metrics that tell us how many ResourceNotFoundException (RNFE) errors lead to an attempt to create the profiling group. ErrorsMetadata is part of AgentDebugInfo, which is part of the Profile, so every time we submit a profile we submit these metrics along with it. sdkClientErrors gives the total number of failures caused by API calls. Failures of individual API calls are captured under counters named after those APIs, and RNFEs that result in auto-creation of the profiling group (PG) are captured as well.
1 parent 53c7695 commit a659e33

File tree

13 files changed

+449
-60
lines changed

13 files changed

+449
-60
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from codeguru_profiler_agent.utils.synchronization import synchronized
2+
3+
4+
class ErrorsMetadata:
    """
    Counters for the errors seen while a profile is being collected.

    ``errors_count`` and ``sdk_client_errors`` are totals; every other attribute counts one
    specific error type accepted by ``increment_sdk_error``.
    """

    # Error type (the key used in the serialized JSON) -> attribute holding its counter.
    _COUNTER_BY_ERROR_TYPE = {
        "configureAgentErrors": "configure_agent_errors",
        "configureAgentRnfeAutoCreateEnabledErrors": "configure_agent_rnfe_auto_create_enabled_errors",
        "createProfilingGroupErrors": "create_profiling_group_errors",
        "postAgentProfileErrors": "post_agent_profile_errors",
        "postAgentProfileRnfeAutoCreateEnabledErrors": "post_agent_profile_rnfe_auto_create_enabled_errors",
    }

    def __init__(self):
        # A fresh instance is exactly a reset one; delegating keeps the two from drifting apart.
        self.reset()

    def reset(self):
        """
        Set every counter back to zero.

        NOTE: this method is not wrapped in ``@synchronized``, so it is not serialized against
        ``increment_sdk_error``.
        """
        self.errors_count = 0
        self.sdk_client_errors = 0
        self.configure_agent_errors = 0
        self.configure_agent_rnfe_auto_create_enabled_errors = 0
        self.create_profiling_group_errors = 0
        self.post_agent_profile_errors = 0
        self.post_agent_profile_rnfe_auto_create_enabled_errors = 0

    def serialize_to_json(self):
        """
        Return the counters as a dict keyed by their schema names.

        This needs to be compliant with errors count schema.
        https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21
        """
        return {
            "errorsCount": self.errors_count,
            "sdkClientErrors": self.sdk_client_errors,
            "configureAgentErrors": self.configure_agent_errors,
            "configureAgentRnfeAutoCreateEnabledErrors": self.configure_agent_rnfe_auto_create_enabled_errors,
            "createProfilingGroupErrors": self.create_profiling_group_errors,
            "postAgentProfileErrors": self.post_agent_profile_errors,
            "postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors
        }

    @synchronized
    def increment_sdk_error(self, error_type):
        """
        Count one SDK client error.

        Both totals (``errors_count`` and ``sdk_client_errors``) are always incremented. The
        counter specific to ``error_type`` is incremented only when ``error_type`` is one of the
        known keys; an unrecognized value still counts toward the totals.

        :param error_type: schema name of the counter to bump, e.g. "configureAgentErrors".
        """
        self.errors_count += 1
        self.sdk_client_errors += 1

        counter_name = self._COUNTER_BY_ERROR_TYPE.get(error_type)
        if counter_name is not None:
            setattr(self, counter_name, getattr(self, counter_name) + 1)

    def record_sdk_error(self, error_type):
        """
        Record one SDK client error of the given type; delegates to ``increment_sdk_error``.

        :param error_type: schema name of the counter to bump, e.g. "postAgentProfileErrors".
        """
        self.increment_sdk_error(error_type)
57+
58+
59+
class AgentDebugInfo:
    """Debug information about the agent; it currently carries only the errors metadata."""

    def __init__(self, errors_metadata):
        """
        :param errors_metadata: object exposing ``serialize_to_json()``; it is stored as given,
            not copied.
        """
        self.errors_metadata = errors_metadata

    def serialize_to_json(self):
        """
        Return a dict whose single "errorsCount" entry holds the serialized errors metadata.

        This needs to be compliant with agent debug info schema.
        https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21
        """
        serialized_errors = self.errors_metadata.serialize_to_json()
        return {"errorsCount": serialized_errors}

codeguru_profiler_agent/local_aggregator.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33
import datetime
44

5+
from codeguru_profiler_agent.agent_metadata.agent_debug_info import AgentDebugInfo
56
from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration
67
from codeguru_profiler_agent.metrics.with_timer import with_timer
78
from codeguru_profiler_agent.model.profile import Profile
@@ -30,13 +31,15 @@ def __init__(self, reporter, environment=dict()):
3031
:param host_weight: (required inside environment) scale factor used to rescale the profile collected in this
3132
host to make the profile representative of the whole fleet
3233
:param timer: (required inside environment) timer to be used for metrics
34+
:param errors_metadata: (required inside environment) metadata capturing errors in the current profile.
3335
:param profile_factory: (inside environment) the factory to created profiler; default Profile.
3436
:param clock: (inside environment) clock to be used; default is time.time
3537
"""
3638
self.reporter = reporter
3739
self.profiling_group_name = environment["profiling_group_name"]
3840
self.host_weight = environment["host_weight"]
3941
self.timer = environment["timer"]
42+
self.errors_metadata = environment["errors_metadata"]
4043

4144
self.profile_factory = environment.get("profile_factory") or Profile
4245
self.clock = environment.get("clock") or time.time
@@ -71,11 +74,13 @@ def _check_memory_limit(self):
7174
self.flush(force=True)
7275

7376
def reset(self):
77+
self.errors_metadata.reset()
7478
self.profile = self.profile_factory(
7579
profiling_group_name=self.profiling_group_name,
7680
sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(),
7781
host_weight=self.host_weight,
7882
start=current_milli_time(clock=self.clock),
83+
agent_debug_info=AgentDebugInfo(self.errors_metadata),
7984
clock=self.clock
8085
)
8186
self.timer.reset()

codeguru_profiler_agent/model/profile.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
class Profile:
14-
def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, clock=time.time):
14+
def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, agent_debug_info, clock=time.time):
1515
"""
1616
A profile holds the root node of the call graph and the metadata related to the profile
1717
"""
@@ -35,6 +35,7 @@ def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight,
3535
self.host_weight = int(host_weight)
3636
self._start_process_time = time.process_time() # provides process time in fractional seconds as float.
3737
self.overhead_ms = 0
38+
self.agent_debug_info = agent_debug_info
3839

3940
@property
4041
def end(self):
@@ -97,6 +98,9 @@ def _insert_stack(self, stack, runnable_count_increase=1):
9798
def get_memory_usage_bytes(self):
9899
return self.memory_counter.get_memory_usage_bytes()
99100

101+
def serialize_agent_debug_info_to_json(self):
102+
return self.agent_debug_info.serialize_to_json()
103+
100104
def pause(self):
101105
if self.last_pause is not None:
102106
# pause gets called when profile is paused

codeguru_profiler_agent/profiler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from datetime import timedelta
88
from random import SystemRandom
99
from types import MappingProxyType as UnmodifiableDict
10+
11+
from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata
1012
from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata
1113
from codeguru_profiler_agent.profiler_disabler import ProfilerDisabler
1214
from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger
@@ -167,6 +169,7 @@ def _setup_final_environment(self, environment, environment_override):
167169
frozenset({environment['profiler_thread_name']}.union(environment['excluded_threads']))
168170
# TODO delay metadata lookup until we need it
169171
environment['agent_metadata'] = environment.get('agent_metadata') or AgentMetadata()
172+
environment['errors_metadata'] = environment.get('errors_metadata') or ErrorsMetadata()
170173
environment['collector'] = environment.get('collector') or self._select_collector(environment)
171174
environment["profiler_disabler"] = environment.get('profiler_disabler') or ProfilerDisabler(environment)
172175
return UnmodifiableDict(environment)

codeguru_profiler_agent/sdk_reporter/profile_encoder.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,15 @@ def encode_content(self):
9999
"start": int(self._profile.start),
100100
"end": int(self._profile.end),
101101
"agentMetadata": self._encode_agent_metadata(),
102-
"callgraph": self._encode_call_graph(self._profile.callgraph)
102+
"callgraph": self._encode_call_graph(self._profile.callgraph),
103+
"debugInfo": self._encode_debug_info()
103104
}
104105

105106
return json.dumps(profile_in_map)
106107

108+
def _encode_debug_info(self):
109+
return self._profile.serialize_agent_debug_info_to_json()
110+
107111
def _encode_agent_metadata(self):
108112
profile_duration_seconds = self._profile.get_active_millis_since_start() / 1000.0
109113
sample_weight = 1.0 if (profile_duration_seconds == 0) else self._profile.total_sample_count / profile_duration_seconds

codeguru_profiler_agent/sdk_reporter/sdk_reporter.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515
logger = logging.getLogger(__name__)
1616
AWS_EXECUTION_ENV_KEY = "AWS_EXECUTION_ENV"
1717

18+
1819
class SdkReporter(Reporter):
1920
"""
2021
Handles communication with the CodeGuru Profiler Service backend.
2122
Encodes profiles using the ProfilerEncoder and reports them using the CodeGuru profiler SDK.
2223
"""
2324
is_create_pg_called_during_submit_profile = False
25+
2426
def __init__(self, environment):
2527
"""
2628
:param environment: dependency container dictionary for the current profiler.
@@ -35,6 +37,7 @@ def __init__(self, environment):
3537
self.timer = environment.get("timer")
3638
self.metadata = environment["agent_metadata"]
3739
self.agent_config_merger = environment["agent_config_merger"]
40+
self.errors_metadata = environment["errors_metadata"]
3841

3942
def _encode_profile(self, profile):
4043
output_profile_stream = io.BytesIO()
@@ -76,18 +79,23 @@ def refresh_configuration(self):
7679
# We handle service exceptions like this in boto3
7780
# see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
7881
if error.response['Error']['Code'] == 'ValidationException':
82+
self.errors_metadata.record_sdk_error("configureAgentErrors")
7983
self.agent_config_merger.disable_profiling()
8084
self._log_request_failed(operation="configure_agent", exception=error)
81-
if error.response['Error']['Code'] == 'ResourceNotFoundException':
85+
elif error.response['Error']['Code'] == 'ResourceNotFoundException':
8286
if self.should_auto_create_profiling_group():
87+
self.errors_metadata.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors")
8388
logger.info(
8489
"Profiling group not found. Will try to create a profiling group "
8590
"with name = {} and compute platform = {} and retry calling configure agent after 5 minutes. "
8691
"Make sure that Lambda's execution role has AmazonCodeGuruProfilerAgentAccess policy added."
8792
.format(self.profiling_group_name, 'AWSLambda'))
8893
self.create_profiling_group()
8994
else:
95+
self.errors_metadata.record_sdk_error("configureAgentErrors")
9096
self.agent_config_merger.disable_profiling()
97+
else:
98+
self.errors_metadata.record_sdk_error("configureAgentErrors")
9199
except Exception as e:
92100
self._log_request_failed(operation="configure_agent", exception=e)
93101

@@ -117,12 +125,17 @@ def report(self, profile):
117125
if error.response['Error']['Code'] == 'ResourceNotFoundException':
118126
if self.should_auto_create_profiling_group():
119127
self.__class__.is_create_pg_called_during_submit_profile = True
128+
self.errors_metadata.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors")
120129
logger.info(
121130
"Profiling group not found. Will try to create a profiling group "
122131
"with name = {} and compute platform = {} and retry reporting during next invocation. "
123132
"Make sure that Lambda's execution role has AmazonCodeGuruProfilerAgentAccess policy added."
124133
.format(self.profiling_group_name, 'AWSLambda'))
125134
self.create_profiling_group()
135+
else:
136+
self.errors_metadata.record_sdk_error("postAgentProfileErrors")
137+
else:
138+
self.errors_metadata.record_sdk_error("postAgentProfileErrors")
126139
return False
127140
except Exception as e:
128141
self._log_request_failed(operation="post_agent_profile", exception=e)
@@ -143,6 +156,8 @@ def create_profiling_group(self):
143156
if error.response['Error']['Code'] == 'ConflictException':
144157
logger.info("Profiling Group with name {} already exists. Please use a different name."
145158
.format(self.profiling_group_name))
159+
else:
160+
self.errors_metadata.record_sdk_error("createProfilingGroupErrors")
146161
except Exception as e:
147162
self._log_request_failed(operation="create_profiling_group", exception=e)
148163

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import functools
2+
import threading
3+
4+
5+
def synchronized(wrapped):
    """The missing @synchronized decorator

    Wraps ``wrapped`` so that every call runs while holding a lock. One ``threading.RLock`` is
    created per decorated function at decoration time, so all callers of that function share it
    (for a method, that includes all instances). Because the lock is re-entrant, the thread
    already holding it can call the decorated function again without blocking.

    https://git.io/vydTA"""
    guard = threading.RLock()

    def locked_call(*args, **kwargs):
        guard.acquire()
        try:
            return wrapped(*args, **kwargs)
        finally:
            # Always release, including when ``wrapped`` raises.
            guard.release()

    # Preserve the wrapped function's name, docstring and other metadata.
    return functools.wraps(wrapped)(locked_call)

test/integration/test_live_backend_reporting.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from datetime import timedelta
77

8+
from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo
89
from test.help_utils import MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS
910
from test.pytestutils import before
1011

@@ -32,8 +33,10 @@ def before(self):
3233
stacks=[[Frame(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS)]],
3334
attempted_sample_threads_count=1,
3435
seen_threads_count=1)
36+
errors_metadata = ErrorsMetadata()
3537

36-
self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis)
38+
self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis,
39+
AgentDebugInfo(errors_metadata))
3740
# FIXME: Remove adding the end time manually below after feature fully support
3841
self.profile.end = now_millis
3942
self.profile.add(sample)
@@ -47,7 +50,8 @@ def before(self):
4750
"minimum_time_reporting": timedelta(minutes=6),
4851
"max_stack_depth": 2345,
4952
"cpu_limit_percentage": 29,
50-
"agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo())
53+
"agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()),
54+
"errors_metadata": errors_metadata
5155
}
5256
self.environment["codeguru_profiler_builder"] = CodeGuruClientBuilder(self.environment)
5357
self.agent_config = AgentConfiguration(

0 commit comments

Comments
 (0)