@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::{Duration, Instant};
 use std::{fs, process};
 
@@ -105,6 +106,8 @@ bitflags::bitflags! {
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// for each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +116,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;
 
         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
                         Self::QUERY_PROVIDERS.bits() |
                         Self::QUERY_BLOCKED.bits() |
                         Self::INCR_CACHE_LOADS.bits() |
                         Self::INCR_RESULT_HASHING.bits() |
-                        Self::ARTIFACT_SIZES.bits();
+                        Self::ARTIFACT_SIZES.bits() |
+                        Self::QUERY_CACHE_HIT_COUNTS.bits();
 
         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }
 
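Note that `QUERY_CACHE_HIT_COUNTS` is added to `DEFAULT` above, so plain `-Zself-profile` now records aggregated cache-hit counts out of the box, while the much more expensive per-hit `QUERY_CACHE_HITS` instant events stay opt-in. `QUERY_CACHE_HIT_COMBINED` exists only so the hot path further down can test both flags with a single `intersects` (any-shared-bit) check.
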
@@ -134,6 +141,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HIT_COUNTS),
     ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
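The string keys in this table are the names accepted by rustc's unstable `-Zself-profile-events` flag. For example, `-Zself-profile -Zself-profile-events=default,query-cache-hit` should additionally enable the detailed per-hit instant events, while the new `query-cache-hit-count` name selects only the aggregated counters.
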
@@ -411,13 +419,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }
 
-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
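The hunk above keeps the hot path down to one branch: a single `intersects` test against the combined mask decides whether any cache-hit event is wanted at all, and the per-flag dispatch happens only inside the `#[cold]`, `#[inline(never)]` helper. A minimal standalone sketch of the same pattern, using the same bit positions as above but a plain branch instead of the unstable `std::intrinsics::unlikely`:

```rust
const CACHE_HITS: u16 = 1 << 2;
const CACHE_HIT_COUNTS: u16 = 1 << 10;
const CACHE_HIT_COMBINED: u16 = CACHE_HITS | CACHE_HIT_COUNTS;

// Rarely taken work is kept out of the caller's instruction stream, so the
// common "profiling disabled" case stays a single predictable branch.
#[cold]
#[inline(never)]
fn cold_call(mask: u16, id: u64) {
    if mask & CACHE_HIT_COUNTS != 0 {
        println!("increment aggregated counter for invocation {id}");
    }
    if mask & CACHE_HITS != 0 {
        println!("record detailed instant event for invocation {id}");
    }
}

#[inline]
fn on_cache_hit(mask: u16, id: u64) {
    // One test for "any cache-hit event enabled" (what `intersects` does).
    if mask & CACHE_HIT_COMBINED != 0 {
        cold_call(mask, id);
    }
}

fn main() {
    on_cache_hit(0, 1); // profiling disabled: nothing recorded
    on_cache_hit(CACHE_HIT_COUNTS, 1); // aggregated counts only
}
```
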
@@ -489,6 +508,30 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }
 
+    /// Store query cache hits to the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not per **query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let event_id = builder.from_label(StringId::new_virtual(query_invocation as u64));
+                profiler.profiler.record_integer_event(
+                    profiler.query_cache_hit_count_event_kind,
+                    event_id,
+                    thread_id,
+                    hit_count.load(Ordering::Relaxed),
+                );
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
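The event ID for each count is a *virtual* `StringId` whose value is the query invocation index itself, which is what lets `analyzeme` resolve it back to a query label after the session. Since the method only reads the finished counters, it has to run after the last query has executed; a hypothetical call-site sketch (the real rustc call site is not part of this diff):

```rust
// Hypothetical end-of-session hook, for illustration only: flush the
// aggregated per-invocation cache-hit counts into the self-profile data.
fn finish_profiling_session(prof: &SelfProfilerRef) {
    prof.store_query_cache_hits();
}
```
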
@@ -537,13 +580,28 @@ pub struct SelfProfiler {
 
     string_cache: RwLock<FxHashMap<String, StringId>>,
 
+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
     incremental_result_hashing_event_kind: StringId,
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation.
+    query_cache_hit_count_event_kind: StringId,
 }
 
 impl SelfProfiler {
@@ -573,6 +631,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");
 
         let mut event_filter_mask = EventFilter::empty();
 
@@ -618,6 +677,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }
 
@@ -627,6 +688,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }
 
+    /// Record a cache hit for a query invocation.
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required.
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID.
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
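The lock dance above exists because the fast path only has shared access to the vector, so the counters themselves must be atomics; exclusive access is needed only to grow the vector the first time an invocation ID is seen. A self-contained sketch of the same idea, assuming the `parking_lot` crate (which backs rustc's `RwLock` here) and using `RwLockUpgradableReadGuard::upgrade` in place of `with_upgraded`:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

fn record_hit(hits: &RwLock<Vec<AtomicU64>>, index: usize) {
    let guard = hits.upgradable_read();
    if index < guard.len() {
        // Common case: the slot exists; a relaxed atomic add is enough.
        guard[index].fetch_add(1, Ordering::Relaxed);
    } else {
        // Rare case: upgrade to exclusive access and grow the vector.
        let mut vec = RwLockUpgradableReadGuard::upgrade(guard);
        vec.resize_with(index + 1, || AtomicU64::new(0));
        vec[index] = AtomicU64::new(1);
    }
}

fn main() {
    let hits = RwLock::new(Vec::new());
    record_hit(&hits, 3);
    record_hit(&hits, 3);
    assert_eq!(hits.read()[3].load(Ordering::Relaxed), 2);
}
```

Because only one upgradable reader can exist at a time, the length observed on the fast path cannot change before the upgrade, so the `resize_with` in the slow path never truncates entries written by another thread.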