diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c
new file mode 100644
index 00000000..23016e0b
--- /dev/null
+++ b/examples/netstacklat.bpf.c
@@ -0,0 +1,846 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This is an ebpf_exporter variant of the netstacklat tool
+ *
+ * Netstacklat is a tool to "Monitor RX latency within the network stack"
+ *  - https://github.com/xdp-project/bpf-examples/tree/main/netstacklat
+ *  - Developed by Simon Sundberg
+ *
+ * This variant has been heavily optimized for Cloudflare's use-case.
+ * Many hooks and features have been disabled, via constructs that let both
+ * the compiler and the BPF verifier do dead-code elimination.
+ */
+#include <vmlinux.h>
+
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "netstacklat.h"
+#include "bits.bpf.h"
+
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+// Mimic macros from /include/net/tcp.h
+#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
+#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+
+char LICENSE[] SEC("license") = "GPL";
+
+/* The ebpf_exporter variant of netstacklat is not runtime configurable at
+ * BPF-load time. Thus, the user_config below isn't defined as 'volatile';
+ * instead the 'const' allows the compiler to do dead-code elimination.
+ */
+const __s64 TAI_OFFSET = (37LL * NS_PER_S);
+const struct netstacklat_bpf_config user_config = {
+	.network_ns = 0,
+	.filter_min_sockqueue_len = 0, /* zero means filter is inactive */
+	.filter_nth_packet = 0, /* record only every nth event; use a power of 2 */
+	.filter_pid = false,
+	.filter_ifindex = true,
+	.filter_cgroup = true,
+	.groupby_ifindex = false, /* If true, also define CONFIG_GROUPBY_IFINDEX */
+	.groupby_cgroup = true,
+	.include_hol_blocked = false,
+};
+
+/* These provide an easy way to disable some hooks at compile time */
+/* #define CONFIG_HOOKS_EARLY_RCV 1 */
+#undef CONFIG_HOOKS_EARLY_RCV
+/* #define CONFIG_HOOKS_ENQUEUE 1 */
+#undef CONFIG_HOOKS_ENQUEUE
+#define CONFIG_HOOKS_DEQUEUE 1
+#define CONFIG_ENABLE_IP_HOOKS 1
+#define CONFIG_ENABLE_TCP_HOOKS 1
+/* #define CONFIG_ENABLE_UDP_HOOKS 1 */
+
+/* Allows compile-time disabling of the ifindex filter map, as the YAML config cannot configure this */
+/* #define CONFIG_IFINDEX_FILTER_MAP 1 */
+#undef CONFIG_IFINDEX_FILTER_MAP
+
+/* Allows compile-time disabling of the PID filter map, as it is very large */
+/* #define CONFIG_PID_FILTER_MAP 1 */
+#undef CONFIG_PID_FILTER_MAP
+
+/*
+ * Alternative definition of sk_buff to handle renaming of the field
+ * mono_delivery_time to tstamp_type. See
+ * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
+ */
+struct sk_buff___old {
+	union {
+		ktime_t tstamp;
+		u64 skb_mstamp_ns;
+	};
+	__u8 mono_delivery_time: 1;
+} __attribute__((preserve_access_index));
+
+/* NOTICE: max_entries needs to be adjusted based on the maximum number of
+ * cgroups and ifindexes being "groupby" collected, and the number of
+ * enabled hooks.
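+ *
+ * e.g. with the current settings here and in netstacklat.h:
+ * HIST_NBUCKETS (26) * N_HOOKS (1) * N_CGROUPS (2) * N_IFACES (1) = 52 entries.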
+ */
+#define N_CGROUPS 2 /* depends on the cgroup_id_map matches in the YAML config */
+#ifdef CONFIG_GROUPBY_IFINDEX
+#define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */
+#else
+#define N_IFACES 1 /* With groupby_ifindex==false */
+#endif
+#define N_HOOKS 1
+
+#if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE || CONFIG_ENABLE_UDP_HOOKS)
+#error "Please update N_HOOKS"
+#endif
+
+struct tcp_sock_ooo_range {
+	u32 prev_n_ooopkts;
+	u32 ooo_seq_end;
+	/* indicates if ooo_seq_end is still valid (as 0 can be a valid seq) */
+	bool active;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+	__uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES);
+	__type(key, struct hist_key);
+	__type(value, u64);
+} netstack_latency_seconds SEC(".maps");
+
+#ifdef CONFIG_PID_FILTER_MAP
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, PID_MAX_LIMIT);
+	__type(key, u32);
+	__type(value, u64);
+} netstack_pidfilter SEC(".maps");
+#endif
+
+#ifdef CONFIG_IFINDEX_FILTER_MAP
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, IFINDEX_MAX);
+	__type(key, u32);
+	__type(value, u64);
+} netstack_ifindexfilter SEC(".maps");
+#endif
+
+/* Evaluate two different cgroup_id_map types */
+/* #define CONFIG_CGRP_STORAGE 1 */
+#ifdef CONFIG_CGRP_STORAGE
+struct {
+	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, u32);
+	__type(value, u64);
+} netstack_cgroupfilter SEC(".maps");
+#else
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH); /* type: hash */
+	__uint(max_entries, MAX_PARSED_CGROUPS);
+	__type(key, u64);
+	__type(value, u64);
+} netstack_cgroupfilter SEC(".maps");
+#endif
+
+/* Per-CPU counter for down-sampling the recorded events to every nth event */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, NETSTACKLAT_N_HOOKS);
+	__type(key, u32);
+	__type(value, u64);
+} netstack_nth_filter SEC(".maps");
+
+static ktime_t time_since(ktime_t tstamp)
+{
+	ktime_t now;
+
+	if (tstamp <= 0)
+		return -1;
+
+	now = bpf_ktime_get_tai_ns() - TAI_OFFSET;
+	if (tstamp > now)
+		return -1;
+
+	return (now - tstamp) / LATENCY_SCALE;
+}
+
+/*
+ * Is a < b considering u32 wrap around?
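+ * e.g. u32_lt(0xfffffff0, 0x10) is true, since (s32)(0xfffffff0 - 0x10) is
+ * negative, i.e. 0x10 is "after" 0xfffffff0 once the sequence space wraps.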
+ * Based on the before() function in /include/net/tcp.h
+ */
+static bool u32_lt(u32 a, u32 b)
+{
+	return (s32)(a - b) < 0;
+}
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tcp_sock_ooo_range);
+} netstack_tcp_ooo_range SEC(".maps");
+
+/* Determines whether the ebpf_exporter macro or the local C implementation
+ * below is used.
+ */
+#define CONFIG_MAP_MACROS 1
+#ifdef CONFIG_MAP_MACROS
+#include "maps.bpf.h"
+#define _record_latency_since(tstamp, key)				\
+	ktime_t latency = time_since(tstamp);				\
+	if (latency >= 0)						\
+		increment_exp2_histogram_nosync(&netstack_latency_seconds, \
+						key, latency,		\
+						HIST_MAX_LATENCY_SLOT);
+#else /* !CONFIG_MAP_MACROS */
+#define _record_latency_since(tstamp, key)				\
+	record_latency_since(tstamp, &key)
+
+static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key)
+{
+	u64 zero = 0;
+	u64 *val;
+
+	val = bpf_map_lookup_elem(map, key);
+	if (val)
+		return val;
+
+	// Key not in map - try to insert it and look it up again
+	bpf_map_update_elem(map, key, &zero, BPF_NOEXIST);
+	return bpf_map_lookup_elem(map, key);
+}
+
+static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket)
+{
+	u32 bucket = log2l(value);
+
+	// Right-inclusive histogram, so "round up" the log value
+	// e.g. value 5: log2l(5) == 2, but 1 << 2 < 5, so use bucket 3
+	if (bucket > 0 && 1ULL << bucket < value)
+		bucket++;
+
+	if (bucket > max_bucket)
+		bucket = max_bucket;
+
+	return bucket;
+}
+
+/*
+ * Same call signature as the increment_exp2_histogram_nosync macro from
+ * https://github.com/cloudflare/ebpf_exporter/blob/master/examples/maps.bpf.h
+ * but provided as a function.
+ *
+ * Unlike the macro, this only works with keys of type struct hist_key. The
+ * hist_key struct must be provided by value (rather than as a pointer) to
+ * keep the same call signature as the ebpf-exporter macro, although this
+ * will get inefficient if struct hist_key grows large.
+ */
+static void increment_exp2_histogram_nosync(void *map, struct hist_key key,
+					    u64 value, u32 max_bucket)
+{
+	u64 *bucket_count;
+
+	// Increment histogram
+	key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket);
+	bucket_count = lookup_or_zeroinit_histentry(map, &key);
+	if (bucket_count)
+		(*bucket_count)++;
+
+	// Increment sum at end of histogram
+	if (value == 0)
+		return;
+
+	key.bucket = max_bucket + 1;
+	bucket_count = lookup_or_zeroinit_histentry(map, &key);
+	if (bucket_count)
+		*bucket_count += value;
+}
+
+static void record_latency(ktime_t latency, const struct hist_key *key)
+{
+	increment_exp2_histogram_nosync(&netstack_latency_seconds, *key,
+					latency, HIST_MAX_LATENCY_SLOT);
+}
+
+static void record_latency_since(ktime_t tstamp, const struct hist_key *key)
+{
+	ktime_t latency = time_since(tstamp);
+
+	if (latency >= 0)
+		record_latency(latency, key);
+}
+#endif /* !CONFIG_MAP_MACROS */
+
+/* Debug facility to count errors */
+#define MAX_ERROR_TYPES 8
+enum error_types {
+	ERR_UNKNOWN = 0,
+	ERR_sk_storage = 1,
+	ERR_READ_TCP_rcv_wup = 2,
+	ERR_READ_TCP_rcv_wnd = 3,
+	ERR_READ_TCP_rcv_nxt = 4,
+	ERR_READ_TCP_last_skb_cb = 5,
+	ERR_READ_TCP_cp_seq = 6,
+	ERR_READ_TCP_rcv_ooopack = 7,
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_ERROR_TYPES);
+	__type(key, u32);
+	__type(value, u64);
+} netstacklat_errors_total SEC(".maps");
+
+/* This provides an easy way to disable the error-debug feature.
+ * Disabling it reduces BPF code size.
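+ * With CONFIG_TRACK_ERRORS undefined, record_errors() below becomes an empty
+ * function that the compiler can eliminate entirely.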
+ */
+#define CONFIG_TRACK_ERRORS 1
+/* #define CONFIG_PRINT_ERRORS 1 */
+#undef CONFIG_PRINT_ERRORS
+
+static void record_errors(u32 err)
+{
+#ifdef CONFIG_TRACK_ERRORS
+	u32 key = ERR_UNKNOWN;
+
+	if (err < MAX_ERROR_TYPES)
+		key = err;
+
+	increment_map_nosync(&netstacklat_errors_total, &key, 1);
+#endif /* CONFIG_TRACK_ERRORS */
+}
+
+#ifdef CONFIG_PRINT_ERRORS
+#define my_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
+#else /* !CONFIG_PRINT_ERRORS */
+#define my_printk(fmt, ...)
+#endif
+
+/* Debug macro that can be disabled compile time */
+#define dbg(__ERR_NR, fmt, ...)				\
+	({						\
+		record_errors(__ERR_NR);		\
+		my_printk(fmt, ##__VA_ARGS__);		\
+	})
+
+static inline bool filter_nth_packet(const enum netstacklat_hook hook)
+{
+	u32 key = hook;
+	u64 pkt_cnt;
+	u64 *nth;
+
+	/* Zero and one both mean disabled */
+	if (user_config.filter_nth_packet <= 1)
+		return true;
+
+	nth = bpf_map_lookup_elem(&netstack_nth_filter, &key);
+	if (!nth)
+		return false;
+
+	/* The hooks (like tcp-socket-read) run outside the socket lock in a
+	 * preempt/migrate-able user context. Thus, atomic updates are needed
+	 * for correctness, but keep the PERCPU map to limit cache-line
+	 * bouncing.
+	 */
+	pkt_cnt = __sync_fetch_and_add(nth, 1);
+	if ((pkt_cnt % user_config.filter_nth_packet) == 0)
+		return true;
+
+	return false;
+}
+
+static bool filter_ifindex(u32 ifindex)
+{
+	if (!user_config.filter_ifindex)
+		// No ifindex filter - all ok
+		return true;
+
+#ifdef CONFIG_IFINDEX_FILTER_MAP
+	u64 *ifindex_ok;
+
+	ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex);
+	if (!ifindex_ok)
+		return false;
+
+	return *ifindex_ok > 0;
+#else
+	/* Hack for production:
+	 * - We want to exclude 'lo', which has ifindex==1.
+	 * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5).
+	 *   Unfortunately, ifindexes are not stable; some production metals
+	 *   have ifindex==6 for vlan100@link0. Relax the filter until YAML
+	 *   config support is added.
+	 */
+	if (ifindex > 1 && ifindex < 12)
+		return true;
+
+	return false;
+#endif
+}
+
+static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk)
+{
+	/*
+	 * Favor reading from sk due to less redirection (fewer probe reads),
+	 * and because skb->dev is not always set.
+	 */
+	if (sk)
+		return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum);
+	else if (skb)
+		return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum);
+	return 0;
+}
+
+static bool filter_network_ns(struct sk_buff *skb, struct sock *sk)
+{
+	if (user_config.network_ns == 0)
+		return true;
+
+	return get_network_ns(skb, sk) == user_config.network_ns;
+}
+
+#if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE)
+static void record_skb_latency(struct sk_buff *skb, struct sock *sk,
+			       enum netstacklat_hook hook)
+{
+	struct hist_key key = { .hook = hook };
+	u32 ifindex;
+
+	if (bpf_core_field_exists(skb->tstamp_type)) {
+		/*
+		 * For kernels >= v6.11 the tstamp_type being non-zero (i.e.
+		 * not SKB_CLOCK_REALTIME) implies that skb->tstamp holds a
+		 * preserved TX timestamp rather than an RX timestamp.
+		 * See
+		 * https://lore.kernel.org/all/20240509211834.3235191-2-quic_abchauha@quicinc.com/
+		 */
+		if (BPF_CORE_READ_BITFIELD(skb, tstamp_type) > 0)
+			return;
+
+	} else {
+		/*
+		 * For kernels < v6.11, the field was called mono_delivery_time
+		 * instead, see https://lore.kernel.org/all/20220302195525.3480280-1-kafai@fb.com/
+		 * Kernels < v5.18 do not have the mono_delivery_time field
+		 * either, but we do not support those anyway (as they lack
+		 * the bpf_ktime_get_tai_ns helper).
+		 */
+		struct sk_buff___old *skb_old = (void *)skb;
+
+		if (BPF_CORE_READ_BITFIELD(skb_old, mono_delivery_time) > 0)
+			return;
+	}
+
+	ifindex = skb->skb_iif;
+	if (!filter_ifindex(ifindex))
+		return;
+
+	if (!filter_network_ns(skb, sk))
+		return;
+
+	if (!filter_nth_packet(hook))
+		return;
+
+#ifdef CONFIG_GROUPBY_IFINDEX
+	if (user_config.groupby_ifindex)
+		key.ifindex = ifindex;
+#endif
+
+	_record_latency_since(skb->tstamp, key);
+}
+#endif
+
+#ifdef CONFIG_PID_FILTER_MAP
+static bool filter_pid(u32 pid)
+{
+	u64 *pid_ok;
+
+	if (!user_config.filter_pid)
+		// No PID filter - all PIDs ok
+		return true;
+
+	pid_ok = bpf_map_lookup_elem(&netstack_pidfilter, &pid);
+	if (!pid_ok)
+		return false;
+
+	return *pid_ok > 0;
+}
+#endif /* CONFIG_PID_FILTER_MAP */
+
+#ifdef CONFIG_CGRP_STORAGE
+static bool filter_cgroup(u64 *cgroup_id)
+{
+	if (!user_config.filter_cgroup) {
+		if (user_config.groupby_cgroup)
+			*cgroup_id = bpf_get_current_cgroup_id();
+		// No cgroup filter - all cgroups ok
+		return true;
+	}
+
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct cgroup *cgrp = task->cgroups->dfl_cgrp;
+
+	if (user_config.groupby_cgroup)
+		/* no need to call bpf_get_current_cgroup_id() */
+		*cgroup_id = BPF_CORE_READ(cgrp, kn, id);
+
+	return bpf_cgrp_storage_get(&netstack_cgroupfilter, cgrp, 0, 0) != NULL;
+}
+#else /* !CONFIG_CGRP_STORAGE */
+static bool filter_cgroup(u64 *cgroup_id)
+{
+	if (!user_config.filter_cgroup) {
+		if (user_config.groupby_cgroup)
+			*cgroup_id = bpf_get_current_cgroup_id();
+		// No cgroup filter - all cgroups ok
+		return true;
+	}
+	*cgroup_id = bpf_get_current_cgroup_id();
+
+	return bpf_map_lookup_elem(&netstack_cgroupfilter, cgroup_id) != NULL;
+}
+#endif /* !CONFIG_CGRP_STORAGE */
+
+static bool filter_current_task(void)
+{
+	bool ok = true;
+
+#ifdef CONFIG_PID_FILTER_MAP
+	__u32 tgid;
+
+	if (user_config.filter_pid) {
+		tgid = bpf_get_current_pid_tgid() >> 32;
+		ok = ok && filter_pid(tgid);
+	}
+#endif
+	return ok;
+}
+
+static inline bool sk_backlog_empty(const struct sock *sk)
+{
+	return READ_ONCE(sk->sk_backlog.tail) == NULL;
+}
+
+/* To lower runtime overhead, skip recording timestamps for sockets with very
+ * few packets. Use sk_buff_head->qlen to check whether the queue has at
+ * least a minimum number of elements (e.g. 2).
+ */
+static inline __u32 sk_queue_len(const struct sk_buff_head *list_)
+{
+	return READ_ONCE(list_->qlen);
+}
+
+static bool filter_min_sockqueue_len(struct sock *sk)
+{
+	const u32 min_qlen = user_config.filter_min_sockqueue_len;
+
+	if (min_qlen == 0)
+		return true;
+
+	if (sk_queue_len(&sk->sk_receive_queue) >= min_qlen)
+		return true;
+
+	/* Packets can also be on the sk_backlog, but we don't know the number
+	 * of SKBs on that queue, because sk_backlog.len is in bytes (based on
+	 * skb->truesize). Thus, if any backlog exists we don't filter.
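+	 * e.g. with filter_min_sockqueue_len=2, latency is only recorded when
+	 * the receive queue holds at least 2 SKBs or the backlog is non-empty.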
+	 */
+	if (!sk_backlog_empty(sk))
+		return true;
+
+	return false;
+}
+
+#if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE)
+static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb,
+					  u64 *cgroup_id,
+					  const enum netstacklat_hook hook)
+{
+	if (!filter_min_sockqueue_len(sk))
+		return false;
+
+	if (!filter_cgroup(cgroup_id))
+		return false;
+
+	if (!filter_nth_packet(hook))
+		return false;
+
+	return true;
+}
+#endif
+
+/* Get the current receive window end sequence for tp.
+ * In the kernel, receive window checks are done against
+ * tp->rcv_nxt + tcp_receive_window(tp). This function should give a
+ * comparable result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher.
+ */
+static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
+{
+	u32 rcv_wup, rcv_wnd, window = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup);
+	if (err) {
+		dbg(ERR_READ_TCP_rcv_wup,
+		    "failed to read tcp_sock->rcv_wup, err=%d", err);
+		goto exit;
+	}
+
+	err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd);
+	if (err) {
+		dbg(ERR_READ_TCP_rcv_wnd,
+		    "failed to read tcp_sock->rcv_wnd, err=%d", err);
+		goto exit;
+	}
+
+	window = rcv_wup + rcv_wnd;
+	if (u32_lt(window, rcv_nxt))
+		window = rcv_nxt;
+
+exit:
+	*seq = window;
+	return err;
+}
+
+static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
+{
+	u32 rcv_nxt, cur_rcv_window, max_seq = 0;
+	struct tcp_skb_cb cb;
+	int err = 0;
+
+	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+	if (err) {
+		dbg(ERR_READ_TCP_rcv_nxt,
+		    "failed reading tcp_sock->rcv_nxt, err=%d", err);
+		goto exit;
+	}
+
+	if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
+		/* No ooo-segments currently in the ooo-queue.
+		 * Any ooo-segments must already have been merged into the
+		 * receive queue. Current rcv_nxt must therefore be ahead
+		 * of all ooo-segments that have arrived until now.
+		 */
+		max_seq = rcv_nxt;
+	} else {
+		/*
+		 * Some ooo-segments currently in the ooo-queue.
+		 * Max out-of-order seq is given by the seq_end of the tail
+		 * skb in the ooo-queue.
+		 */
+		err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb);
+		if (err) {
+			dbg(ERR_READ_TCP_last_skb_cb,
+			    "failed to read tcp_sock->ooo_last_skb->cb, err=%d",
+			    err);
+			goto exit;
+		}
+
+		// Sanity check - is ooo_last_skb->cb.end_seq within the receive window?
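+		// An end_seq past the current receive window cannot come from
+		// a valid ooo-segment, so treat it as a stale ooo_last_skb
+		// read and fall back to the window edge (see check below).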
+		err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window);
+		if (err)
+			goto exit;
+
+		/* While seq 0 can be a valid seq, consider it more likely to
+		 * be the result of reading from an invalid SKB pointer.
+		 */
+		if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq))
+			max_seq = cur_rcv_window;
+		else
+			max_seq = cb.end_seq;
+	}
+
+exit:
+	*seq = max_seq;
+	return err;
+}
+
+static bool tcp_read_in_ooo_range(struct tcp_sock *tp,
+				  struct tcp_sock_ooo_range *ooo_range)
+{
+	u32 read_seq;
+	int err;
+
+	if (!ooo_range->active)
+		return false;
+
+	err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq);
+	if (err) {
+		dbg(ERR_READ_TCP_cp_seq,
+		    "failed to read tcp_sock->copied_seq, err=%d", err);
+		return true; // Assume we may be in ooo-range
+	}
+
+	if (u32_lt(ooo_range->ooo_seq_end, read_seq)) {
+		ooo_range->active = false;
+		return false;
+	} else {
+		return true;
+	}
+}
+
+static bool tcp_read_maybe_holblocked(struct sock *sk)
+{
+	struct tcp_sock_ooo_range *ooo_range;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 n_ooopkts, nxt_seq;
+	int err;
+
+	err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack);
+	if (err) {
+		dbg(ERR_READ_TCP_rcv_ooopack,
+		    "failed to read tcp_sock->rcv_ooopack, err=%d\n", err);
+		return true; // Assume we may be in ooo-range
+	}
+
+	if (n_ooopkts == 0)
+		return false;
+
+	ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL,
+				       BPF_SK_STORAGE_GET_F_CREATE);
+	if (!ooo_range) {
+		dbg(ERR_sk_storage,
+		    "failed getting ooo-range socket storage for tcp socket");
+		return true; // Assume we may be in ooo-range
+	}
+
+	// Increase in ooo-packets since last read - figure out next safe seq
+	if (n_ooopkts > ooo_range->prev_n_ooopkts) {
+		ooo_range->prev_n_ooopkts = n_ooopkts;
+		err = current_max_possible_ooo_seq(tp, &nxt_seq);
+		if (!err) {
+			ooo_range->ooo_seq_end = nxt_seq;
+			ooo_range->active = true;
+		}
+		return true;
+	}
+
+	return tcp_read_in_ooo_range(tp, ooo_range);
+}
+
+static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
+				  ktime_t tstamp, enum netstacklat_hook hook,
+				  u64 cgroup_id)
+{
+	struct hist_key key = { .hook = hook };
+	u32 ifindex;
+
+	if (!filter_current_task())
+		return;
+
+	ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
+	if (!filter_ifindex(ifindex))
+		return;
+
+	if (!filter_network_ns(skb, sk))
+		return;
+
+#ifdef CONFIG_GROUPBY_IFINDEX
+	if (user_config.groupby_ifindex)
+		key.ifindex = ifindex;
+#endif
+	if (user_config.groupby_cgroup)
+		key.cgroup = cgroup_id;
+
+	_record_latency_since(tstamp, key);
+}
+
+#ifdef CONFIG_HOOKS_EARLY_RCV
+# ifdef CONFIG_ENABLE_IP_HOOKS
+SEC("fentry/ip_rcv_core")
+int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, struct net *net)
+{
+	record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV);
+	return 0;
+}
+
+SEC("fentry/ip6_rcv_core")
+int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, struct net *net)
+{
+	record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_IP_HOOKS */
+
+# ifdef CONFIG_ENABLE_TCP_HOOKS
+SEC("fentry/tcp_v4_rcv")
+int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb)
+{
+	record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START);
+	return 0;
+}
+
+SEC("fentry/tcp_v6_rcv")
+int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb)
+{
+	record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_TCP_HOOKS */
+
+# ifdef CONFIG_ENABLE_UDP_HOOKS
+SEC("fentry/udp_rcv")
+int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb)
+{
+	record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START);
+	return 0;
+}
+
+SEC("fentry/udpv6_rcv")
+int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb)
+{
+	record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_UDP_HOOKS */
+#endif /* CONFIG_HOOKS_EARLY_RCV */
+
+#ifdef CONFIG_HOOKS_ENQUEUE
+# ifdef CONFIG_ENABLE_TCP_HOOKS
+SEC("fexit/tcp_queue_rcv")
+int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb)
+{
+	record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_TCP_HOOKS */
+
+# ifdef CONFIG_ENABLE_UDP_HOOKS
+SEC("fexit/__udp_enqueue_schedule_skb")
+int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk,
+	     struct sk_buff *skb, int retval)
+{
+	if (retval == 0)
+		record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_UDP_HOOKS */
+#endif /* CONFIG_HOOKS_ENQUEUE */
+
+#ifdef CONFIG_HOOKS_DEQUEUE
+# ifdef CONFIG_ENABLE_TCP_HOOKS
+SEC("fentry/tcp_recv_timestamp")
+int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk,
+	     struct scm_timestamping_internal *tss)
+{
+	const enum netstacklat_hook hook = NETSTACKLAT_HOOK_TCP_SOCK_READ;
+	u64 cgroup_id = 0;
+
+	if (!filter_socket(sk, NULL, &cgroup_id, hook))
+		return 0;
+
+	struct timespec64 *ts = &tss->ts[0];
+
+	if (!user_config.include_hol_blocked && tcp_read_maybe_holblocked(sk))
+		return 0;
+
+	record_socket_latency(sk, NULL,
+			      (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec,
+			      hook, cgroup_id);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_TCP_HOOKS */
+
+# ifdef CONFIG_ENABLE_UDP_HOOKS
+SEC("fentry/skb_consume_udp")
+int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb,
+	     int len)
+{
+	const enum netstacklat_hook hook = NETSTACKLAT_HOOK_UDP_SOCK_READ;
+	u64 cgroup_id = 0;
+
+	if (!filter_socket(sk, skb, &cgroup_id, hook))
+		return 0;
+
+	record_socket_latency(sk, skb, skb->tstamp, hook, cgroup_id);
+	return 0;
+}
+# endif /* CONFIG_ENABLE_UDP_HOOKS */
+#endif /* CONFIG_HOOKS_DEQUEUE */
diff --git a/examples/netstacklat.h b/examples/netstacklat.h
new file mode 100644
index 00000000..019d0fef
--- /dev/null
+++ b/examples/netstacklat.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef NETSTACKLAT_H
+#define NETSTACKLAT_H
+
+/* To reduce the number of Prometheus metric buckets, reduce/scale the
+ * latency time resolution. This LATENCY_SCALE is connected to the YAML
+ * bucket_multiplier config.
+ */
+#define LATENCY_SCALE 1000UL
+
+#define HIST_MAX_LATENCY_SLOT 24 // 2^24 usec (ns scaled by 1000) -> ~16.7s
+/*
+ * HIST_MAX_LATENCY_SLOT + 1 buckets for the histogram, + 1 "bucket" for the
+ * "sum key" (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys)
+ * that ebpf_exporter expects for exp2 hists (see how it's used in the
+ * increment_exp2_histogram_nosync() function)
+ */
+#define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2)
+
+#define NS_PER_S 1000000000
+
+// The highest possible PID on a Linux system (from /include/linux/threads.h)
+#define PID_MAX_LIMIT (4 * 1024 * 1024)
+// The highest ifindex we expect to encounter
+#define IFINDEX_MAX 16384
+// The maximum number of different cgroups we can filter for
+#define MAX_PARSED_CGROUPS 4096
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+#endif
+
+#ifndef max
+#define max(a, b)			\
+	({				\
+		typeof(a) _a = (a);	\
+		typeof(b) _b = (b);	\
+		_a > _b ? _a : _b;	\
+	})
+#endif
+
+#ifndef min
+#define min(a, b)			\
+	({				\
+		typeof(a) _a = (a);	\
+		typeof(b) _b = (b);	\
+		_a < _b ? _a : _b;	\
+	})
+#endif
+
+enum netstacklat_hook {
+	NETSTACKLAT_HOOK_INVALID = 0,
+	NETSTACKLAT_HOOK_IP_RCV,
+	NETSTACKLAT_HOOK_TCP_START,
+	NETSTACKLAT_HOOK_UDP_START,
+	NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED,
+	NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED,
+	NETSTACKLAT_HOOK_TCP_SOCK_READ,
+	NETSTACKLAT_HOOK_UDP_SOCK_READ,
+	NETSTACKLAT_N_HOOKS,
+};
+
+/* Toggling user_config.groupby_ifindex requires modifying hist_key (via this
+ * define) and the YAML config.
+ */
+/* #define CONFIG_GROUPBY_IFINDEX 1 */
+#undef CONFIG_GROUPBY_IFINDEX
+
+/*
+ * Key used for the histogram map.
+ * To be compatible with ebpf-exporter, all histograms need a key struct whose
+ * final member is named "bucket" and holds the histogram bucket index.
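+ *
+ * With CONFIG_GROUPBY_IFINDEX undefined, the packed key is 12 bytes:
+ * cgroup (8) + hook (2) + bucket (2), matching the label sizes declared in
+ * netstacklat.yaml.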
+ */
+struct hist_key {
+	__u64 cgroup;
+#ifdef CONFIG_GROUPBY_IFINDEX
+	__u32 ifindex;
+#endif
+	__u16 hook; // needs a well-defined size for ebpf-exporter to decode
+	__u16 bucket; // needs to be last to be compatible with ebpf-exporter
+} __attribute__((packed));
+
+struct netstacklat_bpf_config {
+	__u32 network_ns;
+	__u32 filter_min_sockqueue_len;
+	__u64 filter_nth_packet;
+	bool filter_pid;
+	bool filter_ifindex;
+	bool filter_cgroup;
+	bool groupby_ifindex;
+	bool groupby_cgroup;
+	bool include_hol_blocked;
+};
+
+#endif
diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml
new file mode 100644
index 00000000..45fe845f
--- /dev/null
+++ b/examples/netstacklat.yaml
@@ -0,0 +1,69 @@
+metrics:
+  histograms:
+    - name: netstack_latency_seconds
+      help: Latency for packets (skbs) to reach various points in the kernel network stack
+      bucket_type: exp2
+      bucket_min: 0
+      bucket_max: 24
+      bucket_multiplier: 0.000001 # microseconds to seconds (see example at end of file)
+      labels:
+        - name: cgroup
+          size: 8
+          decoders:
+            - name: uint
+            - name: cgroup
+# See: CONFIG_GROUPBY_IFINDEX
+#        - name: iface
+#          size: 4
+#          decoders:
+#            # If including output from a different network namespace than
+#            # ebpf-exporter, you probably just want to decode as a uint
+#            # (ifindex) instead.
+#            # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others
+#            - name: ifname
+        - name: hook
+          size: 2
+          decoders:
+            - name: uint
+            - name: static_map
+              static_map:
+                1: "ip-start"
+                2: "tcp-start"
+                3: "udp-start"
+                4: "tcp-socket-enqueued"
+                5: "udp-socket-enqueued"
+                6: "tcp-socket-read"
+                7: "udp-socket-read"
+        - name: bucket
+          size: 2
+          decoders:
+            - name: uint
+  counters:
+    - name: netstacklat_errors_total
+      help: Counter for bpf_core_read errors in code (can be disabled in code)
+      labels:
+        - name: type
+          size: 4
+          decoders:
+            - name: uint
+            - name: static_map
+              static_map:
+                0: unknown
+                1: err_sk_storage
+                2: err_read_tcp_rcv_wup
+                3: err_read_tcp_rcv_wnd
+                4: err_read_tcp_rcv_nxt
+                5: err_read_tcp_last_skb_cb
+                6: err_read_tcp_cp_seq
+                7: err_read_tcp_rcv_ooopack
+
+# Remember to update #define N_CGROUPS in code when adding more matches
+cgroup_id_map:
+  name: netstack_cgroupfilter
+  type: hash
+  regexps:
+    - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$
+    - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$
+#    - ^(/sys/fs/cgroup/production.slice/.*/pingora-backend-router.service).*$
+#    - ^(/sys/fs/cgroup/production.slice/.*/pingora-origin.service).*$
+#    - ^.*(system.slice/.*)$
+#    - ^.*(user.slice/.*)$
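+
+# Example for bucket_multiplier: the BPF side scales latency from nanoseconds
+# to microseconds (LATENCY_SCALE 1000), so bucket index 10 covers latencies up
+# to 2^10 usec, which the 0.000001 multiplier exports as ~0.001024 seconds.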