<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Cilium Archives - Linuxcent</title>
	<atom:link href="https://linuxcent.com/tag/cilium/feed/" rel="self" type="application/rss+xml" />
	<link>https://linuxcent.com/tag/cilium/</link>
	<description>Infrastructure security, from the kernel up.</description>
	<lastBuildDate>Wed, 13 May 2026 15:36:53 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>
	hourly	</sy:updatePeriod>
	<sy:updateFrequency>
	1	</sy:updateFrequency>
	<generator>https://wordpress.org/?v=7.0</generator>

<image>
	<url>https://linuxcent.com/wp-content/uploads/2026/04/favicon-512x512-1-150x150.png</url>
	<title>Cilium Archives - Linuxcent</title>
	<link>https://linuxcent.com/tag/cilium/</link>
	<width>32</width>
	<height>32</height>
</image> 
<site xmlns="com-wordpress:feed-additions:1">211632295</site>	<item>
		<title>LSM and Tetragon — When the Kernel Says No</title>
		<link>https://linuxcent.com/ebpf-lsm-tetragon-runtime-security/</link>
					<comments>https://linuxcent.com/ebpf-lsm-tetragon-runtime-security/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Fri, 12 Jun 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[linux-security]]></category>
		<category><![CDATA[LSM]]></category>
		<category><![CDATA[Runtime Security]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[Tetragon]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1841</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span>LSM hooks with eBPF enforce security policy at the syscall boundary before the operation completes. How Tetragon kills processes from kernel space and why that difference matters.</p>
<p>The post <a href="https://linuxcent.com/ebpf-lsm-tetragon-runtime-security/">LSM and Tetragon — When the Kernel Says No</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel with a left accent bar. position:relative gives the
   absolutely positioned copy button and language badge below a positioning context. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested inside a pre inherits the pre's look instead. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Accent-bar colour is selected by the pre's data-lang attribute, one colour per group of values. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned to the top-right corner, fully transparent until the pre is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* The "copied" class recolours the button's text and border. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: pinned to the top-left corner, shown on hover, never intercepts pointer events. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: collapsed borders, full width, shaded header cells and zebra-striped even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
(function(){
  // Adds a language badge and a "Copy" button to every pre element on the page.
  // The window-level flag keeps the enhancement from running twice when this
  // script is embedded more than once in the same document.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;

  // Returns the text to copy for one pre element.
  // With a code child, its text is used as-is. Without one, the pre's own content
  // is used, minus the button and badge this script injects, so their labels
  // ("Copy", the language name) never end up on the clipboard.
  function sourceText(pre,code){
    if(code)return code.innerText;
    var copy=pre.cloneNode(true);
    copy.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){
      el.parentNode.removeChild(el);
    });
    return copy.textContent;
  }

  // Decorates each pre: sets data-lang and inserts a badge when the code child
  // carries a language-* class, then appends a copy button wired to the clipboard.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';
        badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=sourceText(pre,code);
        // Prefer the async Clipboard API; fall back to execCommand when it is
        // unavailable (no API or insecure context) or when the write is rejected.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }

  // Success feedback: show "Copied!" for two seconds, then restore the label.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}

  // Fallback copy through a temporary off-screen textarea and execCommand('copy').
  // execCommand reports failure by returning false rather than throwing, so a
  // false result is converted into the failure path instead of showing "Copied!".
  function fb(text,btn){
    var ta=null;
    try{
      ta=document.createElement('textarea');
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      if(!document.execCommand('copy'))throw new Error('execCommand copy rejected');
      ok(btn);
    }catch(e){
      btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);
    }finally{
      // Always remove the helper textarea, even when the copy attempt threw.
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
  }

  // Run immediately if the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 12</em><br />
<a href="/what-is-ebpf/">What Is eBPF?</a> · <a href="/ebpf-verifier-safety/">The BPF Verifier</a> · <a href="/ebpf-vs-kernel-modules/">eBPF vs Kernel Modules</a> · <a href="/ebpf-program-types/">eBPF Program Types</a> · <a href="/ebpf-maps-persistent-data/">eBPF Maps</a> · <a href="/co-re-libbpf-write-once/">CO-RE and libbpf</a> · <a href="/xdp-network-fast-path/">XDP</a> · <a href="/tc-ebpf-pod-network-policy/">TC eBPF</a> · <a href="/bpftrace-kernel-observability/">bpftrace</a> · <a href="/network-flow-observability-ebpf/">Network Flow Observability</a> · <a href="/dns-kernel-observability/">DNS Observability</a> · <strong>LSM and Tetragon</strong></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img fetchpriority="high" decoding="async" width="2400" height="1578" src="https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2.png" alt="LSM BPF and Tetragon — kernel security enforcement architecture showing syscall interception and policy evaluation" class="wp-image-2121" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2.png 2400w, https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2-300x197.png 300w, https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2-1024x673.png 1024w, https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2-768x505.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2-1536x1010.png 1536w, https://linuxcent.com/wp-content/uploads/2026/05/ep12-lsm-tetragon-og-2-2048x1347.png 2048w" sizes="(max-width: 2400px) 100vw, 2400px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">LSM BPF hooks fire before every sensitive syscall — Tetragon uses them to enforce and kill, not just observe.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Tetragon pairs Linux Security Module (LSM) hooks with eBPF programs — enforcement happens at the syscall boundary, before the operation completes, with no detect-and-respond window<br />
  <em>(LSM hook = Linux Security Module hook: a callback point built into the kernel that fires before a security-relevant operation completes, allowing the security module to approve or reject it)</em></li>
<li>Falco and similar sidecar-based tools detect after the fact — the syscall returns, the file is written, the connection is established, the alert fires; with LSM, the syscall never returns success</li>
<li><code class="" data-line="">BPF_PROG_TYPE_LSM</code> is the eBPF program type that attaches to LSM hooks — introduced in kernel 5.7, stable in 5.10+; available on all current Ubuntu LTS, Fedora, and EKS/GKE nodes</li>
<li>Tetragon attaches eBPF programs to LSM hooks and kprobes simultaneously — observing and enforcing from the same kernel attachment point</li>
<li>Tetragon&#8217;s enforcement sends <code class="" data-line="">SIGKILL</code> from within the kernel context — not from a userspace agent reading an audit log and then killing the process</li>
<li>Production caution: LSM enforce mode without thorough policy testing in audit mode first will kill legitimate workloads; always audit before enforce</li>
</ul>
<hr />
<p>EP11 showed how to observe DNS queries at the kernel level — seeing what a workload resolves before it establishes a connection. But observation is passive. It tells you what happened. Tetragon&#8217;s LSM-based eBPF enforcement changes the question entirely: instead of watching the workload, the kernel refuses the operation. This episode covers how that enforcement layer works and why the difference between &#8220;detect&#8221; and &#8220;prevent&#8221; matters in runtime security.</p>
<h2 id="quick-check-is-your-cluster-running-lsm-based-enforcement">Quick Check: Is Your Cluster Running LSM-Based Enforcement?</h2>
<pre><code class="" data-line=""># On any cluster node — what security modules are active?
cat /sys/kernel/security/lsm

# Expected output on a modern kernel:
# lockdown,capability,landlock,yama,apparmor,bpf
#                                              ^^^
#                            &quot;bpf&quot; here means BPF LSM is enabled
</code></pre>
<pre><code class="" data-line=""># Is Tetragon running on this cluster?
kubectl get pods -n kube-system -l app.kubernetes.io/name=tetragon

# If Tetragon is present, check what TracingPolicies are enforcing:
kubectl get tracingpolicies -A

# Sample output:
# NAMESPACE    NAME                      AGE
# kube-system  block-privileged-exec     3d
# kube-system  restrict-sensitive-paths  3d
</code></pre>
<pre><code class="" data-line=""># See what eBPF programs Tetragon has loaded
bpftool prog list | grep -i tetragon

# Output sample:
# 89: lsm  name tetragon_lsm_bprm  tag 8f2a1c3e4d5b7a9f  gpl
#     loaded_at 2026-04-22T09:13:45+0530  uid 0
#     xlated 3312B  jited 2184B  memlock 8192B
# 91: kprobe  name tetragon_kp_exec tag 3c1d8e2f7a4b5c9d  gpl
</code></pre>
<p><code class="" data-line="">lsm</code> program type confirms LSM hook attachment. If you see <code class="" data-line="">tetragon_lsm_*</code> entries, Tetragon is enforcing at the kernel level on this node.</p>
<blockquote>
<p><strong>Not running Tetragon?</strong> Check if your cluster uses AppArmor or seccomp profiles instead — <code class="" data-line="">kubectl get pod &lt;name&gt; -o jsonpath=&#039;{.metadata.annotations}&#039;</code> and look for <code class="" data-line="">seccomp.security.alpha.kubernetes.io</code> or <code class="" data-line="">container.apparmor.security.beta.kubernetes.io</code> annotations. These are userspace-applied profiles that the kernel enforces. Tetragon is additive — it can run alongside AppArmor/seccomp and provides per-process, dynamic policy that static profiles cannot.</p>
</blockquote>
<hr />
<p>Falco fired at 03:14 AM. The alert: a process inside a production container had opened <code class="" data-line="">/etc/passwd</code> for writing. By the time I was on the call, the container had been restarted by a health check failure — the compromised process had already exited. The file had already been modified. Falco had detected the open, emitted the alert, and by the time any automated response could have acted, the syscall had returned, the write had completed, and the file was changed.</p>
<p>Falco did exactly what it&#8217;s designed to do: observe and alert. The gap isn&#8217;t in Falco — it&#8217;s in the architecture. When a tool detects from userspace by reading kernel audit events, there is always a window between the operation completing and the alert firing. For a fast exploit, that window is the entire attack.</p>
<p>I added a Tetragon TracingPolicy the following week:</p>
<pre><code class="" data-line="">spec:
  kprobes:
    - call: &quot;security_inode_permission&quot;
      syscall: false
      return: false
      args:
        - index: 0
          type: &quot;inode&quot;
      selectors:
        - matchArgs:
            - index: 0
              operator: &quot;Prefix&quot;
              values: [&quot;/etc/passwd&quot;, &quot;/etc/shadow&quot;]
          matchActions:
            - action: Sigkill
</code></pre>
<p>Next time a process tries to open <code class="" data-line="">/etc/passwd</code> for writing in a container covered by that policy, the kernel sends <code class="" data-line="">SIGKILL</code> from within the LSM hook. The open never completes. There is no window.</p>
<hr />
<h2 id="how-lsm-hooks-are-placed-in-the-kernel">How LSM Hooks Are Placed in the Kernel</h2>
<p>Linux Security Modules (LSM) is a framework built into the Linux kernel that inserts hook points before security-sensitive operations. The hook fires before the operation is allowed to complete — the LSM module can return an error code that causes the kernel to reject the operation and return <code class="" data-line="">-EPERM</code> to the calling process.</p>
<pre><code class="" data-line="">Process calls open(&quot;/etc/passwd&quot;, O_WRONLY)
      ↓
VFS (Virtual Filesystem) layer receives the request
      ↓
VFS calls security_inode_permission()   ← LSM hook fires here
      ↓
LSM module checks policy
      ↓
      ├── ALLOW → open() proceeds, file descriptor returned
      └── DENY  → open() returns -EPERM, process gets &quot;Operation not permitted&quot;
                  File is never touched
</code></pre>
<blockquote>
<p><strong><code class="" data-line="">LSM hook</code></strong> — a callback point embedded in Linux kernel source at every security-sensitive operation: file open, execute, socket connect, capability check, mount, ptrace, and more. The kernel calls registered LSM modules at each hook. Before BPF LSM (kernel 5.7), only statically compiled security modules (SELinux, AppArmor, Smack, TOMOYO) could register at these hooks.</p>
<p><strong><code class="" data-line="">BPF_PROG_TYPE_LSM</code></strong> — the eBPF program type that attaches to LSM hooks. Introduced in kernel 5.7. Requires BPF LSM to be enabled in the kernel (<code class="" data-line="">lsm=bpf</code> in kernel command line, or present alongside other LSMs). When this program type is loaded and attached to an LSM hook, the eBPF program runs at the hook point and returns 0 (allow) or a negative error code (deny).</p>
</blockquote>
<p>The full list of LSM hooks:</p>
<pre><code class="" data-line=""># All LSM hook points available for eBPF attachment
bpftool btf dump file /sys/kernel/btf/vmlinux | grep -o &quot;bpf_lsm_[a-z0-9_]*&quot; | sort -u | head -20

# Or browse the kernel source list:
# include/linux/security.h — every security_*() function is an LSM hook point
</code></pre>
<p>There are 200+ LSM hook points. The most operationally relevant for container security:</p>
<table>
<thead>
<tr>
<th>LSM Hook</th>
<th>What it guards</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">security_bprm_check</code></td>
<td>Process execution (execve)</td>
</tr>
<tr>
<td><code class="" data-line="">security_inode_permission</code></td>
<td>File read/write/execute</td>
</tr>
<tr>
<td><code class="" data-line="">security_inode_create</code></td>
<td>File creation</td>
</tr>
<tr>
<td><code class="" data-line="">security_socket_connect</code></td>
<td>Outbound TCP/UDP connect</td>
</tr>
<tr>
<td><code class="" data-line="">security_socket_bind</code></td>
<td>Port binding</td>
</tr>
<tr>
<td><code class="" data-line="">security_ptrace_access_check</code></td>
<td>ptrace (debugger attach)</td>
</tr>
<tr>
<td><code class="" data-line="">security_capable</code></td>
<td>Capability checks (CAP_SYS_ADMIN etc.)</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="how-tetragon-combines-lsm-and-kprobe">How Tetragon Combines LSM and kprobe</h2>
<p>Tetragon attaches two types of programs simultaneously for comprehensive runtime security:</p>
<pre><code class="" data-line="">kprobe programs          LSM programs
(observation layer)      (enforcement layer)
       │                        │
       ↓                        ↓
Process executes              Kernel LSM hook fires
kernel function               BEFORE operation completes
       │                        │
       ↓                        ↓
Tetragon reads context:       Tetragon checks TracingPolicy:
  - process name                - selectors match?
  - PID, UID                    - action = Sigkill?
  - namespace, pod name         │
  - parent process              ↓
  - capabilities                SIGKILL sent from kernel context
       │                        Process terminated
       ↓                        Operation never completes
Tetragon exports event
  to userspace observer
</code></pre>
<p>The kprobe side provides the rich context (pod name, namespace, process tree) because it has access to Kubernetes metadata that Tetragon&#8217;s userspace component has pre-populated into maps. The LSM side provides the enforcement capability. Together, they give you context-aware kernel enforcement.</p>
<blockquote>
<p><strong><code class="" data-line="">SIGKILL</code> from kernel vs userspace kill</strong> — When a userspace process runs <code class="" data-line="">kill -9 &lt;pid&gt;</code>, it issues a kill syscall, the kernel schedules the signal delivery, and the target process dies on its next scheduler timeslice. There is a measurable delay — and more importantly, the target process may run for several more instructions before the signal is delivered. When a BPF LSM program returns a negative error code, the kernel rejects the operation on the spot; when it calls <code class="" data-line="">bpf_send_signal(SIGKILL)</code> from within the hook, the signal is raised on the calling task from inside the kernel&#8217;s execution context, with no userspace round trip. The process is killed before it returns to user space and never acts on the result of the problematic syscall. This is not a speed difference — it is a structural difference in when the enforcement happens relative to the operation.</p>
</blockquote>
<hr />
<h2 id="writing-a-tetragon-tracingpolicy-for-enforcement">Writing a Tetragon TracingPolicy for Enforcement</h2>
<p>Tetragon policies are Kubernetes custom resources. Here&#8217;s a policy that prevents any container from executing shells:</p>
<pre><code class="" data-line="">apiVersion: cilium.io/v1alpha1
kind: TracingPolicy
metadata:
  name: block-shell-exec
spec:
  kprobes:
    - call: &quot;security_bprm_check&quot;
      syscall: false
      args:
        - index: 0
          type: &quot;linux_binprm&quot;
      selectors:
        - matchBinaries:
            - operator: &quot;In&quot;
              values:
                - &quot;/bin/sh&quot;
                - &quot;/bin/bash&quot;
                - &quot;/bin/dash&quot;
                - &quot;/usr/bin/sh&quot;
                - &quot;/usr/bin/bash&quot;
          matchNamespaces:
            - namespace: Pid
              operator: &quot;NotIn&quot;
              values: [&quot;host_ns&quot;]   # exclude processes in the host PID namespace
          matchActions:
            - action: Override
              argError: -1       # -EPERM returned to the caller
            - action: Sigkill
</code></pre>
<p>Apply and verify:</p>
<pre><code class="" data-line="">kubectl apply -f block-shell-exec.yaml

# Confirm it&#039;s active
kubectl get tracingpolicies
# NAME               ENABLED   REASON   AGE
# block-shell-exec   true               5s

# Verify Tetragon loaded the eBPF program for this policy
bpftool prog list | grep bprm
# 94: lsm  name tetragon_lsm_bprm  tag 8f2a1c3e4d5b7a9f  gpl
#     loaded_at 2026-04-22T14:22:13+0530  uid 0
</code></pre>
<p>Test it (in a non-production namespace):</p>
<pre><code class="" data-line="">kubectl exec -it test-pod -- /bin/sh

# Expected output:
# OCI runtime exec failed: exec failed: unable to start container process:
# error during container init: error starting executable [&quot;/bin/sh&quot;]:
# container_linux.go: ... starting container process caused: process_linux.go:
# ... SIGKILL
</code></pre>
<p>The shell never started. The <code class="" data-line="">security_bprm_check</code> LSM hook fired, and the Tetragon eBPF program evaluated the policy and sent <code class="" data-line="">SIGKILL</code> from kernel space. The exec system call returned <code class="" data-line="">-EPERM</code> to the container runtime. No shell process was created.</p>
<hr />
<h2 id="audit-mode-before-enforce-mode">Audit Mode Before Enforce Mode</h2>
<p>Running a new LSM policy in enforce mode without prior testing will kill legitimate workloads. Tetragon supports audit mode for every policy:</p>
<pre><code class="" data-line="">          matchActions:
            - action: Post     # audit mode: log event, do NOT kill
</code></pre>
<p><code class="" data-line="">Post</code> emits a Tetragon event that you can observe:</p>
<pre><code class="" data-line=""># Watch audit events for the policy (before switching to Sigkill)
kubectl exec -n kube-system -it \
  $(kubectl get pod -n kube-system -l app.kubernetes.io/name=tetragon -o name | head -1) \
  -- tetra getevents --event-types PROCESS_KPROBE | grep bprm
</code></pre>
<p>Sample audit event:</p>
<pre><code class="" data-line="">{
  &quot;process_kprobe&quot;: {
    &quot;process&quot;: {
      &quot;pod&quot;: {&quot;name&quot;: &quot;my-app-6d4f9-xk2p1&quot;, &quot;namespace&quot;: &quot;production&quot;},
      &quot;binary&quot;: &quot;/bin/sh&quot;,
      &quot;pid&quot;: 18293
    },
    &quot;function_name&quot;: &quot;security_bprm_check&quot;,
    &quot;action&quot;: &quot;KPROBE_ACTION_POST&quot;
  }
}
</code></pre>
<p>If <code class="" data-line="">my-app</code> legitimately needs <code class="" data-line="">/bin/sh</code> for its health check script, you&#8217;ll see it here before you kill it. Refine the selector (add <code class="" data-line="">matchLabels</code> to exclude that specific deployment, or add the binary to an allowlist) and then switch to <code class="" data-line="">Sigkill</code>.</p>
<hr />
<h2 id="production-gotchas"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Production Gotchas</h2>
<p><strong>Enforce mode kills anything the selector matches — including health checks and init containers.</strong> Most production containers have some shell usage: liveness probes that run <code class="" data-line="">sh -c</code>, init containers that <code class="" data-line="">chmod</code> files, entrypoint wrappers. Run in <code class="" data-line="">Post</code> (audit) mode for at least 48 hours across a representative workload set before switching to <code class="" data-line="">Sigkill</code>. Track all matched events and understand every process in the trace before enforcing.</p>
<p><strong>LSM hooks fire in kernel context — eBPF program complexity is limited.</strong> The verifier enforces strict limits on LSM programs because they run synchronously in the kernel&#8217;s hot path. Policies with many conditions or complex map lookups may be rejected by the verifier. Tetragon&#8217;s policy engine compiles your TracingPolicy into eBPF that stays within verifier limits, but very complex <code class="" data-line="">matchArgs</code> chains with many values can hit limits. Test with <code class="" data-line="">kubectl apply</code> and check Tetragon pod logs for verifier rejection messages.</p>
<p><strong><code class="" data-line="">BPF_PROG_TYPE_LSM</code> requires kernel 5.7+ and BPF LSM enabled.</strong> Check <code class="" data-line="">/sys/kernel/security/lsm</code> for <code class="" data-line="">bpf</code> in the list. EKS nodes running Amazon Linux 2 with kernel 5.10+ have BPF LSM available. GKE nodes with kernel 5.10+ on Container-Optimized OS have it enabled. Ubuntu 22.04 (kernel 5.15) has it enabled by default. Ubuntu 20.04 kernels before 5.7 do not — check your actual kernel version.</p>
<p><strong>Policy scope: Tetragon TracingPolicies are cluster-wide by default.</strong> A policy without a <code class="" data-line="">matchNamespaces</code> or <code class="" data-line="">matchLabels</code> selector applies to every pod on every node. Start with namespace-scoped policies during testing. Use <code class="" data-line="">TracingPolicyNamespaced</code> resources (Tetragon 0.10+) to limit scope to a specific namespace.</p>
<p><strong><code class="" data-line="">bpf_send_signal(SIGKILL)</code> vs returning an error code.</strong> Tetragon&#8217;s <code class="" data-line="">Sigkill</code> action uses <code class="" data-line="">bpf_send_signal()</code> rather than returning a negative error from the LSM hook. This means the syscall may return before the signal is delivered — there can be a single instruction window. For critical enforcement paths, combining LSM deny (return <code class="" data-line="">-EPERM</code>) with <code class="" data-line="">bpf_send_signal(SIGKILL)</code> is the belt-and-suspenders approach; Tetragon&#8217;s maintainers have documented which actions use which mechanism.</p>
<hr />
<h2 id="quick-reference">Quick Reference</h2>
<table>
<thead>
<tr>
<th>What you want</th>
<th>Command</th>
</tr>
</thead>
<tbody>
<tr>
<td>Is BPF LSM enabled?</td>
<td><code class="" data-line="">cat /sys/kernel/security/lsm</code> (look for <code class="" data-line="">bpf</code>)</td>
</tr>
<tr>
<td>What LSM programs are loaded?</td>
<td><code class="" data-line="">bpftool prog list | grep lsm</code></td>
</tr>
<tr>
<td>What Tetragon policies exist?</td>
<td><code class="" data-line="">kubectl get tracingpolicies -A</code></td>
</tr>
<tr>
<td>Audit events (before enforce)</td>
<td><code class="" data-line="">tetra getevents --event-types PROCESS_KPROBE</code></td>
</tr>
<tr>
<td>Watch Tetragon enforcement</td>
<td><code class="" data-line="">kubectl logs -n kube-system -l app.kubernetes.io/name=tetragon -f</code></td>
</tr>
<tr>
<td>Test a policy safely</td>
<td>Set <code class="" data-line="">action: Post</code> before <code class="" data-line="">action: Sigkill</code></td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>Tetragon action</th>
<th>Effect</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">Post</code></td>
<td>Log event only — audit mode</td>
</tr>
<tr>
<td><code class="" data-line="">Sigkill</code></td>
<td>Send SIGKILL from kernel context</td>
</tr>
<tr>
<td><code class="" data-line="">Override</code></td>
<td>Return custom error code to syscall caller</td>
</tr>
<tr>
<td><code class="" data-line="">FollowFD</code></td>
<td>Track file descriptor for future hook correlation</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>LSM hook</th>
<th>Protects</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">security_bprm_check</code></td>
<td>exec (block shell spawning)</td>
</tr>
<tr>
<td><code class="" data-line="">security_inode_permission</code></td>
<td>file access (block reads/writes to sensitive paths)</td>
</tr>
<tr>
<td><code class="" data-line="">security_socket_connect</code></td>
<td>outbound connections (block C2 connections)</td>
</tr>
<tr>
<td><code class="" data-line="">security_capable</code></td>
<td>capability escalation (block CAP_SYS_ADMIN attempts)</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Tetragon&#8217;s LSM-based eBPF enforcement acts at the syscall boundary — the operation either never completes or returns an error before the kernel performs the action, with no detect-and-respond window</li>
<li>Falco, Datadog, and sidecar-based tools detect events after the syscall returns; this is architectural, not a product limitation — they operate at a layer where the operation has already occurred</li>
<li><code class="" data-line="">BPF_PROG_TYPE_LSM</code> attaches eBPF programs directly to Linux Security Module hooks; available on kernel 5.7+, enabled on all current EKS/GKE LTS node images</li>
<li>Tetragon sends <code class="" data-line="">SIGKILL</code> from kernel context using <code class="" data-line="">bpf_send_signal()</code> — not from a userspace agent polling an audit log</li>
<li>Always run Tetragon policies in <code class="" data-line="">Post</code> (audit) mode for 48+ hours before switching to <code class="" data-line="">Sigkill</code> — legitimate workloads trigger many of the same LSM hooks that attacks use</li>
<li>The combination of kprobe (rich context: pod name, namespace, process tree) and LSM (enforcement) gives Tetragon context-aware kernel enforcement that static profiles (AppArmor, seccomp) cannot provide dynamically</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>LSM hooks prevent operations in the moment. But after an incident — when enforcement failed, or when you&#8217;re doing post-hoc forensics — the question changes: what did this process spawn, what files did it touch, what connections did it make, and in what order? Answering that from logs alone is guesswork. Answering it from kernel-level process lineage is reconstruction.</p>
<p>EP13 covers how eBPF kprobe hooks on <code class="" data-line="">fork</code> and <code class="" data-line="">exec</code> build a complete, tamper-resistant process tree. Even after the attacker&#8217;s process has exited, the record remains — in kernel maps, exported to a persistent store, tied to the pod that ran it.</p>
<p><em>Next: <a href="/process-lineage-ebpf/">process lineage with eBPF — reconstructing what happened after the fact</a></em></p>
<p>Get EP13 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&amp;linkname=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" title="Copy Link" 
rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-lsm-tetragon-runtime-security%2F&#038;title=LSM%20and%20Tetragon%20%E2%80%94%20When%20the%20Kernel%20Says%20No" data-a2a-url="https://linuxcent.com/ebpf-lsm-tetragon-runtime-security/" data-a2a-title="LSM and Tetragon — When the Kernel Says No"></a></p><p>The post <a href="https://linuxcent.com/ebpf-lsm-tetragon-runtime-security/">LSM and Tetragon — When the Kernel Says No</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-lsm-tetragon-runtime-security/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1841</post-id>	</item>
		<item>
		<title>Network Flow Observability — What Every Connection Reveals</title>
		<link>https://linuxcent.com/ebpf-network-flow-observability/</link>
					<comments>https://linuxcent.com/ebpf-network-flow-observability/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Fri, 29 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[Flow Telemetry]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Network Observability]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[TC eBPF]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1838</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span>See every TCP connection, retransmit, and dropped packet across your cluster using eBPF TC hooks — the kernel-level flow telemetry that APM tools interpret, not originate.</p>
<p>The post <a href="https://linuxcent.com/ebpf-network-flow-observability/">Network Flow Observability — What Every Connection Reveals</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark theme with horizontal scrolling. position:relative makes the
   block the positioning context for the absolutely positioned copy button and
   language badge defined further down. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested in a pre drops the chip and inherits the dark theme. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent colour keyed on the pre element's data-lang attribute:
   shells, data/markup formats, programming languages, plain output. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned to the top-right corner, invisible (opacity 0) until the block is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* Applied while the button is in its "copied" state. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left label, also revealed on hover; pointer-events:none keeps it from intercepting clicks. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: collapsed borders, shaded header row, zebra striping on even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
/*
 * Code-block enhancer. For every pre element on the page it:
 *   - reads a "language-xxx" class from the nested code element (if any),
 *     stores it in the pre's data-lang attribute and shows it as a badge;
 *   - appends a "Copy" button that copies the block's text to the clipboard.
 */
(function(){
  // Guard so the enhancer runs only once even if this script is included more than once.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Selector for the elements this script injects into a pre element.
  var INJECTED='.lc-copy-btn,.lc-lang-badge';
  /**
   * Returns the text to copy for one block.
   * @param {Element} pre  the pre element being copied
   * @param {Element|null} code  its nested code element, or null
   * @returns {string} the snippet text without the injected button/badge labels
   */
  function snippetText(pre,code){
    if(code)return code.innerText;
    // No code child: the button (and any badge) live inside the pre itself, so
    // reading the pre directly would copy the word "Copy" along with the snippet.
    // Work on a detached clone with the injected elements removed.
    var clone=pre.cloneNode(true);
    clone.querySelectorAll(INJECTED).forEach(function(el){el.parentNode.removeChild(el);});
    return clone.textContent;
  }
  /** Decorates every pre element currently in the document. */
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';
        badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type: a bare button defaults to "submit" and would submit an enclosing form.
      btn.type='button';
      btn.className='lc-copy-btn';
      btn.textContent='Copy';
      btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=snippetText(pre,code);
        // Prefer the async Clipboard API when available in a secure context;
        // otherwise (or if it rejects) use the execCommand fallback.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  /** Shows the "Copied!" state on the button for two seconds. */
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  /** Shows the failure label on the button for two seconds. */
  function failed(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  /**
   * Fallback copy path: selects the text in an off-screen textarea and issues
   * the legacy "copy" command.
   * @param {string} text  text to place on the clipboard
   * @param {Element} btn  button that receives the success/failure label
   */
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      // execCommand signals failure by returning false, not only by throwing.
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      // Always remove the helper textarea, including when an exception was thrown.
      if(ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{failed(btn);}
  }
  // Run immediately if the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 10</em><br />
<a href="/what-is-ebpf/">What Is eBPF?</a> · <a href="/ebpf-verifier-safety/">The BPF Verifier</a> · <a href="/ebpf-vs-kernel-modules/">eBPF vs Kernel Modules</a> · <a href="/ebpf-program-types/">eBPF Program Types</a> · <a href="/ebpf-maps-persistent-data/">eBPF Maps</a> · <a href="/co-re-libbpf-write-once/">CO-RE and libbpf</a> · <a href="/xdp-network-fast-path/">XDP</a> · <a href="/tc-ebpf-pod-network-policy/">TC eBPF</a> · <a href="/bpftrace-kernel-observability/">bpftrace</a> · <strong>Network Flow Observability</strong> · <a href="/dns-kernel-observability/">DNS Observability</a></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img decoding="async" width="605" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-scaled.png" alt="eBPF Network Flow Observability — Hubble and Cilium architecture for zero-instrumentation flow monitoring" class="wp-image-2119" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-scaled.png 605w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-71x300.png 71w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-242x1024.png 242w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-768x3249.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-363x1536.png 363w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-484x2048.png 484w" sizes="(max-width: 605px) 100vw, 605px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">Hubble captures every packet decision at the eBPF layer — no sidecar, no app changes, no sampling.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Network flow observability with eBPF attaches persistent programs to TC hooks and records every connection attempt, retransmit, reset, and drop — continuously, with no sampling<br />
  <em>(TC hook = Traffic Control hook: the point in the Linux network stack where eBPF programs intercept packets after ingress or before egress, tied to a specific network interface)</em></li>
<li>APM tools and service mesh telemetry are interpretations of what happened; kernel-level flow data from TC hooks is the raw event stream they all derive from</li>
<li>Retransmit counters at the kernel level reveal congestion, half-open connections, and remote endpoint failures that application logs never surface</li>
<li>Cilium&#8217;s Hubble and similar tools (Pixie, Retina) are eBPF flow exporters — they run TC programs, collect <code class="" data-line="">perf_event</code> or <code class="" data-line="">ringbuf</code> events, and expose them over an API</li>
<li>You can verify what flow data a tool is actually collecting with four <code class="" data-line="">bpftool</code> commands — without reading documentation</li>
<li>Production caution: flow maps grow with the number of active connections; pin and bound your maps, and account for the per-packet overhead on high-throughput interfaces</li>
</ul>
<hr />
<p>EP09 showed bpftrace as an on-demand kernel query tool — compile a question, get an answer, clean up. Network flow observability with eBPF is the persistent version: programs that stay attached to TC hooks across your entire fleet, recording every connection without waiting for you to ask. When a client reports intermittent failures that appear nowhere in application logs, that persistent record is what you query. This episode covers how that layer works and how to read it.</p>
<h2 id="quick-check-what-flow-data-is-your-cluster-already-collecting">Quick Check: What Flow Data Is Your Cluster Already Collecting?</h2>
<p>Before building anything new, check what&#8217;s already running. If you have Cilium, Pixie, or Retina on your cluster, eBPF flow programs are already attached:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:

# What TC programs are attached to cluster interfaces?
bpftool net list

# Expected output on a Cilium node:
# xdp:
#
# tc:
# eth0(2) clsact/ingress prog_id 38 prio 1 handle 0x1 direct-action
# eth0(2) clsact/egress  prog_id 39 prio 1 handle 0x1 direct-action
# lxc12a3(15) clsact/ingress prog_id 41 prio 1 handle 0x1 direct-action
# lxc12a3(15) clsact/egress  prog_id 42 prio 1 handle 0x1 direct-action
</code></pre>
<pre><code class="" data-line=""># What maps are those programs holding state in?
bpftool map list | grep -A1 -E &quot;ct4|ct6|flow|conn|sock|nat&quot;

# Sample output:
# 24: hash  name cilium_ct4_global  flags 0x0
#     key 24B  value 56B  max_entries 65536  memlock 4718592B
# 25: hash  name cilium_ct4_local   flags 0x0
#     key 24B  value 56B  max_entries 8192   memlock 589824B
</code></pre>
<p>Each <code class="" data-line="">lxcXXXX</code> interface is a pod&#8217;s veth pair. The TC programs on those interfaces are what Cilium uses to enforce NetworkPolicy and collect flow telemetry. If you see <code class="" data-line="">prog_id</code> values on pod interfaces, your cluster is already doing kernel-level flow collection.</p>
<blockquote>
<p><strong>Not running Cilium?</strong> On a plain kubeadm or EKS node without a CNI that uses eBPF, <code class="" data-line="">bpftool net list</code> will show no TC programs on pod interfaces — just whatever kube-proxy or the CNI plugin installed. You can still attach your own flow programs with <code class="" data-line="">tc qdisc add dev eth0 clsact</code> — that&#8217;s the starting point this episode covers.</p>
</blockquote>
<hr />
<p>The client opened a ticket on a Tuesday afternoon. &#8220;Intermittent connection failures to the payment gateway. Started around 11 AM. Application logs say timeout. Retry logic is masking it for most users but the error rate is up 0.3%.&#8221;</p>
<p>I looked at the APM dashboard. The service showed elevated latency — p99 at 850ms versus a normal 120ms — but no hard errors at the application layer. The service mesh metrics showed the downstream call succeeding from the mesh&#8217;s perspective. The payment gateway team said their side looked clean.</p>
<p>Three tools. Three different answers. All of them interpreting the network. None of them were the network.</p>
<p>I ran:</p>
<pre><code class="" data-line="">bpftool map dump id 24 | grep -A5 &quot;payment-gateway-ip&quot;
</code></pre>
<p>The connection tracking map showed retransmit count 14 for a specific <code class="" data-line="">(src_ip, dst_ip, src_port, dst_port, protocol)</code> tuple — the same 5-tuple, every 30 seconds, for 2 hours. The kernel was retransmitting. The TCP stack was compensating. The application was seeing sporadic success because retransmits eventually got through. The APM dashboard averaged that latency into a p99 and called it &#8220;elevated.&#8221;</p>
<p>The kernel had the truth. Everything above it was rounding.</p>
<hr />
<h2 id="why-application-level-metrics-miss-what-the-kernel-sees">Why Application-Level Metrics Miss What the Kernel Sees</h2>
<p>Application metrics — APM spans, service mesh telemetry, load balancer health checks — operate at Layer 7. They measure round-trip time for complete requests, error codes returned, bytes transferred. They answer &#8220;did this request succeed?&#8221; not &#8220;what did the network do to make it succeed?&#8221;</p>
<p>The TCP stack underneath those requests handles retransmits, congestion window adjustments, RST packets, and half-open connections silently. From an application&#8217;s perspective, a request that required 3 retransmits before the ACK arrived looks identical to one that succeeded on the first attempt — slightly slower, but successful.</p>
<p>This is structural, not a tooling gap. Application-layer observability tools cannot see below their own protocol boundary. The kernel&#8217;s TCP implementation does not report upward when it retransmits. It just retransmits.</p>
<p>eBPF flow observability closes this gap by attaching programs directly to the network path — at the TC hook, which fires on every packet crossing a network interface — and recording what the kernel actually does.</p>
<hr />
<h2 id="how-tc-hook-flow-programs-work">How TC Hook Flow Programs Work</h2>
<p>EP08 covered TC eBPF programs for pod network policy. Flow observability uses the same attachment point with a different purpose: instead of allowing or dropping packets, the program reads packet metadata and writes it to a map or ring buffer.</p>
<pre><code class="" data-line="">Pod sends packet
      ↓
veth interface (lxcXXXX)
      ↓
TC clsact/ingress hook fires (host side of the veth: traffic the pod sends arrives here as ingress)
      ↓
eBPF program reads:
  - src IP, dst IP
  - src port, dst port
  - protocol
  - packet size
  - TCP flags (SYN, ACK, FIN, RST) and sequence numbers (a repeated sequence number indicates a retransmit)
      ↓
Writes event to ringbuf (or perf_event_array)
      ↓
Userspace consumer reads ringbuf
      ↓
Aggregates to flow record
      ↓
Exports to Hubble/Prometheus/flow store
</code></pre>
<blockquote>
<p><strong><code class="" data-line="">ringbuf</code></strong> — a BPF ring buffer: a lock-free, memory-efficient queue shared between a kernel eBPF program and a userspace consumer. The kernel program writes events; the userspace reader drains them. Used instead of <code class="" data-line="">perf_event_array</code> in kernel 5.8+ because it avoids per-CPU memory waste and supports variable-length records. When you see Hubble exporting flows, it&#8217;s reading from a ringbuf that the TC program writes to.</p>
</blockquote>
<p>The key structural property: the TC hook fires on every packet. Not sampled. Not throttled by default. Every SYN, every ACK, every RST, every retransmit. For flow observability, you typically aggregate at the program level — count packets and bytes per 5-tuple per second, rather than emitting an event per packet — but the raw visibility is there if you need it.</p>
<hr />
<h2 id="what-retransmit-telemetry-actually-reveals">What Retransmit Telemetry Actually Reveals</h2>
<p>Most flow observability implementations track TCP retransmits specifically because they are the clearest signal of network-layer trouble invisible to applications.</p>
<p>A TCP retransmit happens when a sender doesn&#8217;t receive an ACK within the retransmission timeout (RTO). The kernel resends the segment and doubles the timeout (exponential backoff). From the application&#8217;s perspective, the call takes longer. If retransmits keep clearing, the application sees success — just slow success.</p>
<blockquote>
<p><strong><code class="" data-line="">perf_event</code></strong> — a kernel mechanism for collecting performance data. In eBPF, <code class="" data-line="">BPF_MAP_TYPE_PERF_EVENT_ARRAY</code> lets kernel programs push variable-length records to userspace readers via a ring buffer per CPU. Older tools use <code class="" data-line="">perf_event_array</code>; newer ones use <code class="" data-line="">BPF_MAP_TYPE_RINGBUF</code> (single shared ring, more efficient). If you inspect an older version of Cilium&#8217;s flow exporter, you&#8217;ll see <code class="" data-line="">perf_event</code> writes; newer versions use <code class="" data-line="">ringbuf</code>.</p>
</blockquote>
<p>To observe retransmits directly with bpftrace:</p>
<pre><code class="" data-line=""># Count retransmit events per destination IP — run for 60 seconds
bpftrace -e &#039;
kprobe:tcp_retransmit_skb {
    $sk = (struct sock *)arg0;
    $daddr = ntop(AF_INET, $sk-&gt;__sk_common.skc_daddr);
    @retransmits[$daddr] = count();
}
interval:s:60 { print(@retransmits); clear(@retransmits); exit(); }
&#039;
</code></pre>
<p>Sample output:</p>
<pre><code class="" data-line="">Attaching 2 probes...
@retransmits[10.96.0.10]:   2       # DNS service — normal
@retransmits[172.16.4.23]:  847     # payment gateway endpoint ← problem here
@retransmits[10.244.1.5]:   1       # normal pod-to-pod traffic
</code></pre>
<p>847 retransmits to a single endpoint in 60 seconds. That&#8217;s not noise. That&#8217;s a congested or half-open connection being retried 14 times per second by the TCP stack while the application layer averages it into &#8220;elevated latency.&#8221;</p>
<hr />
<h2 id="how-cilium-hubble-collects-flow-data">How Cilium Hubble Collects Flow Data</h2>
<p>Hubble is the flow observability layer built into Cilium. Understanding how it works lets you reason about what it can and cannot see — and verify what it&#8217;s actually collecting.</p>
<p>Hubble&#8217;s architecture:</p>
<pre><code class="" data-line="">Kernel (per node)
├── TC eBPF programs on all pod veth interfaces
│     write flow events → BPF ringbuf
│
└── Hubble node agent (userspace)
      reads ringbuf
      enriches with pod metadata (Kubernetes API)
      exposes gRPC API

Cluster level
└── Hubble Relay
      aggregates per-node gRPC streams
      exposes single cluster-wide API

User tooling
└── hubble observe  /  Hubble UI  /  Prometheus exporter
</code></pre>
<p>The TC programs are writing raw packet events. The Hubble agent is the consumer that translates those events into Kubernetes-aware flow records — adding pod name, namespace, label, and policy verdict on top of the 5-tuple and TCP metadata the kernel provides.</p>
<p>To see what Hubble&#8217;s TC programs have attached:</p>
<pre><code class="" data-line=""># On any Cilium node
bpftool net list | grep lxc

# lxce4a1(23) clsact/ingress prog_id 61  ← Hubble flow program on pod interface ingress
# lxce4a1(23) clsact/egress  prog_id 62  ← Hubble flow program on pod interface egress
# lxcf7b2(31) clsact/ingress prog_id 63
# lxcf7b2(31) clsact/egress  prog_id 64
</code></pre>
<pre><code class="" data-line=""># Inspect one of those programs to confirm it&#039;s reading flow metadata
bpftool prog show id 61

# Output:
# 61: sched_cls  name tail_handle_nat  tag 3a8e2f1b4c7d9e0a  gpl
#     loaded_at 2026-04-22T09:13:45+0530  uid 0
#     xlated 2144B  jited 1382B  memlock 4096B  map_ids 24,31,38
#     btf_id 142
</code></pre>
<p><code class="" data-line="">sched_cls</code> is the BPF program type for TC — confirming these are TC-attached flow programs. <code class="" data-line="">map_ids 24,31,38</code> — those are the maps this program reads from and writes to. You can dump any of them:</p>
<pre><code class="" data-line="">bpftool map dump id 24 | head -40

# Output (connection tracking entry):
# [{
#     &quot;key&quot;: {
#         &quot;saddr&quot;: &quot;10.244.1.5&quot;,        # ← source pod IP
#         &quot;daddr&quot;: &quot;172.16.4.23&quot;,        # ← destination IP
#         &quot;sport&quot;: 48291,                # ← source port
#         &quot;dport&quot;: 443,                  # ← destination port
#         &quot;nexthdr&quot;: 6,                  # ← protocol: TCP
#         &quot;flags&quot;: 3                     # ← CT_EGRESS | CT_ESTABLISHED
#     },
#     &quot;value&quot;: {
#         &quot;rx_packets&quot;: 14832,           # ← packets received
#         &quot;tx_packets&quot;: 14831,           # ← packets sent
#         &quot;rx_bytes&quot;: 3841024,           # ← bytes received
#         &quot;tx_bytes&quot;: 3756288,           # ← bytes sent
#         &quot;lifetime&quot;: 21600,             # ← seconds until entry expires
#         &quot;rx_closing&quot;: 0,
#         &quot;tx_closing&quot;: 0
#     }
# }]
</code></pre>
<p>That&#8217;s the ground truth. Not an APM span. Not a service mesh metric. The actual per-connection counters the kernel is maintaining for that 5-tuple.</p>
<hr />
<h2 id="writing-a-minimal-flow-observer-with-bpftrace">Writing a Minimal Flow Observer with bpftrace</h2>
<p>You don&#8217;t need Cilium or Hubble to get flow telemetry. bpftrace can produce it directly on any node with BTF:</p>
<pre><code class="" data-line=""># Persistent flow table: connections + packet counts for 2 minutes
bpftrace -e &#039;
kprobe:tcp_sendmsg {
    $sk = (struct sock *)arg0;
    $daddr = ntop(AF_INET, $sk-&gt;__sk_common.skc_daddr);
    $dport = bswap($sk-&gt;__sk_common.skc_dport);
    @flows[comm, $daddr, $dport] = count();
}
interval:s:30 { print(@flows); clear(@flows); }
interval:s:120 { exit(); }
&#039;
</code></pre>
<p>Sample output (every 30 seconds):</p>
<pre><code class="" data-line="">@flows[curl, 93.184.216.34, 443]:         12    # curl → example.com:443
@flows[coredns, 10.96.0.10, 53]:          341   # CoreDNS upstream queries
@flows[payment-svc, 172.16.4.23, 443]:   1204   # payment service → gateway
@flows[nginx, 10.244.2.3, 8080]:          89    # nginx → upstream pod
</code></pre>
<p>For retransmit tracking specifically:</p>
<pre><code class="" data-line=""># Combined flow + retransmit watcher — runs until Ctrl-C
bpftrace -e &#039;
kprobe:tcp_retransmit_skb {
    $sk = (struct sock *)arg0;
    $daddr = ntop(AF_INET, $sk-&gt;__sk_common.skc_daddr);
    @retx[comm, $daddr] = count();
}
kprobe:tcp_sendmsg {
    $sk = (struct sock *)arg0;
    $daddr = ntop(AF_INET, $sk-&gt;__sk_common.skc_daddr);
    @sends[comm, $daddr] = count();
}
interval:s:10 {
    printf(&quot;=== Retransmit ratio (last 10s) ===\n&quot;);
    print(@retx);
    print(@sends);
    clear(@retx);
    clear(@sends);
}
&#039;
</code></pre>
<p>This gives you both the volume of sends and the retransmit count side by side — the ratio tells you whether retransmits are a rounding error (0.01%) or a signal (5%+).</p>
<hr />
<h2 id="production-gotchas"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Production Gotchas</h2>
<p><strong>Map size bounds matter.</strong> Connection tracking maps default to tens of thousands of entries. On nodes with high connection churn (serverless, short-lived batch jobs), maps can fill and start dropping new entries silently. Check <code class="" data-line="">bpftool map show id N</code> for <code class="" data-line="">max_entries</code> and monitor map utilization. Cilium exposes this as <code class="" data-line="">cilium_bpf_map_pressure</code> in Prometheus.</p>
<p><strong>Per-packet overhead on high-throughput interfaces.</strong> A TC program that fires on every packet on a 10Gbps interface processes millions of packets per second. Aggregating at the program level (count per 5-tuple rather than emit per packet) keeps overhead manageable — Cilium does this. A naive bpftrace one-liner that emits a perf event per packet will saturate the perf ring buffer under real load. Use <code class="" data-line="">ringbuf</code> write paths or aggregate before emitting.</p>
<p><strong>TC hook placement and direction confusion.</strong> Ingress TC on a pod&#8217;s veth (lxcXXXX) sees egress traffic from the pod&#8217;s perspective — because the host sees the packet arriving on the veth after the pod sent it. This reversal is consistent but confusing when you&#8217;re reading direction labels in flow records. EP08 covered this in detail for policy enforcement; the same asymmetry applies to flow data.</p>
<p><strong>Retransmit counters reset on connection close.</strong> If you&#8217;re tracking retransmit totals for a long-lived connection, the count is stored in the kernel&#8217;s socket state and is cleared when the socket closes. For persistent tracking across reconnects, aggregate at the flow level in userspace before the connection closes.</p>
<p><strong>Hubble flow visibility requires pod interfaces.</strong> Hubble only sees traffic that crosses a pod&#8217;s veth interface. Node-to-node traffic that doesn&#8217;t involve a pod (e.g., node SSH, kubelet-to-API-server on the node IP) is not captured by default. For host-level network observability, you need a TC program on the physical interface (<code class="" data-line="">eth0</code>, <code class="" data-line="">ens3</code>), not just on pod veth pairs.</p>
<hr />
<h2 id="quick-reference">Quick Reference</h2>
<table>
<thead>
<tr>
<th>What you want to see</th>
<th>Command</th>
</tr>
</thead>
<tbody>
<tr>
<td>What TC programs are attached</td>
<td><code class="" data-line="">bpftool net list</code></td>
</tr>
<tr>
<td>Which maps a program uses</td>
<td><code class="" data-line="">bpftool prog show id N</code> (check <code class="" data-line="">map_ids</code>)</td>
</tr>
<tr>
<td>Connection tracking entries</td>
<td><code class="" data-line="">bpftool map dump id N</code></td>
</tr>
<tr>
<td>Retransmits per destination</td>
<td><code class="" data-line="">bpftrace -e &#039;kprobe:tcp_retransmit_skb { ... }&#039;</code></td>
</tr>
<tr>
<td>Flow counts per process</td>
<td><code class="" data-line="">bpftrace -e &#039;kprobe:tcp_sendmsg { @[comm, daddr] = count(); }&#039;</code></td>
</tr>
<tr>
<td>Hubble flow stream (Cilium)</td>
<td><code class="" data-line="">hubble observe --follow</code></td>
</tr>
<tr>
<td>Hubble flows for one pod</td>
<td><code class="" data-line="">hubble observe --pod mynamespace/mypod --follow</code></td>
</tr>
<tr>
<td>Verify map pressure</td>
<td><code class="" data-line="">bpftool map show id N</code> (check <code class="" data-line="">max_entries</code> vs entries)</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>Kernel function</th>
<th>What it marks</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">tcp_sendmsg</code></td>
<td>Data being sent on a TCP socket</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_recvmsg</code></td>
<td>Data being received on a TCP socket</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_retransmit_skb</code></td>
<td>A segment being retransmitted</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_send_reset</code></td>
<td>RST being sent</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_fin</code></td>
<td>FIN received from the peer (remote side closing the connection)</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_connect</code></td>
<td>New outbound TCP connection attempt</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Network flow observability with eBPF attaches TC programs that record every connection event continuously — not sampled, not throttled, not filtered by what the application reports</li>
<li>Retransmit telemetry from <code class="" data-line="">tcp_retransmit_skb</code> reveals congestion and endpoint failures that are structurally invisible to application-layer monitoring tools</li>
<li>Cilium Hubble, Pixie, and Retina are all eBPF flow exporters — they run TC programs, drain a ringbuf, enrich with Kubernetes metadata, and expose the result over an API</li>
<li>You can verify what any flow tool is actually collecting with <code class="" data-line="">bpftool net list</code>, <code class="" data-line="">bpftool map list</code>, <code class="" data-line="">bpftool prog show</code>, and <code class="" data-line="">bpftool map dump</code> — four commands, no documentation needed</li>
<li>Map sizing and per-packet overhead are the two production concerns; aggregate at the kernel level, bound your maps, and monitor map pressure</li>
<li>The kernel&#8217;s connection tracking map is the ground truth. APM dashboards, service mesh metrics, and load balancer health checks are all interpretations of what that map contains</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>Flow observability tells you what connections exist. EP11 goes one level deeper: what names your pods are resolving those connections to. DNS is where a compromised workload first reveals itself — it queries a domain that has no business being queried from a production pod, and if you&#8217;re not watching the kernel-level DNS path, you won&#8217;t see it until after the damage.</p>
<p>DNS observability at the kernel level uses tracepoint hooks on the DNS syscall path — the same ground-truth approach as flow telemetry, but for name resolution: every query, every response, tied to the pod that made it, without deploying a sidecar.</p>
<p><em>Next: <a href="/dns-kernel-observability/">DNS observability at the kernel level — what your pods are actually resolving</a></em></p>
<p>Get EP11 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&#038;title=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" data-a2a-url="https://linuxcent.com/ebpf-network-flow-observability/" data-a2a-title="Network Flow Observability — What Every Connection Reveals"></a></p><p>The post <a href="https://linuxcent.com/ebpf-network-flow-observability/">Network Flow Observability — What Every Connection Reveals</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-network-flow-observability/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1838</post-id>	</item>
		<item>
		<title>TC eBPF — Pod-Level Network Policy Without iptables</title>
		<link>https://linuxcent.com/tc-ebpf-kubernetes-network-policy/</link>
					<comments>https://linuxcent.com/tc-ebpf-kubernetes-network-policy/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sun, 03 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux Networking]]></category>
		<category><![CDATA[NetworkPolicy]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[TC eBPF]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1837</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span>TC eBPF gives Cilium pod identity for network policy enforcement. Why XDP alone isn't enough, how TC programs stack on veth interfaces, and how to debug stale filters.</p>
<p>The post <a href="https://linuxcent.com/tc-ebpf-kubernetes-network-policy/">TC eBPF — Pod-Level Network Policy Without iptables</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark theme; position:relative anchors the absolutely positioned copy button and language badge. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested in a pre inherits the dark block styling instead. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent keyed on the data-lang attribute that the companion script sets on each pre. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: top-right corner, invisible (opacity 0) until the pre is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* "copied" is the success state class toggled by the script after a copy. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left corner, non-interactive (pointer-events:none), also revealed on hover. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: full width, collapsed borders, shaded header row and zebra striping. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
/* Code-block enhancer: tags each pre element with its language and adds a "Copy" button. */
(function(){
  /* Run at most once per page, even if this script is included more than once. */
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  /* Text to copy for a pre element: the inner code element when present; otherwise the
     pre itself minus the badge/button injected here, so the "Copy" label is never copied. */
  function copyText(pre,code){
    if(code)return code.innerText;
    var clone=pre.cloneNode(true);
    clone.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
    return clone.textContent;
  }
  /* Decorate every pre element: data-lang attribute, optional language badge, copy button. */
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      /* Language is read from a "language-xxx" class on the inner code element, if any. */
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.type='button'; /* a bare button defaults to type=submit inside a form */
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=copyText(pre,code);
        /* Use the async Clipboard API when present in a secure context; otherwise (or if it rejects) use the fallback. */
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  /* Success feedback: show "Copied!" with the "copied" class, then restore the label after 2s. */
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  /* Failure feedback: show the failure label, then restore the normal label after 2s. */
  function fail(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  /* Fallback copy via an off-screen textarea and document.execCommand('copy').
     execCommand reports refusal by returning false rather than throwing, so its
     return value decides between ok() and fail(); the textarea is always removed. */
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
    try{document.body.appendChild(ta);ta.select();copied=document.execCommand('copy');}
    catch(e){copied=false;}
    finally{if(ta.parentNode)ta.parentNode.removeChild(ta);}
    if(copied){ok(btn);}else{fail(btn);}
  }
  /* Wait for the DOM if the document is still parsing; otherwise enhance immediately. */
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 8</em><br />
<em><a href="https://linuxcent.com/what-is-ebpf-linux-kubernetes/">What Is eBPF?</a> · <a href="https://linuxcent.com/bpf-verifier-kubernetes-safety/">The BPF Verifier</a> · <a href="https://linuxcent.com/ebpf-vs-kernel-modules-kubernetes/">eBPF vs Kernel Modules</a> · <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types</a> · <a href="https://linuxcent.com/ebpf-maps-explained/">eBPF Maps</a> · <a href="https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/">CO-RE and libbpf</a> · <a href="https://linuxcent.com/ebpf-xdp-kubernetes-networking/">XDP</a> · </em><strong><em>TC eBPF</em></strong></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img decoding="async" width="1595" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-scaled.png" alt="TC eBPF and Cilium — traffic control hook architecture showing ingress/egress packet flow with sk_buff context" class="wp-image-2116" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-scaled.png 1595w, https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-187x300.png 187w, https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-638x1024.png 638w, https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-768x1233.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-957x1536.png 957w, https://linuxcent.com/wp-content/uploads/2026/05/ep08-tc-ebpf-og-2-1276x2048.png 1276w" sizes="(max-width: 1595px) 100vw, 1595px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">The TC hook runs inside the kernel network stack — Cilium uses it for identity-based policy enforcement.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>TC eBPF fires after <code class="" data-line="">sk_buff</code> allocation — it has socket metadata, cgroup ID, and pod identity that XDP lacks<br />
<em>(<code class="" data-line="">sk_buff</code> = the kernel&#8217;s socket buffer, allocated for every packet; TC fires after this allocation, so it can read socket and process identity)</em></li>
<li>Direct action (DA) mode combines filter and action; the program&#8217;s return value is the packet fate</li>
<li>Multiple TC programs chain on the same hook ordered by priority — stale programs from Cilium upgrades cause silent policy conflicts</li>
<li><code class="" data-line="">tc filter show dev &lt;iface&gt; ingress/egress</code> is the primary inspection tool; <code class="" data-line="">bpftool net list</code> shows the full node picture</li>
<li>XDP + TC is the Cilium data path: XDP for pre-stack service load balancing, TC for per-pod identity-based enforcement</li>
<li>TC can modify packet content (<code class="" data-line="">bpf_skb_store_bytes</code>) — the basis for TC-based DNAT and packet mangling</li>
</ul>
<hr />
<p>TC eBPF is where Cilium implements pod-level network policy without iptables — the hook that fires after <code class="" data-line="">sk_buff</code> allocation, where socket and cgroup context exist, making per-pod enforcement possible. The obvious follow-up to XDP is why Cilium doesn&#8217;t use it for everything — pod network policy, egress enforcement, the full NetworkPolicy ruleset. The answer reveals an inherent trade-off built into the Linux data path: XDP&#8217;s speed comes from running before any context exists. At the moment it fires, there is no socket, no cgroup, no way to tell which pod sent the packet. The moment you need pod identity, you need a hook that fires later — and pays for it.</p>
<hr />
<p>A specific pod in production was experiencing intermittent TCP connection failures to an external service. Not all connections — roughly one in fifty. Kubernetes NetworkPolicy showed egress allowed for the namespace. Cilium policy status showed no violations. Running <code class="" data-line="">curl</code> from inside the pod worked fine.</p>
<p>The application logs told a different story: connection timeouts at the 30-second mark, no SYN-ACK received. Not a DNS issue — I verified with <code class="" data-line="">tcpdump</code> inside the pod namespace. SYN packets were leaving the pod network namespace. They weren&#8217;t making it onto the wire.</p>
<p>I ran <code class="" data-line="">bpftool net list</code> on the node and saw two TC egress programs attached to that pod&#8217;s veth interface. One from the current Cilium version (installed six weeks ago). One from the previous version — from before the rolling upgrade. Two programs. Different policy epochs. The older one had a stale block rule that fired intermittently based on connection tuple patterns it was never designed to handle in the new policy model.</p>
<p>Without understanding TC eBPF — what programs attach where, how multiple programs interact, and how to inspect them — I would have kept chasing ghosts in the application layer.</p>
<h2 id="quick-check-are-there-stale-tc-filters-on-your-cluster">Quick Check: Are There Stale TC Filters on Your Cluster?</h2>
<p>The most common TC eBPF issue on production clusters — stale filters left behind by a Cilium upgrade — is a two-command check:</p>
<pre><code class="" data-line=""># SSH into a worker node, then pick any pod&#039;s veth interface:
ip link | grep lxc | head -5
# lxc8a3f21b@if7: ...
# lxc2c9d3e1@if9: ...

# Check TC filters on that interface
tc filter show dev lxc8a3f21b egress
</code></pre>
<p><strong>Healthy output (one filter, one priority):</strong></p>
<pre><code class="" data-line="">filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 cil_to_container direct-action not_in_hw id 44
</code></pre>
<p><strong>Stale filter present (two priorities = problem):</strong></p>
<pre><code class="" data-line="">filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 cil_to_container direct-action not_in_hw id 44
filter protocol all pref 2 bpf chain 0
filter protocol all pref 2 bpf chain 0 handle 0x1 old_cil_to_container direct-action not_in_hw id 17
#                  ^^^^^^ two different priorities = two programs running in sequence
</code></pre>
<p>Two priorities on the same hook means two programs running sequentially. If the older one has a stale DROP rule, packets are being dropped intermittently — and nothing in the application layer will tell you why.</p>
<blockquote>
<p><strong>Not running Cilium?</strong> If you&#8217;re on a non-Cilium CNI (Calico, Flannel, <code class="" data-line="">aws-vpc-cni</code>), you likely won&#8217;t have TC eBPF filters on pod interfaces. Run <code class="" data-line="">tc filter show dev eth0 ingress</code> on the node uplink instead to see if any TC programs are attached at the node level. An empty response is normal for non-Cilium clusters.</p>
</blockquote>
<h2 id="why-tc-not-xdp">Why TC, Not XDP</h2>
<p>EP07 covered XDP: fastest possible hook, fires before <code class="" data-line="">sk_buff</code>, drops at line rate. If XDP is so fast, why doesn&#8217;t Cilium use it for everything?</p>
<p>Because XDP sees only raw packet bytes. No socket. No cgroup. No pod identity.</p>
<p>In Kubernetes, network policy is inherently about identity. &#8220;Allow pod A to connect to pod B on port 8080.&#8221; To enforce this, you need to know which pod a packet is coming from on egress — and which pod it&#8217;s going to on ingress. That mapping lives in the cgroup hierarchy and the socket buffer, neither of which exist at XDP time.</p>
<p>TC fires later in the packet lifecycle, after <code class="" data-line="">sk_buff</code> is allocated and populated:</p>
<pre><code class="" data-line="">Ingress path:
  wire → NIC → [XDP hook] → sk_buff allocated → [TC ingress hook] → netfilter → socket

Egress path:
  socket → IP routing → [TC egress hook] → qdisc → NIC → wire
</code></pre>
<p>At the TC egress hook on a pod&#8217;s veth interface, the <code class="" data-line="">sk_buff</code> carries the socket that created the packet — and from that socket you can read the cgroup ID. The cgroup hierarchy maps container → pod, so the TC program knows which pod this traffic belongs to. That&#8217;s what makes pod-level enforcement possible.</p>
<h2 id="the-linux-traffic-control-architecture">The Linux Traffic Control Architecture</h2>
<p><code class="" data-line="">tc</code> (traffic control) is the Linux subsystem for managing packet queues and scheduling. Most Linux administrators know it as the bandwidth-shaping tool:</p>
<pre><code class="" data-line=""># Classic tc usage — rate limit an interface
tc qdisc add dev eth0 root tbf rate 100mbit burst 32kbit latency 400ms
</code></pre>
<p>The qdisc (queuing discipline) is the primary abstraction. Under the qdisc sits a <strong>filter</strong> layer — and the filter type relevant to eBPF is <code class="" data-line="">cls_bpf</code>, which attaches eBPF programs as packet classifiers.</p>
<blockquote>
<p><strong>qdisc (queuing discipline)</strong> is the kernel&#8217;s packet scheduler for an interface — it controls how packets are buffered and in what order they leave. For eBPF policy enforcement, Cilium uses a special qdisc called <code class="" data-line="">clsact</code> which has no buffering behaviour at all; it purely provides the ingress and egress hook points where eBPF filters attach. If a pod veth doesn&#8217;t have <code class="" data-line="">clsact</code>, Cilium isn&#8217;t enforcing policy on that pod.</p>
</blockquote>
<p>Cilium attaches <code class="" data-line="">cls_bpf</code> filters in <strong>direct action</strong> (DA) mode, which combines classifier and action into a single eBPF program. The program&#8217;s return value is the packet fate directly:</p>
<table>
<thead>
<tr>
<th>Return value</th>
<th>Action</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">TC_ACT_OK</code> (0)</td>
<td>Pass the packet</td>
</tr>
<tr>
<td><code class="" data-line="">TC_ACT_SHOT</code> (2)</td>
<td>Drop the packet</td>
</tr>
<tr>
<td><code class="" data-line="">TC_ACT_REDIRECT</code> (7)</td>
<td>Redirect to another interface</td>
</tr>
<tr>
<td><code class="" data-line="">TC_ACT_PIPE</code> (3)</td>
<td>Pass to the next filter in the chain</td>
</tr>
</tbody>
</table>
<h2 id="tc-context-what-your-program-can-see">TC Context: What Your Program Can See</h2>
<p>TC programs receive a <code class="" data-line="">struct __sk_buff</code> — a safe, BPF-accessible projection of the kernel <code class="" data-line="">sk_buff</code>. Unlike the raw packet bytes in XDP, <code class="" data-line="">__sk_buff</code> includes metadata:</p>
<pre><code class="" data-line="">struct __sk_buff {
    __u32 len;           // packet length
    __u32 pkt_type;      // PACKET_HOST, PACKET_BROADCAST, etc.
    __u32 mark;          // skb-&gt;mark — used by Cilium for pod identity
    __u32 queue_mapping;
    __u32 protocol;      // ETH_P_IP, ETH_P_IPV6, etc.
    __u32 vlan_present;
    __u32 vlan_tci;
    __u32 vlan_proto;
    __u32 priority;
    __u32 ingress_ifindex;
    __u32 ifindex;
    __u32 tc_index;
    __u32 cb[5];
    __u32 hash;
    __u32 tc_classid;
    __u32 data;          // offset to packet data
    __u32 data_end;
    __u32 napi_id;
    __u32 family;
    __u32 remote_ip4;    // source IP (ingress) or dest IP (egress)
    __u32 local_ip4;
    __u32 remote_port;
    __u32 local_port;
    // ...
};
</code></pre>
<p><code class="" data-line="">skb-&gt;mark</code> is how Cilium passes pod identity between its hook points.</p>
<blockquote>
<p><strong><code class="" data-line="">skb-&gt;mark</code></strong> is a 32-bit field in every <code class="" data-line="">sk_buff</code> that any kernel subsystem can read or write. It&#8217;s a general-purpose scratch field — iptables uses it, routing rules use it, and Cilium uses it to carry pod security identity from the socket hook through to TC enforcement. When Cilium stamps a pod&#8217;s identity into <code class="" data-line="">skb-&gt;mark</code> at connection time, every subsequent TC filter on that packet&#8217;s path can read it without another identity lookup. The socket-level cgroup hook (<code class="" data-line="">cgroup_sock_addr</code>) stamps the cgroup-derived pod identity into <code class="" data-line="">skb-&gt;mark</code> when the socket calls <code class="" data-line="">connect()</code>. By the time the packet reaches the TC egress hook, <code class="" data-line="">skb-&gt;mark</code> carries the pod&#8217;s security identity — and the TC program uses it for policy enforcement.</p>
</blockquote>
<h2 id="what-ciliums-tc-filters-actually-do">What Cilium&#8217;s TC Filters Actually Do</h2>
<p>The TC filter on each pod&#8217;s veth is Cilium&#8217;s enforcement point for Kubernetes NetworkPolicy. The mechanism:</p>
<ol>
<li>When a pod opens a connection, a <code class="" data-line="">cgroup_sock_addr</code> hook stamps the pod&#8217;s security identity (derived from its labels + namespace) into <code class="" data-line="">skb-&gt;mark</code></li>
<li>The TC egress filter on the veth reads <code class="" data-line="">skb-&gt;mark</code>, looks up the pod identity + destination in the policy map, and returns <code class="" data-line="">TC_ACT_SHOT</code> (drop) or <code class="" data-line="">TC_ACT_OK</code> (pass)</li>
<li>The TC ingress filter on the receiving pod&#8217;s veth does the same check for inbound traffic</li>
</ol>
<p>The policy map is a BPF LRU hash keyed on <code class="" data-line="">{pod_identity, dst_ip, dst_port, protocol}</code>. This is what <code class="" data-line="">cilium bpf policy get</code> reads from — and what <code class="" data-line="">bpftool map dump</code> shows directly:</p>
<pre><code class="" data-line=""># Find Cilium&#039;s policy maps
bpftool map list | grep -i policy

# Dump the active policy entries for a specific endpoint
# Get endpoint ID from: cilium endpoint list
cilium bpf policy get &lt;endpoint-id&gt;

# Cross-check with raw bpftool dump
bpftool map dump id &lt;POLICY_MAP_ID&gt; | head -30
</code></pre>
<p>The <code class="" data-line="">clsact</code> qdisc is the prerequisite for any TC eBPF filter — it creates the ingress and egress hook points without any queuing behavior. Every pod veth on a Cilium node has one:</p>
<pre><code class="" data-line="">tc qdisc show dev lxcABCDEF
# qdisc clsact ffff: dev lxcABCDEF parent ffff:fff1
# ^^^^^^^^^^^^ this line confirms Cilium&#039;s hook points exist on this pod&#039;s veth
# If this is missing: Cilium is NOT enforcing NetworkPolicy on this pod
</code></pre>
<p>If a pod veth doesn&#8217;t show <code class="" data-line="">clsact</code>, Cilium isn&#8217;t enforcing policy on that pod.</p>
<h2 id="multiple-programs-and-the-filter-chain">Multiple Programs and the Filter Chain</h2>
<p>This is the detail that caused my production incident.</p>
<p>TC supports chaining multiple filters on the same hook, ordered by priority. Lower priority number runs first. When Cilium upgrades, it installs a new filter at a new priority before removing the old one. If the upgrade procedure has any timing gap — or if the removal step fails silently — you end up with two programs running in sequence.</p>
<pre><code class="" data-line=""># Show all TC filters on a pod&#039;s veth — both priorities visible
tc filter show dev lxc12345 egress

# Example output with a stale filter:
filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 cil_to_container direct-action not_in_hw id 44
filter protocol all pref 2 bpf chain 0
filter protocol all pref 2 bpf chain 0 handle 0x1 old_cil_to_container direct-action not_in_hw id 17
</code></pre>
<p>Two programs. Pref 1 runs first. Pref 2 runs second — unless pref 1 returned <code class="" data-line="">TC_ACT_SHOT</code>, in which case the packet is already dropped and pref 2 never fires.</p>
<p>In my incident: pref 1 was the current Cilium version with correct policy, returning <code class="" data-line="">TC_ACT_OK</code> for the traffic in question. Pref 2 was the old version with a stale block entry, returning <code class="" data-line="">TC_ACT_SHOT</code> for a subset of connection tuples. Because <code class="" data-line="">TC_ACT_OK</code> passes to the next filter in the chain (<code class="" data-line="">TC_ACT_PIPE</code> would do the same), pref 2 got to run — and intermittently dropped packets.</p>
<p>The fix:</p>
<pre><code class="" data-line=""># Remove the stale filter by priority
tc filter del dev lxc12345 egress pref 2

# Verify only the current filter remains
tc filter show dev lxc12345 egress
</code></pre>
<p>This should be part of any post-upgrade verification for Cilium-managed clusters.</p>
<h2 id="how-cilium-uses-tc-across-the-full-node">How Cilium Uses TC Across the Full Node</h2>
<p>Cilium&#8217;s TC deployment on a node:</p>
<pre><code class="" data-line="">Pod veth (host-side, lxcXXXXX):
  TC ingress: cil_from_container — L3/L4 policy on the pod&#039;s outbound traffic
  TC egress:  cil_to_container   — L3/L4 policy on traffic arriving at the pod

Node uplink (eth0):
  TC ingress: cil_from_netdev    — traffic arriving from outside the node
  TC egress:  cil_to_netdev      — traffic leaving the node

XDP on eth0:
  cil_xdp_entry — pre-stack service load balancing (DNAT for ClusterIP)
</code></pre>
<p>The naming is counterintuitive at first: <code class="" data-line="">cil_from_container</code> is attached to the TC <strong>ingress</strong> hook on the veth.</p>
<blockquote>
<p><strong>Veth direction confusion:</strong> TC ingress/egress is named from the kernel&#8217;s perspective of the interface, not the pod&#8217;s. The host-side veth interface <em>receives</em> traffic that the pod is <em>sending</em> — so TC ingress on the host veth = the pod&#8217;s outbound traffic. This trips up everyone the first time. When debugging, always confirm direction with <code class="" data-line="">tc filter show dev lxcXXX ingress</code> and <code class="" data-line="">egress</code> separately, and check which Cilium program name is attached (<code class="" data-line="">cil_from_container</code> = pod outbound, <code class="" data-line="">cil_to_container</code> = pod inbound). The veth ingress direction from the host perspective is traffic flowing out of the container. Traffic leaving the pod hits the host-side veth ingress, which is <code class="" data-line="">cil_from_container</code>. It enforces egress policy for the pod. Naming follows the kernel&#8217;s perspective of the interface, not the application&#8217;s.</p>
</blockquote>
<p>To see the full picture on a node:</p>
<pre><code class="" data-line=""># All eBPF network programs (XDP and TC) across all interfaces
bpftool net list

# TC-specific view
# ip prints veth names as &quot;lxcXXXX@ifN&quot;; tc needs the bare name, so strip the &quot;@ifN&quot; peer suffix
for iface in $(ip -o link show | awk -F&#039;: &#039; &#039;$2 ~ /^lxc/ {sub(/@.*/, &quot;&quot;, $2); print $2}&#039;); do
    echo &quot;=== $iface ===&quot;
    tc filter show dev &quot;$iface&quot; ingress
    tc filter show dev &quot;$iface&quot; egress
done
</code></pre>
<h2 id="tc-can-modify-packets-too">TC Can Modify Packets Too</h2>
<p>Unlike XDP, TC programs have full access to the <code class="" data-line="">sk_buff</code> and can modify packet content — headers, payload, and checksums. This is how TC-based DNAT works in Cilium when XDP isn&#8217;t available on the NIC: the program rewrites the destination IP at L3 and updates the IP + transport checksums atomically. The kernel BPF helper handles the checksum recalculation.</p>
<p>From an operational standpoint: if you see a TC program attached but expected traffic is being redirected rather than dropped, the program is likely doing DNAT. <code class="" data-line="">bpftool prog dump xlated id &lt;ID&gt;</code> shows the disassembled instructions and will reveal <code class="" data-line="">bpf_skb_store_bytes</code> calls if packet rewriting is happening.</p>
<h2 id="debugging-tc-programs-in-production">Debugging TC Programs in Production</h2>
<p>Workflow I follow when investigating network issues on Cilium clusters:</p>
<pre><code class="" data-line=""># 1. List all eBPF network programs (see the full picture)
bpftool net list

# 2. Check specific interface for stale TC filters
tc filter show dev lxcABCDEF ingress
tc filter show dev lxcABCDEF egress

# 3. Inspect a specific program
bpftool prog show id 44

# 4. Disassemble a program (last resort for understanding behavior)
bpftool prog dump xlated id 44

# 5. Check Cilium&#039;s view of the same interface
cilium endpoint list
cilium endpoint get &lt;endpoint-id&gt;

# 6. Enable verbose TC program logs (debug builds only)
# Cilium: set CILIUM_DEBUG=true in the deployment
</code></pre>
<h2 id="common-mistakes">Common Mistakes</h2>
<table>
<thead>
<tr>
<th>Mistake</th>
<th>Impact</th>
<th>Fix</th>
</tr>
</thead>
<tbody>
<tr>
<td>Not checking for stale TC filters after Cilium upgrades</td>
<td>Conflicting policy programs cause intermittent drops</td>
<td>Run <code class="" data-line="">tc filter show</code> post-upgrade; remove stale by priority</td>
</tr>
<tr>
<td>Confusing ingress/egress direction on veth interfaces</td>
<td>Policy applied to wrong traffic direction</td>
<td>TC ingress on host-side veth = pod&#8217;s outbound traffic</td>
</tr>
<tr>
<td>Attaching TC without <code class="" data-line="">clsact</code> qdisc</td>
<td>Filter attachment fails</td>
<td><code class="" data-line="">tc qdisc add dev &lt;iface&gt; clsact</code> before filter add</td>
</tr>
<tr>
<td>Using <code class="" data-line="">TC_ACT_OK</code> when you want to stop the chain</td>
<td>Subsequent filters still run</td>
<td>Return <code class="" data-line="">TC_ACT_OK</code> only when later filters should still run; return <code class="" data-line="">TC_ACT_SHOT</code> or <code class="" data-line="">TC_ACT_REDIRECT</code> when this program&#8217;s verdict must be final</td>
</tr>
<tr>
<td>Expecting TC performance equal to XDP</td>
<td>TC has sk_buff overhead — it&#8217;s slower</td>
<td>Right tool: XDP for pre-stack bulk drops, TC for identity-aware policy</td>
</tr>
<tr>
<td>Hardcoding <code class="" data-line="">skb-&gt;mark</code> interpretation</td>
<td>Different tools use mark differently</td>
<td>Document mark field usage clearly; coordinate between Cilium and custom programs</td>
</tr>
</tbody>
</table>
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>TC eBPF fires after <code class="" data-line="">sk_buff</code> allocation — it has socket metadata, cgroup ID, and pod identity that XDP lacks</li>
<li>Direct action (DA) mode combines filter and action; the program&#8217;s return value is the packet fate</li>
<li>Multiple TC programs chain on the same hook ordered by priority — stale programs from Cilium upgrades cause silent policy conflicts</li>
<li><code class="" data-line="">tc filter show dev &lt;iface&gt; ingress/egress</code> is the primary inspection tool; <code class="" data-line="">bpftool net list</code> shows the full node picture</li>
<li>XDP + TC is the Cilium data path: XDP for pre-stack service load balancing, TC for per-pod identity-based enforcement</li>
<li>TC can modify packet content (<code class="" data-line="">bpf_skb_store_bytes</code>) — the basis for TC-based DNAT and packet mangling</li>
</ul>
<h2 id="whats-next">What&#8217;s Next</h2>
<p>EP08 closes out the kernel machinery arc: program types, maps, CO-RE, XDP, TC. Five episodes on the engine under the tools. EP09 shifts from understanding the machinery to using it interactively.</p>
<p>bpftrace turns kernel knowledge into one-liners you can run on a live production node. Which process is touching this file right now? Where is this latency spike originating in the kernel call stack? Which container is making DNS queries to an unexpected resolver? Under 10 seconds per question — no restart, no sidecar, no instrumentation change.</p>
<p>Every bpftrace one-liner is a complete eBPF program compiled, loaded, run, and cleaned up on the fly. EP09 covers how that works and why it changes the way you investigate production incidents.</p>
<p><em>Next: <a href="https://linuxcent.com/ebpf-bpftrace-kernel-observability/">bpftrace — kernel answers in one line</a></em></p>
<p>Get EP09 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&amp;linkname=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Ftc-ebpf-kubernetes-network-policy%2F&#038;title=TC%20eBPF%20%E2%80%94%20Pod-Level%20Network%20Policy%20Without%20iptables" data-a2a-url="https://linuxcent.com/tc-ebpf-kubernetes-network-policy/" data-a2a-title="TC eBPF — Pod-Level Network Policy Without iptables"></a></p><p>The post <a href="https://linuxcent.com/tc-ebpf-kubernetes-network-policy/">TC eBPF — Pod-Level Network Policy Without iptables</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/tc-ebpf-kubernetes-network-policy/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1837</post-id>	</item>
		<item>
		<title>XDP — Packets Processed Before the Kernel Knows They Arrived</title>
		<link>https://linuxcent.com/ebpf-xdp-kubernetes-networking/</link>
					<comments>https://linuxcent.com/ebpf-xdp-kubernetes-networking/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Tue, 21 Apr 2026 14:53:14 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[DDoS mitigation]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux Networking]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[XDP]]></category>
		<guid isPermaLink="false">https://linuxcent.com/ebpf-xdp-kubernetes-networking/</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span>XDP processes packets before the Linux kernel allocates a single byte of memory. How Cilium uses XDP for service load balancing and how it differs from iptables.</p>
<p>The post <a href="https://linuxcent.com/ebpf-xdp-kubernetes-networking/">XDP — Packets Processed Before the Kernel Knows They Arrived</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel; position:relative anchors the absolutely
   positioned copy button and language badge defined further down. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested in a pre drops the chip and
   inherits the panel colours instead. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent chosen by the pre element's data-lang attribute:
   shells, data/markup formats, programming languages, plain output. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: top-right corner, invisible until the pre is hovered;
   the .copied state recolours it. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left label, also shown only on hover;
   pointer-events:none keeps it from intercepting clicks. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: full-width, bordered cells, shaded header, zebra-striped rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
(function(){
  // Several posts on one page can each embed this snippet; the global flag
  // makes sure the <pre> blocks are decorated only once.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;

  // Decorates every <pre> in the document with a language badge (when the
  // inner <code> carries a language-* class) and a "Copy" button.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        // data-lang is what the pre[data-lang="..."] style rules key on.
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=snippetText(pre,code);
        // Async Clipboard API needs a secure context; otherwise, or when the
        // write is rejected, fall back to the legacy execCommand path.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }

  // Returns the text to copy. With a <code> child its text is used as-is.
  // A bare <pre> also contains the injected button (and possibly a badge),
  // so those are stripped from a detached clone before reading the text;
  // otherwise the button label would end up in the clipboard.
  function snippetText(pre,code){
    if(code)return code.innerText;
    var copy=pre.cloneNode(true);
    copy.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
    return copy.textContent;
  }

  // Success feedback: relabel the button for two seconds.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}

  // Failure feedback: relabel the button for two seconds.
  function fail(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}

  // Legacy copy path using an off-screen <textarea> and execCommand('copy').
  // execCommand reports refusal by returning false rather than throwing, so
  // the return value decides between success and failure feedback. The
  // textarea is removed in finally so it cannot be left behind in the DOM.
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      if(ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{fail(btn);}
  }

  // Run immediately when the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 7</em><br />
<em><a href="https://linuxcent.com/what-is-ebpf-linux-kubernetes/">What Is eBPF?</a> · <a href="https://linuxcent.com/bpf-verifier-kubernetes-safety/">The BPF Verifier</a> · <a href="https://linuxcent.com/ebpf-vs-kernel-modules-kubernetes/">eBPF vs Kernel Modules</a> · <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types</a> · <a href="https://linuxcent.com/ebpf-maps-explained/">eBPF Maps</a> · <a href="https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/">CO-RE and libbpf</a> · </em><strong><em>XDP</em></strong></p>
<p><strong>10 min read</strong></p>
<hr />
<h2 id="introduction">Introduction</h2>
<p>EP01 through EP06 covered what eBPF is, how the verifier keeps it safe, and how the toolchain compiles and loads programs across kernel versions. This episode is where that foundation meets production networking.</p>
<p>XDP — eXpress Data Path — is the earliest hook in the Linux kernel packet path. It fires before <code class="" data-line="">sk_buff</code> allocation, before routing, before netfilter. A DROP decision at XDP costs one bounds check and a return value. Everything else is skipped. At 1 million packets per second, that difference shows up directly as CPU.</p>
<p>This episode explains where XDP sits, what it can and cannot see, how Cilium uses it, and what every Kubernetes operator needs to know about it — even if they never write an eBPF program.</p>
<hr />
<h2 id="table-of-contents">Table of Contents</h2>
<ul>
<li><a href="#tldr">TL;DR</a></li>
<li><a href="#quick-check-is-xdp-running-on-your-cluster">Quick Check: Is XDP Running on Your Cluster?</a></li>
<li><a href="#where-xdp-sits-in-the-kernel-data-path">Where XDP Sits in the Kernel Data Path</a></li>
<li><a href="#xdp-modes">XDP Modes</a></li>
<li><a href="#the-xdp-context-what-your-program-can-see">The XDP Context: What Your Program Can See</a></li>
<li><a href="#what-this-means-on-your-cluster-right-now">What This Means on Your Cluster Right Now</a></li>
<li><a href="#xdp-metadata-cooperating-with-tc">XDP Metadata: Cooperating with TC</a></li>
<li><a href="#how-cilium-uses-xdp">How Cilium Uses XDP</a></li>
<li><a href="#operational-inspection">Operational Inspection</a></li>
<li><a href="#common-mistakes">Common Mistakes</a></li>
<li><a href="#key-takeaways">Key Takeaways</a></li>
</ul>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img loading="lazy" decoding="async" width="2400" height="2461" src="https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2.png" alt="XDP Pre-Stack Packet Hook — eBPF kernel data path diagram showing where XDP fires before sk_buff allocation" class="wp-image-2114" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2.png 2400w, https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2-293x300.png 293w, https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2-999x1024.png 999w, https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2-768x788.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2-1498x1536.png 1498w, https://linuxcent.com/wp-content/uploads/2026/05/ep07-xdp-og-2-1997x2048.png 1997w" sizes="auto, (max-width: 2400px) 100vw, 2400px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">XDP fires before <code class="" data-line="">sk_buff</code> allocation — the earliest possible kernel hook for zero-copy packet processing.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>XDP fires before <code class="" data-line="">sk_buff</code> allocation — the earliest possible kernel hook for packet processing<br />
<em>(<code class="" data-line="">sk_buff</code> = the kernel&#8217;s socket buffer — every normal packet requires one to be allocated, which adds up fast at scale)</em></li>
<li>Three modes: native (in-driver, full performance), generic (fallback, no perf gain), offloaded (NIC ASIC)</li>
<li>XDP context is raw packet bytes — no socket, no cgroup, no pod identity; handle non-IP traffic explicitly</li>
<li>Every pointer dereference requires a bounds check against <code class="" data-line="">data_end</code> — the verifier enforces this</li>
<li><code class="" data-line="">BPF_MAP_TYPE_LPM_TRIE</code> is the right map type for IP prefix blocklists — handles /32 hosts and CIDRs together</li>
<li>XDP metadata area enables coordination with TC programs — classify at XDP speed, enforce with pod context at TC</li>
</ul>
<hr />
<h2 id="quick-check-is-xdp-running-on-your-cluster">Quick Check: Is XDP Running on Your Cluster?</h2>
<p>Before the data path walkthrough — a two-command check you can run right now on any cluster node:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:
bpftool net list
</code></pre>
<p>On a Cilium-managed node, you&#8217;ll see something like:</p>
<pre><code class="" data-line="">eth0 (index 2):
        xdpdrv  id 44

lxc8a3f21b (index 7):
        tc ingress id 47
        tc egress  id 48
</code></pre>
<p>Reading the output:</p>
<ul>
<li><code class="" data-line="">xdpdrv</code> — XDP in <strong>native mode</strong>, running in the NIC driver before <code class="" data-line="">sk_buff</code> (this is what you want)</li>
<li><code class="" data-line="">xdpgeneric</code> instead of <code class="" data-line="">xdpdrv</code> — <strong>generic mode</strong>, runs after <code class="" data-line="">sk_buff</code> allocation, no performance benefit</li>
<li>No XDP line at all — XDP not deployed; your CNI uses iptables for service forwarding</li>
</ul>
<p>If you&#8217;re on <strong>EKS with <code class="" data-line="">aws-vpc-cni</code></strong> or <strong>GKE with <code class="" data-line="">kubenet</code></strong>, you likely won&#8217;t see XDP here — those CNIs use iptables. Understanding this section explains why teams migrating to Cilium see lower node CPU under the same traffic load.</p>
<hr />
<h2 id="where-xdp-sits-in-the-kernel-data-path">Where XDP Sits in the Kernel Data Path</h2>
<p>A client&#8217;s cluster was under a SYN flood — roughly 1 million packets per second from a rotating set of source IPs. We had iptables DROP rules installed within the first ten minutes, blocklist updated every 30 seconds as new source ranges appeared. The flood traffic dropped in volume. But node CPU stayed high. The <code class="" data-line="">%si</code> column in <code class="" data-line="">top</code> — software interrupt time — was sitting at 25–30%.</p>
<blockquote>
<p><strong><code class="" data-line="">%si</code> in <code class="" data-line="">top</code></strong> is the percentage of CPU time spent servicing software interrupts (softirqs) — deferred kernel work such as network packet processing — separate from your application&#8217;s CPU usage; time spent in hardware interrupt handlers is reported separately as <code class="" data-line="">%hi</code>. On a quiet managed cluster (EKS, GKE) this is usually under 1%. Under a packet flood, high <code class="" data-line="">%si</code> means the kernel is burning cycles just <em>receiving</em> packets, before your workloads run at all. It&#8217;s the metric that tells you the problem is below the application layer.</p>
</blockquote>
<p>The iptables rules were matching. Packets were being dropped. CPU was still burning. The answer is where in the kernel the drop was happening. iptables fires inside the <code class="" data-line="">netfilter</code> framework — after the kernel has already allocated an <code class="" data-line="">sk_buff</code> for the packet, done DMA from the NIC ring buffer, and traversed several netfilter hooks. At 1Mpps, the allocation cost alone is measurable.</p>
<p>XDP fires before any of that.</p>
<p>The standard Linux packet receive path:</p>
<pre><code class="" data-line="">NIC hardware
  ↓
DMA to ring buffer (kernel memory)
  ↓
[XDP hook — fires here, before sk_buff]
  ├── XDP_DROP   → discard, zero further allocation
  ├── XDP_PASS   → continue to kernel network stack
  ├── XDP_TX     → transmit back out the same interface
  └── XDP_REDIRECT → forward to another interface or CPU
  ↓
sk_buff allocated from slab allocator
  ↓
netfilter: PREROUTING
  ↓
IP routing decision
  ↓
netfilter: INPUT or FORWARD
  ↓  [iptables fires somewhere in here]
socket receive queue
  ↓
userspace application
</code></pre>
<p>XDP runs at the driver level, in the NAPI poll context — the same context where the driver is processing received packets off the ring buffer. The program runs before the kernel&#8217;s general networking code gets involved. There&#8217;s no <code class="" data-line="">sk_buff</code>, no reference counting, no slab allocation.</p>
<blockquote>
<p><strong>NAPI</strong> (New API) is how modern Linux receives packets efficiently. Instead of one CPU interrupt per packet (catastrophically expensive at 1Mpps), the NIC fires a single interrupt, then the kernel polls the NIC ring buffer in batches until it&#8217;s drained. XDP runs inside this polling loop — as close to the hardware as software gets without running on the NIC itself.</p>
</blockquote>
<p>At 1Mpps, the difference between XDP_DROP and an iptables DROP is roughly the cost of allocating and then immediately freeing 1 million <code class="" data-line="">sk_buff</code> objects per second — plus netfilter traversal, connection tracking lookup, and the DROP action itself. That&#8217;s the CPU time that was burning.</p>
<p>After moving the blocklist to an XDP program, the <code class="" data-line="">%si</code> on the same traffic load dropped from 28% to 3%.</p>
<hr />
<h2 id="xdp-modes">XDP Modes</h2>
<p>XDP operates in three modes, and which one you get depends on your NIC driver.</p>
<h3 id="native-xdp-xdp_flags_drv_mode">Native XDP (XDP_FLAGS_DRV_MODE)</h3>
<p>The eBPF program runs directly in the NIC driver&#8217;s NAPI poll function — in softirq context, before <code class="" data-line="">sk_buff</code>. This is the only mode that delivers the full performance benefit.</p>
<p>Driver support is required. The widely supported drivers: <code class="" data-line="">mlx4</code>, <code class="" data-line="">mlx5</code> (Mellanox/NVIDIA), <code class="" data-line="">i40e</code>, <code class="" data-line="">ice</code> (Intel), <code class="" data-line="">bnxt_en</code> (Broadcom), <code class="" data-line="">virtio_net</code> (KVM/QEMU), <code class="" data-line="">veth</code> (containers). Check support:</p>
<pre><code class="" data-line=""># Verify native XDP support on your driver
ethtool -i eth0 | grep driver
# driver: mlx5_core  ← supports native XDP

# Load in native mode
ip link set dev eth0 xdpdrv obj blocklist.bpf.o sec xdp
</code></pre>
<p>The <code class="" data-line="">veth</code> driver supporting native XDP is what makes XDP meaningful inside Kubernetes pods — each pod&#8217;s veth interface can run an XDP program at wire speed.</p>
<h3 id="generic-xdp-xdp_flags_skb_mode">Generic XDP (XDP_FLAGS_SKB_MODE)</h3>
<p>Fallback for drivers that don&#8217;t support native XDP. The program still runs, but it runs after <code class="" data-line="">sk_buff</code> allocation, as a hook in the <code class="" data-line="">netif_receive_skb</code> path. No performance benefit over early netfilter. <code class="" data-line="">sk_buff</code> is still allocated and freed for every packet.</p>
<pre><code class="" data-line=""># Generic mode — development and testing only
ip link set dev eth0 xdpgeneric obj blocklist.bpf.o sec xdp
</code></pre>
<p>Use this for development on a laptop with a NIC that lacks native XDP support. Never benchmark with it and never use it in production expecting performance gains.</p>
<h3 id="offloaded-xdp">Offloaded XDP</h3>
<p>Runs on the NIC&#8217;s own processing unit (SmartNIC). Zero CPU involvement — the XDP decision happens in NIC hardware. Supported by Netronome Agilio NICs. Rare in production, but the theoretical ceiling for XDP performance.</p>
<hr />
<h2 id="the-xdp-context-what-your-program-can-see">The XDP Context: What Your Program Can See</h2>
<p>XDP programs receive one argument: <code class="" data-line="">struct xdp_md</code>.</p>
<pre><code class="" data-line="">struct xdp_md {
    __u32 data;           // offset of first packet byte in the ring buffer page
    __u32 data_end;       // offset past the last byte
    __u32 data_meta;      // metadata area before data (XDP metadata for TC cooperation)
    __u32 ingress_ifindex;
    __u32 rx_queue_index;
};
</code></pre>
<p><code class="" data-line="">data</code> and <code class="" data-line="">data_end</code> are used as follows:</p>
<pre><code class="" data-line="">void *data     = (void *)(long)ctx-&gt;data;
void *data_end = (void *)(long)ctx-&gt;data_end;

// Every pointer dereference must be bounds-checked first
struct ethhdr *eth = data;
if ((void *)(eth + 1) &gt; data_end)
    return XDP_PASS;  // malformed or truncated packet
</code></pre>
<p>The verifier enforces these bounds checks — every pointer derived from <code class="" data-line="">ctx-&gt;data</code> must be validated before use. The error <code class="" data-line="">invalid mem access &#039;inv&#039;</code> means you dereferenced a pointer without checking the bounds. This is the most common cause of XDP program rejection.</p>
<blockquote>
<p><strong>For operators (not writing XDP code):</strong> You&#8217;ll see <code class="" data-line="">invalid mem access &#039;inv&#039;</code> in logs when an eBPF program is rejected at load time — most commonly during a Cilium upgrade or a custom tool deployment on a kernel the tool wasn&#8217;t built for. The fix is in the eBPF source or the tool version, not the cluster config.</p>
</blockquote>
<p>What XDP <strong>cannot</strong> see:</p>
<ul>
<li>Socket state — no socket buffer exists yet</li>
<li>Cgroup hierarchy — no pod identity</li>
<li>Process information — no PID, no container</li>
<li>Connection tracking state (unless you maintain it yourself in a map)</li>
</ul>
<p>XDP is ingress-only. It fires on packets arriving at an interface, not departing. For egress, TC is the hook.</p>
<hr />
<h2 id="what-this-means-on-your-cluster-right-now">What This Means on Your Cluster Right Now</h2>
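<p>A Cilium-managed node runs XDP programs when XDP acceleration is enabled (<code class="" data-line="">loadBalancer.acceleration=native</code> in the Helm values; it is off by default). Here&#8217;s how to see them:</p>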
<pre><code class="" data-line=""># All XDP programs on all interfaces — this is the full picture
bpftool net list
# Sample output on a Cilium node:
#
# eth0 (index 2):
#         xdpdrv  id 44         ← XDP in native mode on the node uplink
#
# lxc8a3f21b (index 7):
#         tc ingress id 47      ← TC enforces NetworkPolicy on pod ingress
#         tc egress  id 48      ← TC enforces NetworkPolicy on pod egress
#
# &quot;xdpdrv&quot;     = native mode (runs in NIC driver, before sk_buff — full performance)
# &quot;xdpgeneric&quot; = fallback mode (after sk_buff — no performance benefit over iptables)

# Which mode is active?
ip link show eth0 | grep xdp
# xdp mode drv  ← native (full performance)
# xdp mode generic  ← fallback (no perf benefit)

# Details on the XDP program ID
bpftool prog show id $(bpftool net show dev eth0 | grep xdp | awk &#039;{print $NF}&#039;)
# Shows: loaded_at, tag, xlated bytes, jited bytes, map IDs
</code></pre>
<p>The <code class="" data-line="">map IDs</code> in that output are the BPF maps the XDP program is using — typically the service VIP table for DNAT, and in security tools, the blocklist or allowlist. To see what&#8217;s in them:</p>
<pre><code class="" data-line=""># List maps used by the XDP program
bpftool prog show id &lt;PROG_ID&gt; | grep map_ids

# Dump the service map (for a Cilium node — this is the load balancer table)
bpftool map dump id &lt;MAP_ID&gt; | head -40
</code></pre>
<p>For a blocklist scenario — like the SYN flood mitigation above — the <code class="" data-line="">BPF_MAP_TYPE_LPM_TRIE</code> is the standard data structure. A lookup for <code class="" data-line="">192.168.1.45</code> hits a <code class="" data-line="">192.168.1.0/24</code> entry in the same map, handling both host /32s and CIDR ranges in one lookup.</p>
<pre><code class="" data-line=""># Count entries in an XDP filter map
bpftool map dump id &lt;BLOCKLIST_MAP_ID&gt; | grep -c &quot;key&quot;

# Verify XDP is active and inspect program details
bpftool net show dev eth0
</code></pre>
<hr />
<h2 id="xdp-metadata-cooperating-with-tc">XDP Metadata: Cooperating with TC</h2>
<p>Think of it as a sticky note attached to the packet. XDP writes the note at line speed (no context about pods or sockets). TC reads it later when full context is available, and acts on it. The packet carries the note between them.</p>
<p>More precisely: XDP can write metadata into the area before <code class="" data-line="">ctx-&gt;data</code> — a small scratch space that survives as the packet moves from XDP to the TC hook. This is the coordination mechanism between the two eBPF layers.</p>
<p>The pattern: XDP classifies at speed (no <code class="" data-line="">sk_buff</code> overhead), TC enforces with pod context (where you have socket identity). XDP writes a classification tag into the metadata area. TC reads it and makes the policy decision.</p>
<p>From an operational standpoint, when you see two eBPF programs on the same interface (one XDP, one TC), this pipeline is the likely explanation:</p>
<pre><code class="" data-line="">bpftool net list
# xdpdrv id 44 on eth0       ← XDP classifier running at line rate
# tc ingress id 47 on eth0   ← TC enforcer reading XDP metadata
</code></pre>
<hr />
<h2 id="how-cilium-uses-xdp">How Cilium Uses XDP</h2>
<blockquote>
<p><strong>Not running Cilium?</strong> On EKS with <code class="" data-line="">aws-vpc-cni</code> or GKE with <code class="" data-line="">kubenet</code>, service forwarding uses iptables NAT rules and <code class="" data-line="">conntrack</code> instead. You can see this with <code class="" data-line="">iptables -t nat -L -n</code> on a node — look for the <code class="" data-line="">KUBE-SVC-*</code> chains. Those chains are what XDP replaces in a Cilium cluster. This is why teams migrating from kube-proxy to Cilium report lower node CPU at high connection rates — it&#8217;s not magic, it&#8217;s hook placement.</p>
</blockquote>
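<p>On a Cilium node with XDP acceleration enabled, XDP handles the load balancing path for services reached from outside the node — NodePort, LoadBalancer, and externalIPs (pod-to-ClusterIP traffic is translated earlier, at the socket or TC layer). When a packet arrives at the node destined for one of these service addresses:</p>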
<ol>
<li>XDP program checks the destination IP against a BPF LRU hash map of known service VIPs</li>
<li>On a match, it performs DNAT — rewriting the destination IP to a backend pod IP</li>
<li>Returns <code class="" data-line="">XDP_TX</code> or <code class="" data-line="">XDP_REDIRECT</code> to forward directly</li>
</ol>
<p>No iptables NAT rules. No <code class="" data-line="">conntrack</code> state machine. No socket buffer allocation for the routing decision. The lookup is O(1) in a BPF hash map.</p>
<pre><code class="" data-line=""># See Cilium&#039;s XDP program on the node uplink
ip link show eth0 | grep xdp
# xdp  (attached, native mode)

# The XDP program details
bpftool prog show pinned /sys/fs/bpf/cilium/xdp

# Load time, instruction count, JIT-compiled size
bpftool prog show id $(bpftool net list | grep xdp | awk &#039;{print $NF}&#039;)
</code></pre>
<p>At production scale — 500+ nodes, 50k+ services — removing iptables from the service forwarding path with XDP reduces per-node CPU utilization measurably. The effect is most visible on nodes handling high connection rates to cluster services.</p>
<hr />
<h2 id="operational-inspection">Operational Inspection</h2>
<pre><code class="" data-line=""># All XDP programs on all interfaces
bpftool net list

# Check XDP mode (native, generic, offloaded)
ip link show | grep xdp

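# Per-interface drop counter from the generic netdev statistics
cat /sys/class/net/eth0/statistics/rx_dropped

# Native-mode XDP verdict counters usually live in driver stats (names vary by driver)
ethtool -S eth0 | grep -i xdp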

# XDP drop counters exposed via bpftool
bpftool map dump id &lt;stats_map_id&gt;

# Verify XDP is active and show program details
bpftool net show dev eth0
</code></pre>
<hr />
<h2 id="common-mistakes">Common Mistakes</h2>
<table>
<thead>
<tr>
<th>Mistake</th>
<th>Impact</th>
<th>Fix</th>
</tr>
</thead>
<tbody>
<tr>
<td>Missing bounds check before pointer dereference</td>
<td>Verifier rejects: &#8220;invalid mem access&#8221;</td>
<td>Always check <code class="" data-line="">(void *)(ptr + 1) &gt; data_end</code> before use</td>
</tr>
<tr>
<td>Using generic XDP for performance testing</td>
<td>Misleading numbers — sk_buff still allocated</td>
<td>Test in native mode only; check <code class="" data-line="">ip link</code> output for mode</td>
</tr>
<tr>
<td>Not handling non-IP traffic (ARP, IPv6, VLAN)</td>
<td>ARP breaks, IPv6 drops, VLAN-tagged frames dropped</td>
<td>Check <code class="" data-line="">eth-&gt;h_proto</code> and return <code class="" data-line="">XDP_PASS</code> for non-IP</td>
</tr>
<tr>
<td>XDP for egress or pod identity</td>
<td>No socket context at XDP; XDP is ingress only</td>
<td>Use TC egress for pod-identity-aware egress policy</td>
</tr>
<tr>
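<td>Forgetting <code class="" data-line="">BPF_F_NO_PREALLOC</code> on LPM trie</td>
<td>Map creation fails with <code class="" data-line="">EINVAL</code> — the kernel only creates LPM tries with this flag set</td>
<td>Always pass <code class="" data-line="">BPF_F_NO_PREALLOC</code> in <code class="" data-line="">map_flags</code> when creating the trie</td>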
</tr>
<tr>
<td>Blocking ARP by accident in a /24 blocklist</td>
<td>Loss of layer-2 reachability within the blocked subnet</td>
<td>Separate ARP handling before the IP blocklist check</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>XDP fires before <code class="" data-line="">sk_buff</code> allocation — the earliest possible kernel hook for packet processing</li>
<li>Three modes: native (in-driver, full performance), generic (fallback, no perf gain), offloaded (NIC ASIC)</li>
<li>XDP context is raw packet bytes — no socket, no cgroup, no pod identity; handle non-IP traffic explicitly</li>
<li>Every pointer dereference requires a bounds check against <code class="" data-line="">data_end</code> — the verifier enforces this</li>
<li><code class="" data-line="">BPF_MAP_TYPE_LPM_TRIE</code> is the right map for IP prefix blocklists — handles /32 hosts and CIDRs together</li>
<li>XDP metadata area enables coordination with TC programs — classify at XDP speed, enforce with pod context at TC</li>
</ul>
<h2 id="whats-next">What&#8217;s Next</h2>
<p>XDP handles ingress at the fastest possible point but has no visibility into which pod sent a packet. EP08 covers TC eBPF — the hook that fires after <code class="" data-line="">sk_buff</code> allocation, where socket and cgroup context exist.</p>
<p>TC is how Cilium implements pod-to-pod network policy without iptables. It&#8217;s also where stale programs from failed Cilium upgrades leave ghost filters that cause intermittent packet drops. Knowing how TC programs chain — and how to find and remove stale ones — is a specific, concrete operational skill.</p>
<p><em>Next: <a href="https://linuxcent.com/tc-ebpf-kubernetes-network-policy/">TC eBPF — pod-level network policy without iptables</a></em></p>
<p>Get EP08 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&amp;linkname=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-xdp-kubernetes-networking%2F&#038;title=XDP%20%E2%80%94%20Packets%20Processed%20Before%20the%20Kernel%20Knows%20They%20Arrived" data-a2a-url="https://linuxcent.com/ebpf-xdp-kubernetes-networking/" data-a2a-title="XDP — Packets Processed Before the Kernel Knows They Arrived"></a></p><p>The post <a href="https://linuxcent.com/ebpf-xdp-kubernetes-networking/">XDP — Packets Processed Before the Kernel Knows They Arrived</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-xdp-kubernetes-networking/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1540</post-id>	</item>
		<item>
		<title>CO-RE and libbpf — Write Once, Run on Any Kernel</title>
		<link>https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/</link>
					<comments>https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sun, 19 Apr 2026 10:08:05 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[BTF]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[CO-RE]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[libbpf]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Portability]]></category>
		<guid isPermaLink="false">https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span>CO-RE and libbpf make eBPF programs portable across kernel versions without recompilation. What it means for tools like Cilium and Falco surviving your node patches.</p>
<p>The post <a href="https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/">CO-RE and libbpf — Write Once, Run on Any Kernel</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark theme with horizontal scrolling and a neutral left accent bar.
   position:relative anchors the absolutely positioned copy button and language badge. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code: light "chip" background. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
/* Code nested in a pre block drops the chip styling and inherits the block's colors. */
pre code{background:transparent;padding:0;color:inherit}
/* Left accent color keyed on the data-lang attribute: shell family. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
/* Accent color: configuration / markup formats. */
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
/* Accent color: general-purpose programming languages. */
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
/* Accent color: plain text and captured command output. */
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned to the top-right corner of the block, invisible until the block is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* "copied" state: recolors the button while the class is present. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left label, shown on hover; pointer-events:none keeps it from intercepting clicks. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: collapsed borders, full width, shaded header row, zebra-striped body rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
/*
 * Code-block enhancer. For every pre element on the page it:
 *   - reads the language from a child code element's "language-xxx" class,
 *     mirrors it into the pre's data-lang attribute and prepends a badge;
 *   - appends a "Copy" button that copies the block's text to the clipboard.
 * The window.__lcCodeEnhanced flag makes the script a no-op when it is
 * included more than once on the same page.
 */
(function(){
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Decorate every pre element currently in the document.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang)pre.setAttribute('data-lang',lang);
      if(lang){var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;pre.insertBefore(badge,pre.firstChild);}
      var btn=document.createElement('button');
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        // NOTE(review): when the pre has no code child, pre.innerText presumably also
        // contains the button label appended above; confirm whether that case occurs.
        var text=code?code.innerText:pre.innerText;
        // Prefer the async Clipboard API; use the textarea fallback when it is
        // unavailable, the context is not secure, or the write is rejected.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Success feedback: show "Copied!" for two seconds, then restore the label.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Failure feedback: show the failure label for two seconds, then restore the label.
  function fail(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Fallback copy through an off-screen textarea and document.execCommand('copy').
  function fb(text,btn){
    var ta=null;
    var copied=false;
    try{
      ta=document.createElement('textarea');
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      // execCommand signals a refused copy by returning false, not by throwing,
      // so the return value decides which feedback is shown.
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      // Always remove the helper textarea, including when select/execCommand threw.
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{fail(btn);}
  }
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 6</em><br />
<em><a href="https://linuxcent.com/what-is-ebpf-linux-kubernetes/">What Is eBPF?</a> · <a href="https://linuxcent.com/bpf-verifier-kubernetes-safety/">The BPF Verifier</a> · <a href="https://linuxcent.com/ebpf-vs-kernel-modules-kubernetes/">eBPF vs Kernel Modules</a> · <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types</a> · <a href="https://linuxcent.com/ebpf-maps-explained/">eBPF Maps</a> · </em><strong><em>CO-RE and libbpf</em></strong></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img loading="lazy" decoding="async" width="1350" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-scaled.png" alt="CO-RE and libbpf — portable eBPF program architecture using BTF for kernel-version independence" class="wp-image-2113" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-scaled.png 1350w, https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-158x300.png 158w, https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-540x1024.png 540w, https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-768x1457.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-810x1536.png 810w, https://linuxcent.com/wp-content/uploads/2026/05/ep06-core-libbpf-og-2-1080x2048.png 1080w" sizes="auto, (max-width: 1350px) 100vw, 1350px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">CO-RE relocations + BTF let a single eBPF binary run across different kernel versions without recompilation.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Kernel structs change between releases — hardcoded offsets break across patch versions, not just major releases</li>
<li>BTF embeds full type information in the kernel at <code class="" data-line="">/sys/kernel/btf/vmlinux</code>; CO-RE uses it to patch field accesses at load time<br />
<em>(BTF = BPF Type Format — a compact description of every struct, field, and byte offset in the running kernel, built into the kernel image)</em></li>
<li><code class="" data-line="">vmlinux.h</code>, generated from BTF, replaces all kernel headers with a single file committed to your repository</li>
<li><code class="" data-line="">BPF_CORE_READ()</code> is the CO-RE macro — every kernel struct access in a portable program goes through it</li>
<li>libbpf skeleton generation (<code class="" data-line="">bpftool gen skeleton</code>) eliminates manual fd management for map and program lifecycle</li>
<li>For production tools: libbpf + CO-RE. For one-off debugging: bpftrace. For prototyping: BCC.</li>
</ul>
<hr />
<p>eBPF CO-RE (Compile Once, Run Everywhere) solves the kernel portability problem — the reason Cilium and Falco survive kernel upgrades without recompilation. What the previous episode on eBPF maps assumed — quietly — is that the kernel structs eBPF programs read look the same tomorrow as they do today. They don&#8217;t. The Linux kernel has no stable ABI for internal data structures. <code class="" data-line="">task_struct</code>, <code class="" data-line="">sk_buff</code>, <code class="" data-line="">sock</code> — the structs eBPF programs read constantly — can shift between patch releases, not just major versions. I learned this the hard way when a routine upgrade from 5.15.0-89 to 5.15.0-91 — two patch revisions — silently broke a custom tracer I&#8217;d been running in production for six months.</p>
<hr />
<p>Six months after deploying a custom eBPF tracer for a client — it detected specific syscall patterns that Falco&#8217;s default ruleset didn&#8217;t cover — they ran a routine Ubuntu patch upgrade. Not a major kernel version jump. 5.15.0-89 to 5.15.0-91. Two patch revisions.</p>
<p>The tracer stopped loading. The error was <code class="" data-line="">invalid indirect read from stack</code>. I opened the program source: nothing remotely like an indirect read. The program was a straightforward tracepoint handler, maybe 40 lines of C.</p>
<p>Three hours of debugging led to a four-byte offset difference. The struct <code class="" data-line="">task_struct</code> had a field alignment change between the two patch versions. My program accessed <code class="" data-line="">-&gt;comm</code> at a hardcoded byte offset. On 5.15.0-89 that offset was <code class="" data-line="">0x620</code>. On 5.15.0-91 it was <code class="" data-line="">0x624</code>. The verifier caught the misalignment — correctly — and rejected the program.</p>
<p>I had compiled the eBPF bytecode against a fixed kernel header snapshot. The binary was not portable. Every time the kernel moved a struct field, the tool broke.</p>
<p>CO-RE is the solution to this.</p>
<h2 id="quick-check-does-your-cluster-support-co-re">Quick Check: Does Your Cluster Support CO-RE?</h2>
<p>Two commands — check whether your nodes have the BTF support that CO-RE tools require:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:
ls -la /sys/kernel/btf/vmlinux &amp;&amp; echo &quot;BTF available — CO-RE tools will work&quot;
</code></pre>
<p>Expected output on a supported node:</p>
<pre><code class="" data-line="">-r--r--r-- 1 root root 4956234 Apr 21 00:00 /sys/kernel/btf/vmlinux
BTF available — CO-RE tools will work
</code></pre>
<p>If the file is missing: CO-RE tools (Cilium, Falco, Tetragon) will fall back to legacy BCC compilation mode — which requires a full compiler toolchain and kernel headers installed on every node.</p>
<pre><code class="" data-line=""># Confirm the kernel was built with BTF enabled
cat /boot/config-$(uname -r) | grep CONFIG_DEBUG_INFO_BTF
# CONFIG_DEBUG_INFO_BTF=y  ← required for CO-RE
</code></pre>
<p><strong>Common results by platform:</strong></p>
<table>
<thead>
<tr>
<th>Platform</th>
<th>BTF available?</th>
</tr>
</thead>
<tbody>
<tr>
<td>Ubuntu 20.04+ (kernel 5.4+)</td>
<td>✓ Yes</td>
</tr>
<tr>
<td>EKS managed nodes (AL2023)</td>
<td>✓ Yes</td>
</tr>
<tr>
<td>GKE managed nodes (kernel 5.10+)</td>
<td>✓ Yes</td>
</tr>
<tr>
<td>Amazon Linux 2 (older kernels)</td>
<td>✗ No — BCC fallback</td>
</tr>
<tr>
<td>RHEL 7 / CentOS 7</td>
<td>✗ No</td>
</tr>
</tbody>
</table>
<h2 id="why-kernel-structs-change-and-why-it-matters">Why Kernel Structs Change and Why It Matters</h2>
<p>The Linux kernel has no stable ABI for internal data structures. <code class="" data-line="">task_struct</code>, <code class="" data-line="">sock</code>, <code class="" data-line="">sk_buff</code>, <code class="" data-line="">file</code> — the structs that eBPF programs read constantly — change between releases.</p>
<blockquote>
<p><strong>ABI (Application Binary Interface)</strong> is the contract that says a compiled binary built against version N will still work against version N+1 without recompilation. The Linux kernel maintains a stable ABI for <em>syscalls</em> (<code class="" data-line="">open()</code>, <code class="" data-line="">read()</code>, <code class="" data-line="">connect()</code>) but makes no such guarantee for internal structs. Fields move, get added, or get renamed between patch releases — through field additions, reordering, alignment changes, and struct embedding changes — and any program with hardcoded offsets silently breaks. The kernel developers are under no obligation to preserve internal layouts, and they don&#8217;t.</p>
</blockquote>
<p>Before CO-RE, eBPF programs dealt with this in two ways:</p>
<p><strong>BCC (BPF Compiler Collection)</strong> — compile the eBPF C code at runtime on the target host, using that system&#8217;s kernel headers. No portability problem because compilation happens on the machine you&#8217;re deploying to. Cost: you need a full compiler toolchain, kernel headers, and Python runtime on every production node. Startup time in seconds. Container image size in hundreds of MB. For a security tool that should be lightweight and fast-starting, this is a non-starter.</p>
<p><strong>Per-kernel compiled binaries</strong> — ship different builds for each supported kernel version, detect at runtime, load the matching binary. Falco maintained this model for years. The operational overhead is significant: a matrix of kernel × distro × version with separate build and test pipelines for each combination.</p>
<p>CO-RE is the third option. Compile once on a build machine, and let libbpf patch struct field accesses at load time on the target system, using type information embedded in the running kernel.</p>
<h2 id="btf-the-type-system-that-makes-co-re-possible">BTF: The Type System That Makes CO-RE Possible</h2>
<p>BTF (BPF Type Format) is compact type debug information embedded directly into the kernel image. The <code class="" data-line="">CONFIG_DEBUG_INFO_BTF</code> build option arrived in Linux 5.2; since Linux 5.4, kernels built with <code class="" data-line="">CONFIG_DEBUG_INFO_BTF=y</code> expose their full type information at <code class="" data-line="">/sys/kernel/btf/vmlinux</code>.</p>
<pre><code class="" data-line=""># Verify BTF is available
ls -la /sys/kernel/btf/vmlinux

# Inspect the BTF for a specific struct
bpftool btf dump file /sys/kernel/btf/vmlinux format raw | grep -A 5 &#039;task_struct&#039;

# See the actual field offsets the running kernel uses
bpftool btf dump file /sys/kernel/btf/vmlinux format c | grep -A 20 &#039;struct task_struct {&#039;
</code></pre>
<p>BTF encodes every struct definition with field names, types, and byte offsets. When libbpf loads an eBPF program compiled with CO-RE relocations, it reads both the BTF the program was compiled against (embedded in the <code class="" data-line="">.bpf.o</code> file) and the BTF of the running kernel. If <code class="" data-line="">task_struct-&gt;comm</code> has moved, libbpf patches the field access instruction before loading the program.</p>
<p>This patching happens at load time, transparently, without modifying the binary you shipped.</p>
<blockquote>
<p><strong>CO-RE relocation</strong> is the mechanism behind this. When a CO-RE program is compiled, it embeds metadata saying &#8220;I need the offset of <code class="" data-line="">comm</code> inside <code class="" data-line="">task_struct</code>&#8221; rather than hardcoding <code class="" data-line="">0x620</code>. At load time, libbpf reads this relocation, looks up the real offset from the running kernel&#8217;s BTF, and patches the instruction. For operators: this is why Cilium and Falco survive kernel upgrades without you reinstalling them.</p>
</blockquote>
<p>Most distribution kernels now ship with BTF enabled:</p>
<pre><code class="" data-line=""># Ubuntu 20.04+ (kernel 5.4+)
cat /boot/config-$(uname -r) | grep CONFIG_DEBUG_INFO_BTF
# CONFIG_DEBUG_INFO_BTF=y

# Check at runtime
file /sys/kernel/btf/vmlinux
# Prints a file-type line when BTF is exposed; &quot;No such file or directory&quot; means the kernel was built without BTF
</code></pre>
<p>Amazon Linux 2023, Ubuntu 22.04, Debian 11+, RHEL 8.2+, and most cloud-provider-managed kernels have BTF. The notable exception: RHEL 7 and Amazon Linux 2 on older kernels.</p>
<h2 id="the-co-re-toolchain">The CO-RE Toolchain</h2>
<p>The build pipeline for a CO-RE eBPF program:</p>
<pre><code class="" data-line="">Development machine:
  vmlinux.h (generated from kernel BTF)
       ↓
  myprog.bpf.c ──── clang -target bpf -g ────→ myprog.bpf.o
  (CO-RE relocations embedded in BTF section)
       ↓
  bpftool gen skeleton myprog.bpf.o ─────────→ myprog.skel.h
       ↓
  myprog.c (userspace) ── gcc ──→ myprog
  (statically links libbpf, skeleton handles load/attach/cleanup)

Target machine (any kernel with BTF, 5.4+):
  ./myprog
  ↓ libbpf reads /sys/kernel/btf/vmlinux
  ↓ patches field accesses to match current kernel struct layout
  ↓ verifier validates patched program
  ↓ program loads and runs
</code></pre>
<p>One binary. Any supported kernel. No compiler on the target system.</p>
<h2 id="vmlinuxh-one-header-to-replace-them-all">vmlinux.h — One Header to Replace Them All</h2>
<p>Before CO-RE, eBPF C programs included dozens of kernel headers — <code class="" data-line="">linux/sched.h</code>, <code class="" data-line="">linux/net.h</code>, <code class="" data-line="">linux/fs.h</code>, <code class="" data-line="">linux/socket.h</code> — and they had to match the exact kernel version you were targeting.</p>
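<p><code class="" data-line="">vmlinux.h</code> is generated from the BTF of a running kernel. It contains every struct, union, enum, and typedef the kernel exposes through BTF — in a single file, without any compile-time kernel dependency. Preprocessor macros (<code class="" data-line="">#define</code> constants) are not part of BTF, so any you need must be defined in your own source.</p>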
<pre><code class="" data-line=""># Generate vmlinux.h from the running kernel
bpftool btf dump file /sys/kernel/btf/vmlinux format c &gt; vmlinux.h

# Typical size
wc -l vmlinux.h
# 350000+
</code></pre>
<p>You commit <code class="" data-line="">vmlinux.h</code> to your repository, generated from a representative kernel. CO-RE handles the actual layout differences at load time on whatever kernel you deploy to. The file is large but you only generate it once and update it when you add support for a new kernel generation.</p>
<p>In your eBPF C source:</p>
<pre><code class="" data-line="">#include &quot;vmlinux.h&quot;           // replaces all kernel headers
#include &lt;bpf/bpf_helpers.h&gt;   // eBPF helper functions
#include &lt;bpf/bpf_tracing.h&gt;   // tracing macros
#include &lt;bpf/bpf_core_read.h&gt; // CO-RE read macros
</code></pre>
<h2 id="how-co-re-fixes-the-offset-problem">How CO-RE Fixes the Offset Problem</h2>
<p>The mechanism is worth understanding once, even if you&#8217;re not writing eBPF programs.</p>
<p>When a CO-RE eBPF program accesses a kernel struct field, it doesn&#8217;t hardcode the byte offset. Instead, it records a <em>relocation</em> — &#8220;I need the offset of <code class="" data-line="">pid</code> inside <code class="" data-line="">task_struct</code>&#8221; — in the compiled binary. When libbpf loads the program, it resolves each relocation by looking up the field in the running kernel&#8217;s BTF and patches the access instruction to use the correct offset for this specific kernel.</p>
<p>This is why my four-byte problem happened: the tracer I&#8217;d compiled wasn&#8217;t using CO-RE. It hardcoded <code class="" data-line="">0x620</code> as the offset of <code class="" data-line="">task_struct-&gt;comm</code>. When the kernel moved it to <code class="" data-line="">0x624</code>, the program accessed the wrong memory, the verifier caught the misalignment, and the load failed. A CO-RE rewrite would have resolved <code class="" data-line="">comm</code>&#8217;s offset at load time from BTF and never known the difference.</p>
<p>The relocation model also handles fields that don&#8217;t exist on older kernels. If a program accesses a field added in kernel 5.15 and the running kernel is 5.10, libbpf can either skip the access (returning a zero value) or fail the load — depending on how the program marks the field access. This is how tools ship support for features across a kernel version range without separate builds.</p>
<h2 id="what-co-re-means-for-tools-you-already-run">What CO-RE Means for Tools You Already Run</h2>
<p>This is why you care about CO-RE even if you&#8217;re never going to write an eBPF program yourself.</p>
<p>Falco, Cilium, Tetragon, and Pixie all ship as single binaries or container images. You install them on an Ubuntu 22.04 node, a RHEL 9 node, and an Amazon Linux 2023 node — three different kernel versions, three different <code class="" data-line="">task_struct</code> layouts — and the same binary works on all of them. Before CO-RE, Falco maintained pre-compiled kernel probes for every supported kernel version in a matrix of distro × kernel × version. The probe list had thousands of entries. A kernel your distro shipped between Falco release cycles meant a gap in coverage until the next release.</p>
<p>With CO-RE, there&#8217;s one binary. libbpf reads the running kernel&#8217;s BTF at load time, patches the field accesses to match the actual struct layout, and the verifier checks the patched program. The tool vendor doesn&#8217;t need to know about your specific kernel. You don&#8217;t need to wait for a probe release.</p>
<p>The constraint is BTF availability. Check your nodes:</p>
<pre><code class="" data-line=""># Quick check — if this file exists, CO-RE tools work
ls /sys/kernel/btf/vmlinux

# Full confirmation
cat /boot/config-$(uname -r) | grep CONFIG_DEBUG_INFO_BTF
# CONFIG_DEBUG_INFO_BTF=y  ← required
</code></pre>
<p>What you&#8217;ll find: Ubuntu 20.04+, Debian 11+, RHEL 8.2+, Amazon Linux 2023, and GKE/EKS managed nodes all have BTF. Amazon Linux 2 and RHEL 7 do not. If you&#8217;re running those, CO-RE-based tools fall back to the legacy BCC compilation path — which requires kernel headers installed on the node.</p>
<h2 id="the-one-thing-to-run-right-now">The One Thing to Run Right Now</h2>
<p>This command shows you the exact struct layout your running kernel uses — the same layout libbpf reads when it patches CO-RE programs at load time:</p>
<pre><code class="" data-line=""># See how your kernel defines task_struct right now
bpftool btf dump file /sys/kernel/btf/vmlinux format c | grep -A 30 &#039;^struct task_struct {&#039;
</code></pre>
<p>The output is the canonical type information for your running kernel. Every field, every offset. When libbpf loads a CO-RE program, it&#8217;s reading this to figure out whether <code class="" data-line="">task_struct-&gt;comm</code> is at offset <code class="" data-line="">0x620</code> or <code class="" data-line="">0x624</code>.</p>
<p>You can also see specific struct sizes and verify that two kernels differ:</p>
<pre><code class="" data-line=""># On kernel A (5.15.0-89)
bpftool btf dump file /sys/kernel/btf/vmlinux format raw | grep -w &quot;task_struct&quot; | head -3

# On kernel B (5.15.0-91) — same command, different output if struct changed
# This is what broke my tracer: field offset changed across a two-patch jump
</code></pre>
<p>The practical use: when a CO-RE eBPF tool fails to load with a BTF error, this is where you look. The error tells you which struct field the relocation failed on. This command shows you the current layout. You can confirm whether the field exists, whether it moved, whether it was renamed.</p>
<h2 id="bcc-vs-libbpf-vs-bpftrace">BCC vs libbpf vs bpftrace</h2>
<p>Three approaches to eBPF development, with distinct tradeoffs:</p>
<table>
<thead>
<tr>
<th></th>
<th>BCC</th>
<th>libbpf + CO-RE</th>
<th>bpftrace</th>
</tr>
</thead>
<tbody>
<tr>
<td>Compilation</td>
<td>Runtime on target host</td>
<td>Build-time on dev machine</td>
<td>Runtime (embedded LLVM)</td>
</tr>
<tr>
<td>Target deployment</td>
<td>Compiler + headers on every node</td>
<td>Single static binary</td>
<td>bpftrace binary only</td>
</tr>
<tr>
<td>Portability</td>
<td>Compile-on-target handles it</td>
<td>CO-RE + BTF handles it</td>
<td>Internal CO-RE support</td>
</tr>
<tr>
<td>Memory overhead</td>
<td>High (Python + compiler: 200MB+)</td>
<td>Low (few MB binary)</td>
<td>Medium</td>
</tr>
<tr>
<td>Startup time</td>
<td>Seconds (compilation)</td>
<td>Milliseconds</td>
<td>Seconds (JIT compile)</td>
</tr>
<tr>
<td>Best for</td>
<td>Prototyping, development</td>
<td>Production tools, shipped software</td>
<td>Interactive debugging sessions</td>
</tr>
<tr>
<td>Language</td>
<td>Python + C</td>
<td>C (kernel) + C/Go/Rust (userspace)</td>
<td>bpftrace scripting</td>
</tr>
</tbody>
</table>
<p>For anything you&#8217;re shipping — an eBPF-based security tool, an observability agent, an open-source project — libbpf + CO-RE is the right choice. BCC is for prototyping before you commit to an implementation. bpftrace is for the 30-second debugging session on a live node.</p>
<p>The practical test: if you&#8217;re building something you&#8217;ll deploy as a container image or a package, it needs to be a self-contained binary with no build dependencies on the target system. That means libbpf.</p>
<h2 id="common-mistakes">Common Mistakes</h2>
<table>
<thead>
<tr>
<th>Mistake</th>
<th>Impact</th>
<th>Fix</th>
</tr>
</thead>
<tbody>
<tr>
<td>Direct struct dereference instead of <code class="" data-line="">BPF_CORE_READ</code></td>
<td>Program breaks on any kernel struct change</td>
<td>Use <code class="" data-line="">BPF_CORE_READ()</code> for all kernel struct field access</td>
</tr>
<tr>
<td>Missing <code class="" data-line="">char LICENSE[] SEC(&quot;license&quot;) = &quot;GPL&quot;</code></td>
<td>GPL-only helpers (most tracing helpers) unavailable</td>
<td>Always include the license section</td>
</tr>
<tr>
<td>vmlinux.h generated on a very old kernel</td>
<td>Missing structs added in newer kernel releases</td>
<td>Regenerate from the highest kernel version you target</td>
</tr>
<tr>
<td>Forgetting <code class="" data-line="">-g</code> flag in clang invocation</td>
<td>No BTF debug info → no CO-RE relocations</td>
<td>Always compile with <code class="" data-line="">-g -O2 -target bpf</code></td>
</tr>
<tr>
<td>Hardcoding struct offsets as integer constants</td>
<td>Breaks silently on next kernel patch</td>
<td>Use BTF-aware CO-RE macros exclusively</td>
</tr>
</tbody>
</table>
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Kernel structs change between releases — hardcoded offsets break across patch versions, not just major releases</li>
<li>BTF embeds full type information in the kernel at <code class="" data-line="">/sys/kernel/btf/vmlinux</code>; CO-RE uses it to patch field accesses at load time</li>
<li><code class="" data-line="">vmlinux.h</code>, generated from BTF, replaces all kernel headers with a single file committed to your repository</li>
<li><code class="" data-line="">BPF_CORE_READ()</code> is the CO-RE macro — every kernel struct access in a portable program goes through it</li>
<li>libbpf skeleton generation (<code class="" data-line="">bpftool gen skeleton</code>) eliminates manual fd management for map and program lifecycle</li>
<li>For production tools: libbpf + CO-RE. For one-off debugging: bpftrace. For prototyping: BCC.</li>
</ul>
<h2 id="whats-next">What&#8217;s Next</h2>
<p>CO-RE makes eBPF programs portable across kernel versions. EP07 takes the next question: where in the kernel&#8217;s data path does it make sense to attach them?</p>
<p>XDP fires before the kernel has allocated a single byte of memory for an incoming packet — before the kernel even knows whether to accept it. That hook placement is why Cilium can do line-rate load balancing and why some network filtering rules that look correct in iptables do nothing against certain traffic. The rules weren&#8217;t wrong. The hook was in the wrong place.</p>
<p><em>Next: <a href="https://linuxcent.com/ebpf-xdp-kubernetes-networking/">XDP — packets processed before the kernel knows they arrived</a></em></p>
<p>Get EP07 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&amp;linkname=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-co-re-libbpf-portable-programs%2F&#038;title=CO-RE%20and%20libbpf%20%E2%80%94%20Write%20Once%2C%20Run%20on%20Any%20Kernel" data-a2a-url="https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/" data-a2a-title="CO-RE and libbpf — Write Once, Run on Any Kernel"></a></p><p>The post <a href="https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/">CO-RE and libbpf — Write Once, Run on Any Kernel</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1504</post-id>	</item>
		<item>
		<title>eBPF Maps — The Persistent Data Layer Between Kernel and Userspace</title>
		<link>https://linuxcent.com/ebpf-maps-explained/</link>
					<comments>https://linuxcent.com/ebpf-maps-explained/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Thu, 16 Apr 2026 17:57:45 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[bpftool]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[eBPF maps]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Observability]]></category>
		<category><![CDATA[SRE]]></category>
		<guid isPermaLink="false">https://linuxcent.com/ebpf-maps-explained/</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 11</span> <span class="rt-label rt-postfix">minutes</span></span>eBPF maps are the persistent data layer between kernel programs and userspace tools. Hash maps, ring buffers, LRU maps — explained for SREs running Cilium and Falco.</p>
<p>The post <a href="https://linuxcent.com/ebpf-maps-explained/">eBPF Maps — The Persistent Data Layer Between Kernel and Userspace</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 11</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code panels: dark background, monospace stack, neutral left border by default. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code is a light chip; code nested in a panel inherits the panel colors. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border color per language family, keyed on the data-lang attribute of the panel. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned to the panel's top-right corner, hidden until needed. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
/* Hover alone leaves the button invisible to keyboard users; show it while focused too. */
.lc-copy-btn:focus{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: decorative label at the panel's top-left, never intercepts clicks. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: full width, bordered cells, shaded header and even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
/*
 * Code-block enhancer. For every <pre> on the page it records the snippet
 * language in data-lang, inserts a language badge, and appends a "Copy"
 * button that writes the snippet text to the clipboard.
 */
(function(){
  // The window flag makes a second execution of this script a no-op.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Returns the text to copy. Without a <code> child the <pre> itself is read,
  // so the injected button is hidden for the read to keep its label out of it.
  function snippetText(pre,code,btn){
    if(code)return code.innerText;
    var shown=btn.style.display;
    btn.style.display='none';
    var text=pre.innerText;
    btn.style.display=shown;
    return text;
  }
  // Decorates every <pre>: data-lang attribute, language badge, copy button.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // The language comes from a "language-xyz" class on the <code> element.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';
        badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type so the button never submits a form that encloses the <pre>.
      btn.type='button';
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=snippetText(pre,code,btn);
        // Use the async Clipboard API in secure contexts; otherwise, or if it
        // rejects, fall back to the textarea + execCommand path.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Success feedback: show "Copied!" for two seconds, then restore the label.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Failure feedback: show the failure label for two seconds, then restore.
  function failed(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Fallback copy through a temporary off-screen textarea and execCommand.
  function fb(text,btn){
    var ta=null,copied=false;
    try{
      ta=document.createElement('textarea');
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      // execCommand reports an unsupported or refused copy by returning false.
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      // Always detach the helper so a failed copy cannot leave it in the page.
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{failed(btn);}
  }
  // Run immediately when the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 5</em><br />
<em><a href="https://linuxcent.com/what-is-ebpf-linux-kubernetes/">What Is eBPF?</a> · <a href="https://linuxcent.com/bpf-verifier-kubernetes-safety/">The BPF Verifier</a> · <a href="https://linuxcent.com/ebpf-vs-kernel-modules-kubernetes/">eBPF vs Kernel Modules</a> · <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types</a> · </em><strong><em>eBPF Maps</em></strong></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img loading="lazy" decoding="async" width="1190" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-scaled.png" alt="eBPF Maps — the persistent data layer between kernel eBPF programs and userspace tools" class="wp-image-2112" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-scaled.png 1190w, https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-139x300.png 139w, https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-476x1024.png 476w, https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-768x1652.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-714x1536.png 714w, https://linuxcent.com/wp-content/uploads/2026/05/ep05-ebpf-maps-og-2-952x2048.png 952w" sizes="auto, (max-width: 1190px) 100vw, 1190px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">eBPF maps are the shared memory between kernel programs and userspace — hash, array, ringbuf, and LRU variants shown.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>eBPF programs are stateless — maps are where all state lives, between invocations and between kernel and userspace<br />
<em>(&#8220;stateless&#8221; here means each program invocation starts with no memory of previous runs — like a function with no global variables)</em></li>
<li>Every production eBPF tool (Cilium, Falco, Tetragon, Datadog NPM) is a map-based architecture — <code class="" data-line="">bpftool map list</code> shows you what it&#8217;s actually holding</li>
<li>Per-CPU maps eliminate write contention for high-frequency counters; the tool aggregates per-CPU values at export time</li>
<li>LRU maps handle unbounded key spaces (IPs, PIDs, connections) without hard errors when full — but eviction is silent, so size generously</li>
<li>Ring buffer (kernel 5.8+) is the correct event streaming primitive — Falco and Tetragon both use it</li>
<li>Map memory is kernel-locked and invisible to standard memory metrics — account for it explicitly on eBPF-heavy nodes</li>
<li>Pinned maps survive restarts; Cilium uses this for zero-disruption connection tracking through upgrades</li>
</ul>
<hr />
<h2 id="the-big-picture">The Big Picture</h2>
<pre><code class="" data-line="">  HOW eBPF MAPS CONNECT KERNEL PROGRAMS TO USERSPACE TOOLS

  ┌─────────────────────────────────────────────────────────────┐
  │  Kernel space                                               │
  │                                                             │
  │  [XDP program]  [TC program]  [kprobe]  [tracepoint]        │
  │        │              │           │           │             │
  │        └──────────────┴───────────┴───────────┘             │
  │                              │                              │
  │                   bpf_map_update_elem()                     │
  │                              │                              │
  │                              ▼                              │
  │  ┌─────────────────────────────────────────────────────┐    │
  │  │             eBPF MAP (kernel object)                │    │
  │  │  hash · percpu_hash · lru_hash · ringbuf · lpm_trie │    │
  │  │  Lives outside program invocations.                 │    │
  │  │  Pinned maps (/sys/fs/bpf/) survive restarts.       │    │
  │  └────────────────────┬────────────────────────────────┘    │
  └───────────────────────│─────────────────────────────────────┘
                          │  read / write via file descriptor
                          ▼
  ┌─────────────────────────────────────────────────────────────┐
  │  Userspace tools                                            │
  │                                                             │
  │  Cilium agent  Falco engine  Tetragon  bpftool map dump     │
  └─────────────────────────────────────────────────────────────┘
</code></pre>
<hr />
<p>eBPF maps are the persistent data layer between kernel programs and the tools that consume their output. eBPF programs fire and exit — there&#8217;s no memory between invocations. Yet Cilium tracks TCP connections across millions of packets, and Falco correlates a process exec from five minutes ago with a suspicious network connection happening now. The mechanism between stateless kernel programs and the stateful production tools you depend on is what this episode is about — and understanding it changes what you see when you run <code class="" data-line="">bpftool map list</code>.</p>
<hr />
<p>I was trying to identify the noisy neighbor saturating a cluster&#8217;s egress link. I had an eBPF program loading cleanly, events firing, everything confirming it was working. But when I read back the per-port connection counters from userspace, everything was zero.</p>
<p>I spent an hour on it before posting to the BCC mailing list. The reply came back fast: eBPF programs don&#8217;t hold state between invocations. Every time the kprobe fires, the program starts fresh. The counter I was incrementing existed only for that single call — created, incremented to one, then discarded. On every single invocation. I was counting events one at a time, throwing the count away, and reading nothing.</p>
<p>That&#8217;s what eBPF maps solve.</p>
<h2 id="quick-check-what-maps-are-running-on-your-node">Quick Check: What Maps Are Running on Your Node?</h2>
<p>Before the map types walkthrough — see the live state of maps on any cluster node right now:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:
bpftool map list
</code></pre>
<p>On a node running Cilium + Falco, you&#8217;ll see something like:</p>
<pre><code class="" data-line="">12: hash          name cilium_ct4_glo    key 24B  value 56B  max_entries 65536  memlock 5767168B
13: lpm_trie      name cilium_ipcache    key 40B  value 32B  max_entries 512000 memlock 327680B
14: percpu_hash   name cilium_metrics    key 8B   value 32B  max_entries 65536  memlock 2097152B
28: ringbuf       name falco_events      max_entries 8388608
</code></pre>
<p>Reading this output:</p>
<ul>
<li><code class="" data-line="">hash</code>, <code class="" data-line="">lpm_trie</code>, <code class="" data-line="">percpu_hash</code>, <code class="" data-line="">ringbuf</code> — the map <em>type</em> (each optimised for a different access pattern)</li>
<li><code class="" data-line="">key 24B value 56B</code> — sizes of a single entry&#8217;s key and value in bytes</li>
<li><code class="" data-line="">max_entries</code> — the hard ceiling; when the map is full, behaviour depends on type (see LRU section below)</li>
<li><code class="" data-line="">memlock</code> — non-pageable kernel memory this map consumes (invisible to <code class="" data-line="">free</code> and container metrics)</li>
</ul>
<blockquote>
<p><strong>Not running Cilium?</strong> On EKS with <code class="" data-line="">aws-vpc-cni</code> or GKE with <code class="" data-line="">kubenet</code>, there are far fewer maps here — primarily because kube-proxy uses iptables rather than BPF maps. Running <code class="" data-line="">bpftool map list</code> still works; you&#8217;ll just see fewer entries. On a pure iptables-based cluster, most of the maps you see come from the system kernel itself, not a CNI.</p>
</blockquote>
<h2 id="maps-are-the-architecture-not-an-afterthought">Maps Are the Architecture, Not an Afterthought</h2>
<p>Maps are kernel objects that live outside any individual program invocation. They&#8217;re shared between multiple eBPF programs, readable and writable from userspace, and persistent for the lifetime of the map — which can outlive both the program that created them and the userspace process that loaded them.</p>
<p>Every production eBPF tool is fundamentally a map-based architecture:</p>
<ul>
<li>Cilium stores connection tracking state in BPF hash maps</li>
<li>Falco uses ring buffers to stream syscall events to its userspace rule engine</li>
<li>Tetragon maintains process tree state across exec events using maps</li>
<li>Datadog NPM stores per-connection flow stats in per-CPU maps for lock-free metric accumulation</li>
</ul>
<p>Run <code class="" data-line="">bpftool map list</code> on a Cilium node:</p>
<pre><code class="" data-line="">$ bpftool map list
ID 12: hash          name cilium_ct4_glo    key 24B  value 56B   max_entries 65536
#      ^^^^           ^^^^^^^^^^^^^^^^       ^^^^^^   ^^^^^^^     ^^^^^^^^^^^^^^^^
#      type           map name               key size value size  max concurrent entries

ID 13: lpm_trie      name cilium_ipcache    key 40B  value 32B   max_entries 512000
#      longest-prefix-match trie — for IP address + CIDR lookups

ID 14: percpu_hash   name cilium_metrics    key 8B   value 32B   max_entries 65536
#      one copy of this map per CPU — no write contention for high-frequency counters

ID 28: ringbuf       name falco_events      max_entries 8388608
#                                           ^^^^^^^^^^^ 8MB ring buffer for event streaming
</code></pre>
<p>Connection tracking, IP policy cache, per-CPU metrics, event stream. Every one of these is a different map type, chosen for a specific reason.</p>
<h2 id="map-types-and-what-theyre-actually-used-for">Map Types and What They&#8217;re Actually Used For</h2>
<h3 id="hash-maps">Hash Maps</h3>
<p>The general-purpose key-value store. A key maps to a value — lookup is O(1) average. Cilium&#8217;s connection tracking map (<code class="" data-line="">cilium_ct4_glo</code>) is a hash map: the key is a 5-tuple (source IP, destination IP, ports, protocol), the value is the connection state.</p>
<pre><code class="" data-line="">$ bpftool map show id 12
12: hash  name cilium_ct4_glo  flags 0x0
        key 24B  value 56B  max_entries 65536  memlock 5767168B
</code></pre>
<p>The <code class="" data-line="">key 24B</code> is the 5-tuple. The <code class="" data-line="">value 56B</code> is the connection state record. <code class="" data-line="">max_entries 65536</code> is the upper bound — Cilium can track 65,536 active connections in this map before hitting the limit.</p>
<p>Hash maps are shared across all CPUs on the node. When multiple CPUs try to update the same entry simultaneously — which happens constantly on busy nodes — writes need to be coordinated. For most use cases this is fine. For high-frequency counters updated on every packet, it&#8217;s a bottleneck. That&#8217;s when you reach for a per-CPU hash map.</p>
<p><strong>Where you see them:</strong> connection tracking, per-IP statistics, process-to-identity mapping, policy verdict caching.</p>
<h3 id="per-cpu-hash-maps">Per-CPU Hash Maps</h3>
<p>Per-CPU hash maps solve the write coordination problem by giving each CPU its own independent copy of every entry. There&#8217;s no sharing, no contention, no waiting — each CPU writes its own copy without touching any other.</p>
<p>The tradeoff: reading from userspace means collecting one value per CPU and summing them up. That aggregation happens in the tool, not the kernel.</p>
<pre><code class="" data-line=""># Cilium&#039;s per-CPU metrics map — one counter value per CPU
bpftool map dump id 14
key: 0x00000001
  value (CPU 00): 12345
  value (CPU 01): 8901
  value (CPU 02): 3421
  value (CPU 03): 7102
# total bytes for this metric: 31769
</code></pre>
<p>Cilium&#8217;s <code class="" data-line="">cilium_metrics</code> map uses this pattern for exactly this reason — it&#8217;s updated on every packet across every CPU on the node. Forcing all CPUs to coordinate writes to a single shared entry at that rate would hurt throughput. Instead: each CPU writes locally, Cilium&#8217;s userspace agent sums the values at export time.</p>
<p><strong>Where you see them:</strong> packet counters, byte counters, syscall frequency metrics — anywhere updates happen on every event at high volume.</p>
<h3 id="lru-hash-maps">LRU Hash Maps</h3>
<p>LRU hash maps add automatic eviction. Same key-value semantics as a regular hash map, but when the map hits its entry limit, the least recently accessed entry is dropped to make room for the new one.</p>
<p>This matters for any map tracking dynamic state with an unpredictable number of keys: TCP connections, process IDs, DNS queries, pod IPs. Without LRU semantics, a full map returns an error on insert — and in production, that means your tool silently stops tracking new entries. Not a crash, not an alert — just missing data.</p>
<p>Cilium&#8217;s connection tracking map is LRU-bounded at 65,536 entries. On a node handling high-connection-rate workloads, this can fill up. When it does, Cilium starts evicting old connections to make room for new ones — and if it&#8217;s evicting too aggressively, you&#8217;ll see connection resets.</p>
<pre><code class="" data-line=""># Check current CT map usage vs its limit
bpftool map show id 12
# max_entries tells you the ceiling
# count entries to see current usage
bpftool map dump id 12 | grep -c &quot;^key&quot;
</code></pre>
<p>Size LRU maps at 2× your expected concurrent active entries. Aggressive eviction under pressure introduces gaps — not crashes, but missing or incorrect state.</p>
<p><strong>Where you see them:</strong> connection tracking, process lineage, anything where the key space is dynamic and unbounded.</p>
<h3 id="ring-buffers">Ring Buffers</h3>
<p>Ring buffers are how eBPF tools stream events from the kernel to a userspace consumer. Falco reads syscall events from a ring buffer. Tetragon streams process execution and network events through ring buffers. The pattern is the same across all of them:</p>
<pre><code class="" data-line="">kernel eBPF program
  → sees event (syscall, network packet, process exec)
  → writes record to ring buffer
  → userspace tool reads it and processes (Falco rules, Tetragon policies)
</code></pre>
<p>What makes ring buffers the right primitive for event streaming:</p>
<ul>
<li><strong>Single buffer shared across all CPUs</strong> — unlike the older <code class="" data-line="">perf_event_array</code> approach which required one buffer per CPU, a ring buffer is one allocation, one file descriptor, one consumer</li>
<li><strong>Lock-free</strong> — the kernel writes, the userspace tool reads, they don&#8217;t block each other</li>
<li><strong>Bounded, with detectable drops when full</strong> — if the userspace tool can&#8217;t keep up, new events are dropped rather than queued indefinitely. The tool can detect and count drops. Falco reports these as <code class="" data-line="">Dropped events</code> in its stats output.</li>
</ul>
<pre><code class="" data-line=""># Falco&#039;s ring buffer — 8MB
bpftool map list | grep ringbuf
# ID 28: ringbuf  name falco_events  max_entries 8388608
</code></pre>
<p>8,388,608 bytes = 8MB. That&#8217;s the buffer between Falco&#8217;s kernel hooks and its rule engine. If there&#8217;s a burst of syscall activity and Falco&#8217;s rule evaluation can&#8217;t keep up, the buffer fills and any new events that arrive while it is full are dropped.</p>
<p>Sizing matters operationally. Too small and you drop events during normal burst. Too large and you&#8217;re holding non-pageable kernel memory that doesn&#8217;t show up in standard memory metrics.</p>
<pre><code class="" data-line=""># Check Falco&#039;s drop rate
falcoctl stats
# or check the Falco logs
journalctl -u falco | grep -i &quot;drop&quot;
</code></pre>
<p>Most production deployments run 8–32MB. Start at 8MB, monitor drop rates under load, size up if needed.</p>
<p><strong>Where you see them:</strong> Falco event streaming, Tetragon audit events, any tool that needs to move high-volume event data from kernel to userspace.</p>
<h3 id="array-maps">Array Maps</h3>
<p>Array maps are fixed-size, integer-indexed, and entirely pre-allocated at creation time. Think of them as lookup tables with integer keys — constant-time access, no hash overhead, no dynamic allocation.</p>
<p>Cilium uses array maps for policy configuration: a fixed set of slots indexed by endpoint identity number. When a packet arrives and Cilium needs to check policy, it indexes into the array directly rather than doing a hash lookup. For read-heavy, write-rare data, this is faster.</p>
<p>The constraint: you can&#8217;t delete entries from an array map. Every slot exists for the lifetime of the map. If you need to track state that comes and goes — connections, processes, pods — use a hash map instead.</p>
<p><strong>Where you see them:</strong> policy configuration, routing tables with fixed indices, per-CPU stats indexed by CPU number.</p>
<h3 id="lpm-trie-maps">LPM Trie Maps</h3>
<p>LPM (Longest Prefix Match) trie maps handle IP prefix lookups — the same operation that a hardware router does when deciding which interface to send a packet out of.</p>
<p>You can store a mix of specific host addresses (/32) and CIDR ranges (/16, /24) in the same map, and a lookup returns the most specific match. If <code class="" data-line="">10.0.1.15/32</code> and <code class="" data-line="">10.0.0.0/8</code> are both in the map, a lookup for <code class="" data-line="">10.0.1.15</code> returns the /32 entry.</p>
<p>Cilium&#8217;s <code class="" data-line="">cilium_ipcache</code> map is an LPM trie. It maps every IP in the cluster to its security identity — the identifier Cilium uses for policy enforcement. When a packet arrives, Cilium does a trie lookup on the source IP to find out which endpoint sent it, then checks policy against that identity.</p>
<pre><code class="" data-line=""># Inspect the ipcache map
bpftool map show id 13
# lpm_trie  name cilium_ipcache  key 40B  value 32B  max_entries 512000

# Look up which security identity owns a pod IP
bpftool map lookup id 13 key hex 20 00 00 00 0a 00 01 0f 00 00 00 00 00 00 00 00 00 00 00 00
</code></pre>
<p><strong>Where you see them:</strong> IP-to-identity mapping (Cilium), CIDR-based policy enforcement, IP blocklists.</p>
<hr />
<h2 id="pinned-maps-state-that-survives-restarts">Pinned Maps — State That Survives Restarts</h2>
<p>By default, a map&#8217;s lifetime is tied to the tool that created it. When the tool exits, the kernel garbage-collects the map.</p>
<p>Pinning writes a reference to the BPF filesystem at <code class="" data-line="">/sys/fs/bpf</code>, which keeps the map alive even after the creating process exits:</p>
<pre><code class="" data-line=""># See all maps Cilium has pinned
ls /sys/fs/bpf/tc/globals/
# cilium_ct4_global  cilium_ipcache  cilium_metrics  cilium_policy ...

# Inspect a pinned map directly — no Cilium process needed
bpftool map dump pinned /sys/fs/bpf/tc/globals/cilium_ct4_global

# Pin any map by ID for manual inspection
bpftool map pin id 12 /sys/fs/bpf/my_conn_tracker
bpftool map dump pinned /sys/fs/bpf/my_conn_tracker
</code></pre>
<p>Cilium pins all its maps under <code class="" data-line="">/sys/fs/bpf/tc/globals/</code>. When Cilium restarts — rolling upgrade, crash, OOM kill — it reopens its pinned maps and resumes with existing state intact. Pods maintain established TCP connections through a Cilium restart without disruption.</p>
<p>This is operationally significant: if you&#8217;re evaluating eBPF-based tools for production, check whether they pin their maps. A tool that doesn&#8217;t loses all its tracked state on every restart — connection tracking resets, process lineage gaps, policy state rebuilt from scratch.</p>
<hr />
<h2 id="map-memory-a-production-consideration">Map Memory: A Production Consideration</h2>
<p>Map memory is kernel-locked — it cannot be paged out, and it doesn&#8217;t show up in standard memory pressure metrics. Your node&#8217;s <code class="" data-line="">free</code> output and container memory limits don&#8217;t account for it.</p>
<blockquote>
<p><strong>Kernel-locked memory</strong> is memory the OS guarantees will never be swapped to disk — it stays in RAM permanently. The kernel requires this for eBPF maps because a kernel program running during a network interrupt cannot wait for a page fault. The side effect: it doesn&#8217;t appear in <code class="" data-line="">top</code>, <code class="" data-line="">free</code>, or container memory metrics, so it&#8217;s easy to accidentally provision nodes without accounting for it.</p>
</blockquote>
<pre><code class="" data-line=""># Total eBPF map memory locked on this node
bpftool map list -j | python3 -c &quot;
import json,sys
maps=json.load(sys.stdin)
total=sum(m.get(&#039;bytes_memlock&#039;,0) for m in maps)
print(f&#039;Total map memory: {total/1024/1024:.1f} MB&#039;)
&quot;

# Check system memlock limit (unlimited is correct for eBPF tools)
ulimit -l

# Check what Cilium&#039;s systemd unit sets
systemctl show cilium | grep -i memlock
</code></pre>
<p>On a node running Cilium + Falco + Datadog NPM, I&#8217;ve seen 200–400MB of map memory locked. That&#8217;s real, non-pageable kernel memory. If you&#8217;re sizing nodes for eBPF-heavy workloads, account for this separately from your pod workload memory.</p>
<p>If an eBPF tool fails to load with a permission error despite having enough free memory, the root cause is usually the <code class="" data-line="">memlock</code> ulimit for the process. Cilium, Falco, and most production tools set <code class="" data-line="">LimitMEMLOCK=infinity</code> in their systemd units. Verify this if you&#8217;re deploying a new eBPF-based tool and seeing unexpected load failures.</p>
<hr />
<h2 id="inspecting-maps-in-production">Inspecting Maps in Production</h2>
<pre><code class="" data-line=""># List all maps: type, name, key/value sizes, memory usage
bpftool map list

# Dump all entries in a map (careful with large maps)
bpftool map dump id 12

# Look up a specific entry by key
bpftool map lookup id 12 key hex 0a 00 01 0f 00 00 00 00

# Watch map stats live
watch -n1 &#039;bpftool map show id 12&#039;

# See all maps for a specific tool by checking its pinned path
ls /sys/fs/bpf/tc/globals/                    # Cilium
ls /sys/fs/bpf/falco/                         # Falco (if pinned)

# Cross-reference map IDs with the programs using them
bpftool prog list
bpftool map list
</code></pre>
<hr />
<h2 id="production-gotchas"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Production Gotchas</h2>
<p><strong>A full LRU map drops state silently, not loudly</strong><br />
When Cilium&#8217;s CT map fills up, it starts evicting the least recently used connections — not returning an error. You see connection resets, not a tool alert. Check map utilisation (<code class="" data-line="">bpftool map dump id X | grep -c key</code>) against <code class="" data-line="">max_entries</code> on nodes with high connection rates.</p>
<p><strong>Ring buffer drops don&#8217;t stop the tool — they create gaps</strong><br />
When Falco&#8217;s ring buffer fills up, events are dropped. Falco keeps running. The rule engine keeps processing. But you have gaps in your syscall visibility. Monitor <code class="" data-line="">Dropped events</code> in Falco&#8217;s stats and size the ring buffer accordingly.</p>
<p><strong>Map memory is invisible to standard monitoring</strong><br />
200–400MB of kernel-locked memory on a Cilium + Falco node doesn&#8217;t appear in <code class="" data-line="">top</code>, container memory metrics, or memory pressure alerts. Size eBPF-heavy nodes with this in mind and add explicit map memory monitoring via <code class="" data-line="">bpftool</code>.</p>
<p><strong>Tools that don&#8217;t pin their maps lose state on restart</strong><br />
A Cilium restart with pinned maps = zero-disruption connection tracking. A tool without pinning = all tracked state rebuilt from scratch. This matters for connection tracking tools and any tool maintaining process lineage.</p>
<p><strong><code class="" data-line="">perf_event_array</code> on kernel 5.8+ is the old way</strong><br />
Older eBPF tools use per-CPU <code class="" data-line="">perf_event_array</code> for event streaming. Ring buffer is strictly better — single allocation, lower overhead, simpler consumption. If you&#8217;re running a tool that still uses <code class="" data-line="">perf_event_array</code> on a 5.8+ kernel, it&#8217;s using a legacy path.</p>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>eBPF programs are stateless — maps are where all state lives, between invocations and between kernel and userspace</li>
<li>Every production eBPF tool (Cilium, Falco, Tetragon, Datadog NPM) is a map-based architecture — <code class="" data-line="">bpftool map list</code> shows you what it&#8217;s actually holding</li>
<li>Per-CPU maps eliminate write contention for high-frequency counters; the tool aggregates per-CPU values at export time</li>
<li>LRU maps handle unbounded key spaces (IPs, PIDs, connections) without hard errors when full — but eviction is silent, so size generously</li>
<li>Ring buffer (kernel 5.8+) is the correct event streaming primitive — Falco and Tetragon both use it</li>
<li>Map memory is kernel-locked and invisible to standard memory metrics — account for it explicitly on eBPF-heavy nodes</li>
<li>Pinned maps survive restarts; Cilium uses this for zero-disruption connection tracking through upgrades</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>You know what program types run in the kernel, and you know how they hold state.</p>
<p>But there&#8217;s a problem anyone running eBPF-based tools eventually runs into: a tool works on one kernel version and breaks on the next. Struct layouts shift between patch versions. Field offsets move. EP06 covers CO-RE (Compile Once, Run Everywhere) and libbpf — the mechanism that makes tools like Cilium and Falco survive your node upgrades without recompilation, and why kernel version compatibility is a solved problem for any tool built on this toolchain.</p>
<p>Get EP06 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&amp;linkname=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-maps-explained%2F&#038;title=eBPF%20Maps%20%E2%80%94%20The%20Persistent%20Data%20Layer%20Between%20Kernel%20and%20Userspace" data-a2a-url="https://linuxcent.com/ebpf-maps-explained/" data-a2a-title="eBPF Maps — The Persistent Data Layer Between Kernel and Userspace"></a></p><p>The post <a href="https://linuxcent.com/ebpf-maps-explained/">eBPF Maps — The Persistent Data Layer Between Kernel and Userspace</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-maps-explained/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1481</post-id>	</item>
		<item>
		<title>eBPF Program Types — What&#8217;s Actually Running on Your Nodes</title>
		<link>https://linuxcent.com/ebpf-program-types-kubernetes/</link>
					<comments>https://linuxcent.com/ebpf-program-types-kubernetes/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sun, 12 Apr 2026 09:51:22 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[bpftool]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[eBPF program types]]></category>
		<category><![CDATA[Falco]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[SRE]]></category>
		<guid isPermaLink="false">https://linuxcent.com/ebpf-program-types-kubernetes/</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 8</span> <span class="rt-label rt-postfix">minutes</span></span>What eBPF program types run on your Kubernetes nodes — XDP, TC, tracepoints, LSM explained through real SRE incidents using bpftool, Cilium, and Falco.</p>
<p>The post <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types — What&#8217;s Actually Running on Your Nodes</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 8</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel with a coloured left border. position:relative makes
   the <pre> the positioning context for the absolutely positioned
   .lc-copy-btn and .lc-lang-badge defined further down. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code: light grey pill. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
/* Code inside a <pre> drops the inline pill styling and inherits the panel colours. */
pre code{background:transparent;padding:0;color:inherit}
/* Left-border colour keyed on the data-lang attribute of the <pre>:
   shell dialects, then data/markup formats, then programming languages,
   then plain text/output. Any other value keeps the default #555 border. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned to the top-right corner of the <pre>, fully transparent
   (opacity:0) until the block is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* Colour change applied while the button carries the "copied" class. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left label, shown only on hover; pointer-events:none
   keeps it from intercepting clicks or text selection. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: full-width, collapsed borders, shaded header row, zebra-striped rows.
   NOTE(review): these are bare element selectors, so they apply to every
   table/th/td/tr on whatever page renders this markup — confirm that is intended. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
(function(){
  /*
   * Code-block enhancer: for every <pre> on the page, add a language badge
   * (when the inner <code> has a "language-xxx" class) and a "Copy" button.
   *
   * The window.__lcCodeEnhanced flag makes the script idempotent: if this
   * snippet is included more than once on a page, only the first copy runs,
   * so blocks never get duplicate buttons.
   */
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;

  /* How long (ms) a "Copied!" / "Failed" label stays before reverting to "Copy". */
  var RESET_MS=2000;

  /* Decorate every <pre> currently in the document. */
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){
        /* Language comes from a "language-xxx" class on the <code> element. */
        var m=(code.className||'').match(/language-(\S+)/);
        if(m)lang=m[1].toLowerCase();
      }
      if(lang){
        /* data-lang drives the per-language border colour in the stylesheet. */
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';
        badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.className='lc-copy-btn';
      btn.textContent='Copy';
      btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        /* Prefer the <code> text so the badge/button labels are not copied. */
        var text=code?code.innerText:pre.innerText;
        if(navigator.clipboard&&window.isSecureContext){
          /* Async Clipboard API first; on rejection fall back to execCommand. */
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }

  /* Show the success label and "copied" class, then revert after RESET_MS. */
  function ok(btn){
    btn.textContent='Copied!';
    btn.classList.add('copied');
    setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},RESET_MS);
  }

  /* Show the failure label, then revert after RESET_MS. */
  function fail(btn){
    btn.textContent='✗ Failed';
    setTimeout(function(){btn.textContent='Copy';},RESET_MS);
  }

  /*
   * Fallback copy path: select the text inside an off-screen <textarea> and
   * run document.execCommand('copy').
   *
   * execCommand reports an unsupported or disabled command by returning
   * false rather than throwing, so its return value decides which label is
   * shown. The textarea is removed in `finally` so it never stays attached
   * to the page when select() or execCommand throws.
   */
  function fb(text,btn){
    var ta=null;
    var copied=false;
    try{
      ta=document.createElement('textarea');
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{fail(btn);}
  }

  /* Run now if the DOM is already parsed, otherwise wait for DOMContentLoaded. */
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 4</em><br />
<em><a href="https://linuxcent.com/what-is-ebpf-linux-kubernetes/">What Is eBPF?</a> · <a href="https://linuxcent.com/bpf-verifier-kubernetes-safety/">The BPF Verifier</a> · <a href="https://linuxcent.com/ebpf-vs-kernel-modules-kubernetes/">eBPF vs Kernel Modules</a> · <strong>eBPF Program Types</strong></em></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img loading="lazy" decoding="async" width="2400" height="1828" src="https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2.png" alt="eBPF Program Types — tracing, networking, and security hook points across the Linux kernel" class="wp-image-2111" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2.png 2400w, https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2-300x229.png 300w, https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2-1024x780.png 1024w, https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2-768x585.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2-1536x1170.png 1536w, https://linuxcent.com/wp-content/uploads/2026/05/ep04-ebpf-program-types-og-2-2048x1560.png 2048w" sizes="auto, (max-width: 2400px) 100vw, 2400px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">Each eBPF program type attaches to a different kernel hook — from socket filters to LSM enforcement points.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li><code class="" data-line="">bpftool prog list</code> and <code class="" data-line="">bpftool net list</code> show every eBPF program on a node — run these first when debugging eBPF-based tool behavior</li>
<li>TC programs can stack on the same interface; stale programs from incomplete Cilium upgrades cause intermittent packet drops — check <code class="" data-line="">tc filter show</code> after every Cilium upgrade</li>
<li>XDP fires before <code class="" data-line="">sk_buff</code> allocation — fastest hook, but no pod identity; Cilium uses it for service load balancing, not pod policy</li>
<li>XDP silently falls back to generic mode on unsupported NICs — verify with <code class="" data-line="">ip link show | grep xdp</code></li>
<li>Tracepoints are stable across kernel versions; kprobe-based tools may silently break after node OS patches</li>
<li>LSM hooks enforce at the kernel level — what makes Tetragon&#8217;s enforcement mode fundamentally different from sidecar-based approaches</li>
</ul>
<hr />
<h2 id="the-big-picture">The Big Picture</h2>
<pre><code class="" data-line="">  WHERE eBPF PROGRAM TYPES ATTACH IN THE KERNEL

  NIC hardware
       ↓
  DMA → ring buffer
       ↓
  ┌─────────────────────────────────────────────────┐
  │  XDP hook  (Cilium: service load balancing)     │
  │  Sees: raw packet bytes only. No pod identity.  │
  └─────────────────────────┬───────────────────────┘
                            │ XDP_PASS
                            ▼
  sk_buff allocated
       ↓
  ┌─────────────────────────────────────────────────┐
  │  TC ingress hook  (Cilium: pod policy ingress)  │
  │  Sees: sk_buff + socket + cgroup → pod identity │
  └─────────────────────────┬───────────────────────┘
                            ↓
  netfilter / IP routing
       ↓
  socket → process (syscall boundary)
  ┌─────────────────────────────────────────────────┐
  │  Tracepoint / kprobe  (Falco: syscall monitor)  │
  │  Sees: any kernel event, any process, any pod   │
  └─────────────────────────────────────────────────┘
  ┌─────────────────────────────────────────────────┐
  │  LSM hook  (Tetragon: kernel-level enforcement) │
  │  Sees: security check context. Can DENY.        │
  └─────────────────────────────────────────────────┘
       ↓
  IP routing → qdisc
  ┌─────────────────────────────────────────────────┐
  │  TC egress hook  (Cilium: pod policy egress)    │
  │  Sees: socket + cgroup on outbound traffic      │
  └─────────────────────────────────────────────────┘
       ↓
  NIC → wire
</code></pre>
<hr />
<p>eBPF program types define where in the kernel a hook fires and what it can see — and knowing the difference is what makes you effective when Cilium or Falco behave unexpectedly. What the earlier episodes hadn&#8217;t answered — and what a 2am incident eventually forced me to work out — is what kind of eBPF programs are actually running on your nodes, and why the difference matters when something breaks.</p>
<p>A pod in production was dropping roughly one in fifty outbound TCP connections. Not all of them — just enough to cause intermittent timeouts in the application logs. NetworkPolicy showed egress allowed. Cilium reported no violations. Running <code class="" data-line="">curl</code> manually from inside the pod worked every time.</p>
<p>I spent the better part of three hours eliminating possibilities. DNS. MTU. Node-level conntrack table exhaustion. Upstream firewall rules. Nothing.</p>
<p>Eventually, almost as an afterthought, I ran this:</p>
<pre><code class="" data-line="">sudo bpftool prog list
</code></pre>
<p>There were two TC programs attached to that pod&#8217;s veth interface. One from the current Cilium version. One from the previous version — left behind by a rolling upgrade that hadn&#8217;t cleaned up properly. Two programs. Different policy state. One was occasionally dropping packets based on rules that no longer existed in the current policy model.</p>
<p>The answer had been sitting in the kernel the whole time. I just didn&#8217;t know where to look.</p>
<p>That incident forced me to actually understand something I&#8217;d been hand-waving for two years: eBPF isn&#8217;t a single hook. It&#8217;s a family of program types, each attached to a different location in the kernel, each seeing different data, each suited for different problems. Understanding the difference is what separates &#8220;I run Cilium and Falco&#8221; from &#8220;I understand what Cilium and Falco are actually doing on my nodes&#8221; — and that difference matters when something breaks at 2am.</p>
<h2 id="the-command-you-should-run-on-your-cluster-right-now">The Command You Should Run on Your Cluster Right Now</h2>
<p>Before getting into the theory, do this:</p>
<pre><code class="" data-line=""># See every eBPF program loaded on the node
sudo bpftool prog list

# See every eBPF program attached to a network interface
sudo bpftool net list
</code></pre>
<p>On a node running Cilium and Falco, you&#8217;ll see something like this:</p>
<pre><code class="" data-line="">42: xdp           name cil_xdp_entry       loaded_at 2026-04-01T09:23:41
43: sched_cls     name cil_from_netdev      loaded_at 2026-04-01T09:23:41
44: sched_cls     name cil_to_netdev        loaded_at 2026-04-01T09:23:41
51: cgroup_sock_addr  name cil_sock4_connect loaded_at 2026-04-01T09:23:41
88: raw_tracepoint  name sys_enter          loaded_at 2026-04-01T09:23:55
89: raw_tracepoint  name sys_exit           loaded_at 2026-04-01T09:23:55
</code></pre>
<p>Each line is a different program type. Each one fires at a different point in the kernel. The type column — <code class="" data-line="">xdp</code>, <code class="" data-line="">sched_cls</code>, <code class="" data-line="">raw_tracepoint</code>, <code class="" data-line="">cgroup_sock_addr</code> — tells you where in the kernel execution path that program is attached and therefore what it can and cannot see.</p>
<p>If you see more programs than you expect on a specific interface — like I did — that&#8217;s your first clue.</p>
<h2 id="why-program-types-exist">Why Program Types Exist</h2>
<p>The Linux kernel isn&#8217;t a single pipeline. Network packets, system calls, file operations, process scheduling — these all run through different subsystems with different execution contexts and different available data.</p>
<p>eBPF lets you attach programs to specific points within those subsystems. The &#8220;program type&#8221; is the contract: it defines where the hook fires, what data the program receives, and what it&#8217;s allowed to do with it. A program designed to process network packets before they hit the kernel stack looks completely different from one designed to intercept system calls across all containers simultaneously.</p>
<p>Most of us will interact with four or five program types through the tools we already run. Understanding what each one actually is — where it sits, what it sees — is what makes you effective when those tools behave unexpectedly.</p>
<h2 id="the-types-behind-the-tools-you-already-use">The Types Behind the Tools You Already Use</h2>
<h3 id="tc-why-cilium-can-tell-which-pod-sent-a-packet">TC — Why Cilium Can Tell Which Pod Sent a Packet</h3>
<p>TC stands for Traffic Control. It&#8217;s where Cilium enforces your NetworkPolicy, and it&#8217;s what caused my incident.</p>
<p>TC programs attach to network interfaces — specifically to the ingress and egress directions of the pod&#8217;s virtual interface (<code class="" data-line="">lxcXXXXX</code> in Cilium&#8217;s naming). They fire after the kernel has already processed the packet enough to know its context: which socket created it, which cgroup that socket belongs to. Cgroup maps to container, container maps to pod.</p>
<p>This is the critical piece: <strong>TC is how Cilium knows which pod a packet belongs to</strong>. Without that cgroup context, per-pod policy enforcement isn&#8217;t possible.</p>
<pre><code class="" data-line=""># See TC programs on a pod&#039;s veth interface
sudo tc filter show dev lxc12345 ingress
sudo tc filter show dev lxc12345 egress

# If you see two entries on the same direction — that&#039;s the incident I described
# The priority number (pref 1, pref 2) tells you the order they run
</code></pre>
<p>When there are two TC programs on the same interface, the first one to return &#8220;drop&#8221; wins. The second program never runs. This is why the issue was intermittent rather than consistent — the stale program only matched specific connection patterns.</p>
<p>Fixing it is straightforward once you know what to look for:</p>
<pre><code class="" data-line=""># Remove a stale TC filter by its priority number
sudo tc filter del dev lxc12345 egress pref 2
</code></pre>
<p>Add this check to your post-upgrade runbook. Cilium upgrades are generally clean but not always.</p>
<h3 id="xdp-why-cilium-doesnt-use-tc-for-everything">XDP — Why Cilium Doesn&#8217;t Use TC for Everything</h3>
<p>If TC is good enough for pod-level policy, why does Cilium also run an XDP program on the node&#8217;s main interface? Look at the <code class="" data-line="">bpftool prog list</code> output again — there&#8217;s an <code class="" data-line="">xdp</code> program loaded alongside the TC programs.</p>
<p>XDP fires earlier. Much earlier. Before the kernel allocates any memory for the packet. Before routing. Before connection tracking. Before anything.</p>
<p>The tradeoff is exactly what you&#8217;d expect: XDP is fast but context-poor. It sees raw packet bytes. It doesn&#8217;t know which pod the packet came from. It can&#8217;t read cgroup information because no socket buffer has been allocated yet.</p>
<p>Cilium uses XDP specifically for ClusterIP service load balancing — when a packet arrives at the node destined for a service VIP, XDP rewrites the destination to the actual pod IP in a single map lookup and sends it on its way. No iptables. No conntrack. The work is done before the kernel stack is involved.</p>
<p>There&#8217;s a silent failure mode worth knowing about here. XDP runs in one of two modes:</p>
<ul>
<li><strong>Native mode</strong> — runs inside the NIC driver itself, before any kernel allocation. This is where the performance comes from.</li>
<li><strong>Generic mode</strong> — fallback when the NIC driver doesn&#8217;t support XDP. Runs later, after <code class="" data-line="">sk_buff</code> allocation. No performance benefit over iptables.</li>
</ul>
<p>If your NIC doesn&#8217;t support native XDP, Cilium silently falls back to generic mode. The policy still works — but the performance characteristics you assumed aren&#8217;t there.</p>
<pre><code class="" data-line=""># Check which XDP mode is active on your node&#039;s main interface
ip link show eth0 | grep xdp
# xdpdrv  ← native mode (fast)
# xdpgeneric ← generic mode (no perf benefit)
</code></pre>
<p>Most cloud provider instance types with modern Mellanox/Intel NICs support native mode. Worth verifying rather than assuming.</p>
<h3 id="tracepoints-how-falco-sees-every-container">Tracepoints — How Falco Sees Every Container</h3>
<p>Falco loads two programs: <code class="" data-line="">sys_enter</code> and <code class="" data-line="">sys_exit</code>. These are raw tracepoints — they fire on every single system call, from every process, in every container on the node.</p>
<p>Tracepoints are explicitly defined and maintained instrumentation points in the kernel. Unlike hooks that attach to specific internal function names (which can be renamed or inlined between kernel versions), tracepoints are stable interfaces. They&#8217;re part of the kernel&#8217;s public contract with tooling that wants to instrument it.</p>
<p>This matters operationally. When you patch your nodes — and cloud-managed nodes get patched frequently — tools built on tracepoints keep working. Tools built on kprobes (internal function hooks) may silently stop firing if the function they&#8217;re attached to gets renamed or inlined by the compiler in a new kernel build.</p>
<pre><code class="" data-line=""># Verify what Falco is actually using
sudo bpftool prog list | grep -E &quot;kprobe|tracepoint&quot;

# Falco&#039;s current eBPF driver should show raw_tracepoint entries
# If you see kprobe entries from Falco, you&#039;re on the older driver
# Check: falco --version and the driver being loaded at startup
</code></pre>
<p>If you&#8217;re running Falco on a cluster that gets regular OS patch upgrades and you haven&#8217;t verified the driver mode, check it. The older kprobe-based driver has a real failure mode on certain kernel versions.</p>
<h3 id="lsm-how-tetragon-blocks-operations-at-the-kernel-level">LSM — How Tetragon Blocks Operations at the Kernel Level</h3>
<p>LSM hooks run at the kernel&#8217;s security decision points: file opens, socket connections, process execution, capability checks. The defining characteristic is that they can <em>deny</em> an operation. Return an error from an LSM hook and the kernel refuses the syscall before it completes.</p>
<p>This is qualitatively different from observability hooks. kprobes and tracepoints watch. LSM hooks enforce.</p>
<p>When you see Tetragon configured to kill a process attempting a privileged operation, or block a container from writing to a specific path, that&#8217;s an LSM hook making the decision inside the kernel — not a sidecar watching traffic, not an admission webhook running before pod creation, not a userspace agent trying to act fast enough. The enforcement is in the kernel itself.</p>
<pre><code class="" data-line=""># See if any LSM eBPF programs are active on the node
sudo bpftool prog list | grep lsm

# Verify LSM eBPF support on your kernel (required for Tetragon enforcement mode)
grep CONFIG_BPF_LSM /boot/config-$(uname -r)
# CONFIG_BPF_LSM=y   ← required
</code></pre>
<h2 id="the-practical-summary">The Practical Summary</h2>
<table>
<thead>
<tr>
<th>What&#8217;s happening on your node</th>
<th>Program type</th>
<th>Where to look</th>
</tr>
</thead>
<tbody>
<tr>
<td>Cilium service load balancing</td>
<td>XDP</td>
<td><code class="" data-line="">ip link show eth0 | grep xdp</code></td>
</tr>
<tr>
<td>Cilium pod network policy</td>
<td>TC (<code class="" data-line="">sched_cls</code>)</td>
<td><code class="" data-line="">tc filter show dev lxcXXXX egress</code></td>
</tr>
<tr>
<td>Falco syscall monitoring</td>
<td>Tracepoint</td>
<td><code class="" data-line="">bpftool prog list | grep tracepoint</code></td>
</tr>
<tr>
<td>Tetragon enforcement</td>
<td>LSM</td>
<td><code class="" data-line="">bpftool prog list | grep lsm</code></td>
</tr>
<tr>
<td>Anything unexpected</td>
<td>All types</td>
<td><code class="" data-line="">bpftool prog list</code>, <code class="" data-line="">bpftool net list</code></td>
</tr>
</tbody>
</table>
<h2 id="the-incident-revisited">The Incident, Revisited</h2>
<p>Three hours of debugging. The answer was a stale TC program sitting at priority 2 on a pod&#8217;s veth interface, left behind by an incomplete Cilium upgrade.</p>
<pre><code class="" data-line=""># What I should have run first
sudo bpftool net list
sudo tc filter show dev lxc12345 egress
</code></pre>
<p>Two commands. Thirty seconds. If I&#8217;d known that TC programs can stack on the same interface, I&#8217;d have started there.</p>
<p>That&#8217;s the point of understanding program types — not to write eBPF programs yourself, but to know where to look when the tools you depend on don&#8217;t behave the way you expect. The programs are already there, running on your nodes right now. <code class="" data-line="">bpftool prog list</code> shows you all of them.</p>
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li><code class="" data-line="">bpftool prog list</code> and <code class="" data-line="">bpftool net list</code> show every eBPF program on a node — run these before anything else when debugging eBPF-based tool behavior</li>
<li>TC programs can stack on the same interface; stale programs from incomplete Cilium upgrades cause intermittent drops — check <code class="" data-line="">tc filter show</code> after every Cilium upgrade</li>
<li>XDP runs before the kernel stack — fastest hook, but no pod identity; Cilium uses it for service load balancing, not pod policy</li>
<li>XDP silently falls back to generic mode on unsupported NICs — verify with <code class="" data-line="">ip link show | grep xdp</code></li>
<li>Tracepoints are stable across kernel versions; kprobe-based tools may silently break after node OS patches — verify your Falco driver mode</li>
<li>LSM hooks enforce at the kernel level — this is what makes Tetragon&#8217;s enforcement mode fundamentally different from sidecar-based approaches</li>
</ul>
<h2 id="whats-next">What&#8217;s Next</h2>
<p>Every eBPF program fires, does its work, and exits — but the work always involves data.</p>
<p>Counting connections. Tracking processes. Streaming events to a detection engine. In EP05, I&#8217;ll cover eBPF maps: the persistent data layer that connects kernel programs to the tools consuming their output. Understanding maps explains a class of production issues — and makes <code class="" data-line="">bpftool map dump</code> useful rather than cryptic.</p>
<p>Get EP05 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&amp;linkname=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-program-types-kubernetes%2F&#038;title=eBPF%20Program%20Types%20%E2%80%94%20What%E2%80%99s%20Actually%20Running%20on%20Your%20Nodes" data-a2a-url="https://linuxcent.com/ebpf-program-types-kubernetes/" data-a2a-title="eBPF Program Types — What’s Actually Running on Your Nodes"></a></p><p>The post <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types — What&#8217;s Actually Running on Your Nodes</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-program-types-kubernetes/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1450</post-id>	</item>
		<item>
		<title>The Runtime Reckoning: Dockershim Out, eBPF In, and PSP Finally Dies (2022–2023)</title>
		<link>https://linuxcent.com/kubernetes-dockershim-removal-ebpf/</link>
					<comments>https://linuxcent.com/kubernetes-dockershim-removal-ebpf/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Wed, 08 Apr 2026 19:31:17 +0000</pubDate>
				<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[Cloud Native]]></category>
		<category><![CDATA[Container Runtime]]></category>
		<category><![CDATA[Dockershim]]></category>
		<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Kubernetes Security]]></category>
		<guid isPermaLink="false">https://linuxcent.com/kubernetes-dockershim-removal-ebpf/</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span>Dockershim was removed in v1.24. PSP was deleted in v1.25. eBPF displaced iptables in the network stack. Trace the runtime reckoning that modernized Kubernetes infrastructure.</p>
<p>The post <a href="https://linuxcent.com/kubernetes-dockershim-removal-ebpf/">The Runtime Reckoning: Dockershim Out, eBPF In, and PSP Finally Dies (2022–2023)</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code-block, copy-button, language-badge and table styling for article content. */
/* NOTE(review): every line of this stylesheet previously ended in a literal br tag,
   apparently injected by the CMS auto-formatter. Those tags are not valid CSS and made
   the parser discard the selectors and declarations that followed them. Confirm this
   style block is excluded from that formatter so the tags do not come back. */
/* Dark panel for preformatted code; position:relative anchors the absolutely
   positioned badge and copy button declared further down. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code chips; the chip look is reset when code is nested inside a pre panel. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent colour per language family, selected by the data-lang attribute. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned top-right, invisible until the panel is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* State applied through the "copied" class. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: pinned top-left, also revealed on hover; pointer-events:none keeps
   it from intercepting clicks or text selection. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: collapsed borders, shaded header row, zebra striping. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
// Adds a language badge and a "Copy" button to every pre element on the page.
// Wrapped in an IIFE so the only global it touches is the window.__lcCodeEnhanced flag.
// NOTE(review): deliberately written without blank lines; the surrounding markup looks
// like it passes through an auto-paragraph formatter. Confirm before reformatting.
(function(){
  // Run at most once per page, even if this snippet is embedded several times.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Decorates each pre: a data-lang attribute plus badge when its code child carries a
  // "language-xxx" class, and a copy button in every case.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){
        var m=(code.className||'').match(/language-(\S+)/);
        if(m)lang=m[1].toLowerCase();
      }
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';
        badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type so the button can never act as a submit control inside a form.
      btn.type='button';
      btn.className='lc-copy-btn';
      btn.textContent='Copy';
      btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        // Without a code child the pre itself is the source; ownText() keeps the
        // injected button label out of the copied text.
        var text=code?code.innerText:ownText(pre);
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Returns the rendered text of pre without the controls this script injected.
  // innerText skips elements that are not rendered, so the controls are hidden for the
  // duration of the read and their inline display value is restored afterwards.
  function ownText(pre){
    var injected=pre.querySelectorAll('.lc-copy-btn,.lc-lang-badge');
    var saved=[];
    injected.forEach(function(el){saved.push(el.style.display);el.style.display='none';});
    var text=pre.innerText;
    injected.forEach(function(el,i){el.style.display=saved[i];});
    return text;
  }
  // Success feedback: swap the label and add the "copied" class for two seconds.
  function ok(btn){
    btn.textContent='Copied!';
    btn.classList.add('copied');
    setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);
  }
  // Fallback copy path for pages without the async Clipboard API (or outside a secure
  // context): select the text in an off-screen textarea and issue the legacy copy command.
  function fb(text,btn){
    var ta=null;
    try{
      ta=document.createElement('textarea');
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      // execCommand reports failure by returning false rather than throwing; treat
      // that as an error so the user is not told "Copied!" when nothing was copied.
      if(!document.execCommand('copy'))throw new Error('copy command rejected');
      ok(btn);
    }catch(e){
      btn.textContent='✗ Failed';
      setTimeout(function(){btn.textContent='Copy';},2000);
    }finally{
      // Always remove the helper textarea, including on the failure path.
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
  }
  // Enhance immediately when the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<hr />
<h2 id="introduction">Introduction</h2>
<p>2022 is the year Kubernetes dealt with its legacy. The Docker shim that everyone had been warned about for two years was actually removed. PodSecurityPolicy — the broken security primitive that clusters had depended on since 1.3 — was deleted. And eBPF started displacing iptables as the networking substrate.</p>
<p>These weren&#8217;t additions to Kubernetes. They were the removal of technical debt accumulated over eight years. And the migrations they forced were the most operationally significant events since RBAC went stable.</p>
<hr />
<h2 id="kubernetes-124-dockershim-removed-may-2022">Kubernetes 1.24 — Dockershim Removed (May 2022)</h2>
<p>The dockershim was removed in 1.24. The deprecation had been announced in 1.20 (December 2020) — 18 months of warning. It didn&#8217;t matter. Operators who hadn&#8217;t migrated still scrambled.</p>
<p>The actual migration was straightforward for most environments:</p>
<pre><code class="" data-line=""># On each node, before upgrading to 1.24:
# 1. Install containerd
apt-get install -y containerd.io

# 2. Configure containerd
containerd config default | tee /etc/containerd/config.toml
# Edit: set SystemdCgroup = true in runc options

# 3. Update kubelet to use containerd socket
# /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
# Add: --container-runtime-endpoint=unix:///run/containerd/containerd.sock

# 4. Restart
systemctl daemon-reload &amp;&amp; systemctl restart kubelet
</code></pre>
<p>What the migration revealed: how many teams were depending on the Docker socket being present on nodes. Tools that mounted <code class="" data-line="">/var/run/docker.sock</code> to talk to the Docker daemon — build tools, CI agents, some monitoring agents — broke. The ecosystem had to adapt to <code class="" data-line="">nerdctl</code> (containerd&#8217;s Docker-compatible CLI), Kaniko, Buildah, or mounting the containerd socket instead.</p>
<p>Other 1.24 highlights:</p>
<ul>
<li><strong>Beta APIs disabled by default</strong>: New beta features would no longer be enabled automatically. This reversed a long-standing policy that had caused too many production clusters to accidentally pick up unstable features</li>
<li><strong>gRPC probes</strong> stable: Liveness and readiness probes could now use gRPC health checks natively — no more writing HTTP wrapper endpoints for gRPC services</li>
<li><strong>Non-graceful node shutdown</strong> alpha: Handle the case where the node disappears without the kubelet getting to gracefully terminate pods — stateful workloads on node failure</li>
</ul>
<hr />
<h2 id="kubernetes-125-psp-removed-august-2022">Kubernetes 1.25 — PSP Removed (August 2022)</h2>
<p>PodSecurityPolicy was deleted in 1.25. Every cluster that was still using PSP had to migrate to Pod Security Admission (or OPA/Gatekeeper or Kyverno) before upgrading.</p>
<p>Pod Security Admission was GA in 1.25, ready to take over:</p>
<pre><code class="" data-line=""># Enforce restricted policy on a namespace
kubectl label namespace production \
  pod-security.kubernetes.io/enforce=restricted \
  pod-security.kubernetes.io/enforce-version=v1.25

# Test a pod against the policy without enforcing
kubectl label namespace staging \
  pod-security.kubernetes.io/warn=restricted \
  pod-security.kubernetes.io/audit=restricted
</code></pre>
<p>The dry-run modes (warn, audit) were critical for migration: you could enable them on namespaces and watch what would have been rejected before switching to enforce mode.</p>
<p>The real migration challenge was existing workloads running as root, with privileged security contexts, or with hostPath mounts. The restricted policy rejected all of these. Production applications that had been running for years under permissive PSP policies now failed validation.</p>
<p>Also in 1.25:</p>
<ul>
<li><strong>Ephemeral containers</strong> stable: Attach a debug container to a running pod without restarting it</li>
</ul>
<pre><code class="" data-line=""># Debug a running pod with no shell
kubectl debug -it nginx-pod --image=busybox:latest --target=nginx
</code></pre>
<ul>
<li><strong>CSI ephemeral volumes</strong> stable</li>
<li><strong>cgroups v2</strong> (unified hierarchy) support stable: Enables memory QoS, improved resource accounting</li>
</ul>
<hr />
<h2 id="kubernetes-126-structured-parameter-scheduling-storage-december-2022">Kubernetes 1.26 — Structured Parameter Scheduling, Storage (December 2022)</h2>
<p>1.26 focused on the scheduler and storage:</p>
<ul>
<li><strong>Dynamic Resource Allocation</strong> alpha: A generalization of the device plugin API — allows requesting complex resources (GPUs, FPGAs, network adapters) with scheduling constraints. The foundation for AI/ML workload scheduling on heterogeneous hardware</li>
<li><strong>CrossNamespacePVCDataSource</strong> beta: Clone a PVC across namespaces — enables namespace-based data isolation while sharing data sets</li>
<li><strong>Pod scheduling readiness</strong> alpha: A pod can declare that it&#8217;s not ready to be scheduled until external conditions are met (data pre-loading complete, license validated, etc.)</li>
<li><strong>Removal of in-tree cloud provider code</strong> (beta, continued): A long-running effort to move cloud-provider-specific code out of the core Kubernetes binary</li>
</ul>
<p>The Dynamic Resource Allocation feature deserves emphasis: it&#8217;s the mechanism that makes Kubernetes a serious platform for GPU scheduling in AI/ML workloads. Device plugins (the prior mechanism) had limitations — a pod either got a GPU or it didn&#8217;t. DRA allows richer resource semantics: this pod needs two GPUs on the same PCIe bus, or this pod needs a specific GPU model.</p>
<hr />
<h2 id="ebpf-reshapes-kubernetes-networking">eBPF Reshapes Kubernetes Networking</h2>
<p>The most significant architectural shift in Kubernetes networking during 2022–2023 wasn&#8217;t a Kubernetes release feature. It was the adoption of eBPF-based CNI solutions — primarily Cilium — as the default networking layer in major managed Kubernetes offerings.</p>
<p><strong>The iptables problem</strong>: kube-proxy has been using iptables rules to implement Service routing since Kubernetes 1.0. Every Service adds iptables rules to every node. At 10,000 services, the iptables rule table on each node has hundreds of thousands of rules. Traversing these rules on every packet is O(n). Updating them requires locking and flushing. At scale, iptables becomes a bottleneck.</p>
<p><strong>The eBPF solution</strong>: Cilium replaces kube-proxy entirely, implementing Service routing using eBPF maps — hash tables in kernel memory. Service lookup is O(1). Rule updates don&#8217;t require locking. Network policy enforcement happens in the kernel, before packets even reach the application.</p>
<pre><code class="" data-line=""># Check if Cilium is running in kube-proxy replacement mode
cilium status | grep &quot;KubeProxy replacement&quot;
# KubeProxy replacement:    True

# eBPF-based service map — inspect directly
cilium service list
# ID   Frontend          Service Type   Backend
# 1    10.96.0.1:443     ClusterIP      10.0.0.5:6443
# 2    10.96.0.10:53     ClusterIP      10.0.1.2:53, 10.0.1.3:53
</code></pre>
<p><strong>Network policy enforcement</strong>: Cilium&#8217;s NetworkPolicy implementation enforces rules at the eBPF layer — packets that would be dropped by policy are dropped before they ever leave the kernel, before they touch the pod&#8217;s network stack. This is both faster and more secure than userspace enforcement.</p>
<p><strong>Hubble</strong>: Cilium&#8217;s observability layer — built on the same eBPF probes — provides real-time network flow visibility, HTTP layer observability (which service called which endpoint, response codes), and DNS query logging without any application changes.</p>
<p>Major adoption milestones:</p>
<ul>
<li>GKE&#8217;s default CNI became Cilium (Dataplane V2) in 2021</li>
<li>Amazon EKS added Cilium support</li>
<li>Azure AKS enabled Cilium-based networking</li>
<li>Google&#8217;s Autopilot clusters use Cilium exclusively</li>
</ul>
<hr />
<h2 id="kubernetes-127-graceful-failure-in-place-resize-alpha-april-2023">Kubernetes 1.27 — Graceful Failure, In-Place Resize Alpha (April 2023)</h2>
<ul>
<li><strong>In-Place Pod Vertical Scaling</strong> alpha: Change the CPU and memory resources of a running container without restarting the pod. For databases, JVM-based applications, and anything with warm caches, live resizing is a significant operational improvement</li>
</ul>
<pre><code class="" data-line=""># Resize a container&#039;s CPU without restart
kubectl patch pod database-pod --type=&#039;json&#039; \
  -p=&#039;[{&quot;op&quot;: &quot;replace&quot;, &quot;path&quot;: &quot;/spec/containers/0/resources/requests/cpu&quot;, &quot;value&quot;: &quot;2&quot;}]&#039;
</code></pre>
<ul>
<li><strong>SeccompDefault</strong> stable: Enable the default seccomp profile (RuntimeDefault) cluster-wide — a meaningful reduction in the default syscall attack surface for all pods</li>
<li><strong>Mutable scheduling directives for Jobs</strong> stable: Change node affinity and tolerations of pending (not yet running) Job pods</li>
<li><strong>ReadWriteOncePod PersistentVolume access mode</strong> stable: A volume can only be mounted by a single pod at a time — the correct semantic for databases with file-level locking requirements</li>
</ul>
<hr />
<h2 id="the-15-million-lines-removed-cloud-provider-code-migration">The 1.5 Million Lines Removed: Cloud Provider Code Migration</h2>
<p>One of the largest ongoing engineering efforts in Kubernetes 1.26–1.31 was the removal of in-tree cloud provider code. Every major cloud provider (AWS, Azure, GCP, OpenStack, vSphere) had code compiled directly into the Kubernetes control plane binaries.</p>
<p>The result: the Kubernetes API server and controller manager binaries contained code for AWS EBS volumes, GCE persistent disks, Azure managed disks, OpenStack Cinder — regardless of which cloud you were running on.</p>
<p>The migration moved this code to external Cloud Controller Managers (CCM) — separate processes that communicate with the API server like any other controller:</p>
<pre><code class="" data-line="">Before: kube-controller-manager (monolithic, includes all cloud providers)
After:  kube-controller-manager (generic) + cloud-controller-manager (cloud-specific, external)
</code></pre>
<p>By 1.31, approximately <strong>1.5 million lines of code</strong> had been removed from the core binaries, reducing binary sizes by approximately 40%. This is the largest refactor in Kubernetes history.</p>
<hr />
<h2 id="gateway-api-replacing-ingress-20222023">Gateway API: Replacing Ingress (2022–2023)</h2>
<p>The Ingress API, which graduated to stable in 1.19, has fundamental limitations:</p>
<ul>
<li>No support for TCP/UDP routing (HTTP only)</li>
<li>No traffic splitting between multiple backends</li>
<li>No header-based routing</li>
<li>Vendor-specific features implemented via annotations (not portable)</li>
<li>No RBAC granularity within a single Ingress resource</li>
</ul>
<p><strong>Gateway API</strong> (kubernetes-sigs/gateway-api) was designed as the successor, with a role-based model:</p>
<pre><code class="" data-line="">GatewayClass  → Managed by infrastructure provider (cluster admin)
Gateway       → Managed by cluster operators
HTTPRoute     → Managed by application developers
</code></pre>
<pre><code class="" data-line=""># Gateway — cluster operator configures the load balancer
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: production-gateway
spec:
  gatewayClassName: nginx
  listeners:
  - name: https
    port: 443
    protocol: HTTPS
    tls:
      mode: Terminate
      certificateRefs:
      - name: tls-cert

---
# HTTPRoute — application team configures routing
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: api-route
spec:
  parentRefs:
  - name: production-gateway
  rules:
  - matches:
    - path:
        type: PathPrefix
        value: /api/v2
    backendRefs:
    - name: api-v2-service
      port: 8080
      weight: 90
    - name: api-v3-canary
      port: 8080
      weight: 10
</code></pre>
<p>Gateway API reached GA (v1.0) in October 2023, with the core HTTPRoute, Gateway, and GatewayClass resources graduating to stable.</p>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Dockershim removal in 1.24 completed the CRI migration that started in 1.5 — the Kubernetes runtime interface is now clean, with containerd and CRI-O as the standard runtimes</li>
<li>PSP removal in 1.25 forced a migration that should have happened years earlier; Pod Security Admission&#8217;s simplicity is a feature, not a limitation</li>
<li>eBPF-based networking (Cilium, Dataplane V2) is now the default in GKE and increasingly in EKS and AKS — O(1) service routing and kernel-level policy enforcement replace the iptables approach that dated to Kubernetes 1.0</li>
<li>Dynamic Resource Allocation (1.26 alpha) is the foundation for AI/ML GPU scheduling — more capable than device plugins and designed for heterogeneous hardware requests</li>
<li>Gateway API reaching GA replaced the annotation-driven, non-portable Ingress API with a role-oriented, extensible routing API</li>
<li>The cloud provider code removal (1.5M lines) is the largest refactor in Kubernetes history, a prerequisite for a maintainable, leaner core</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p><a href="ep05-security-hardens.md">← EP05: Security Hardens</a> | <a href="ep07-platform-engineering.md">EP07: Platform Engineering Era →</a></p>
<p><em>Series: Kubernetes: From Borg to Platform Engineering | linuxcent.com</em></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" 
href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&amp;linkname=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Fkubernetes-dockershim-removal-ebpf%2F&#038;title=The%20Runtime%20Reckoning%3A%20Dockershim%20Out%2C%20eBPF%20In%2C%20and%20PSP%20Finally%20Dies%20%282022%E2%80%932023%29" data-a2a-url="https://linuxcent.com/kubernetes-dockershim-removal-ebpf/" data-a2a-title="The Runtime Reckoning: Dockershim Out, eBPF In, and PSP Finally Dies (2022–2023)"></a></p><p>The post <a href="https://linuxcent.com/kubernetes-dockershim-removal-ebpf/">The Runtime Reckoning: Dockershim Out, eBPF In, and PSP Finally Dies (2022–2023)</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/kubernetes-dockershim-removal-ebpf/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1656</post-id>	</item>
	</channel>
</rss>

<!--
Performance optimized by W3 Total Cache. Learn more: https://www.boldgrid.com/w3-total-cache/?utm_source=w3tc&utm_medium=footer_comment&utm_campaign=free_plugin

Page Caching using Disk: Enhanced 

Served from: linuxcent.com @ 2026-07-03 02:30:26 by W3 Total Cache
-->