<?xml version="1.0" encoding="UTF-8"?>
<source>
  <jobs>
    <job>
      <externalid>198d64d4-207</externalid>
      <Title>Senior/Staff Site Reliability Engineer</Title>
      <Description><![CDATA[<p>You are a seasoned SRE who keeps production infrastructure running at scale. You own the reliability and availability of customer-facing systems, from Kubernetes clusters to deployment pipelines to the networking layer that connects it all. You think in SLOs, automate ruthlessly, and treat every incident as a chance to make the system better.</p>
<p><strong>Key Responsibilities</strong></p>
<ul>
<li>Own and operate our Kubernetes infrastructure: cluster lifecycle, upgrades, networking, and multi-tenant isolation for customer workloads</li>
</ul>
<ul>
<li>Build and maintain CI/CD pipelines and deployment infrastructure</li>
</ul>
<ul>
<li>Leverage AI to an extreme level to automate analysis and resolution of production issues, and improve software development speed, reliability and maintainability</li>
</ul>
<ul>
<li>Build dashboards, alerting, and anomaly detection across our systems</li>
</ul>
<ul>
<li>Define and enforce SLOs and build out incident response processes</li>
</ul>
<ul>
<li>Manage and improve our networking, load balancing, and service mesh configurations</li>
</ul>
<ul>
<li>Drive reliability improvements across the stack through automation, runbooks, and chaos engineering</li>
</ul>
<p><strong>Requirements</strong></p>
<ul>
<li>5+ years of experience managing critical production systems and software development workflows</li>
</ul>
<ul>
<li>Strong production experience setting up and operating Kubernetes at scale, using infrastructure-as-code (Terraform, Ansible)</li>
</ul>
<ul>
<li>Deep knowledge of Linux networking, container networking (CNI plugins, VXLAN, BGP), and DNS</li>
</ul>
<ul>
<li>Experience building CI/CD systems and GitOps workflows (FluxCD, ArgoCD)</li>
</ul>
<ul>
<li>Proficiency in Python and either Go or Bash for tooling and automation</li>
</ul>
<ul>
<li>Strong experience with logging, monitoring and alerting (Prometheus, Grafana, Loki, Thanos, VictoriaMetrics, Datadog)</li>
</ul>
<ul>
<li>Excellent communication and ability to drive technical decisions across teams</li>
</ul>
<ul>
<li>Self-starter who executes quickly, takes ownership, and constantly seeks improvement</li>
</ul>
<p><strong>Nice to have</strong></p>
<ul>
<li>Experience with managing GPU and AI/ML workloads</li>
</ul>
<ul>
<li>Experience with kernel-based monitoring and routing (eBPF, XDP)</li>
</ul>
<ul>
<li>Experience with security tooling (Falco, Coroot, SIEM)</li>
</ul>
<ul>
<li>Experience with bare metal Kubernetes networking (Calico, Cilium, MetalLB)</li>
</ul>
<ul>
<li>Experience with distributed storage systems (Ceph, Longhorn, etc.)</li>
</ul>
<p><strong>Compensation</strong></p>
<ul>
<li>$180,000-250,000 plus equity + benefits</li>
</ul>
<p><strong>Benefits</strong></p>
<ul>
<li>Interesting and challenging work</li>
</ul>
<ul>
<li>A lot of learning and growth opportunities</li>
</ul>
<ul>
<li>Regular team events and offsites</li>
</ul>
<ul>
<li>Health, dental, and vision insurance (US)</li>
</ul>
<ul>
<li>Visa sponsorship and relocation assistance</li>
</ul>
<p style="margin-top:24px;font-size:13px;color:#666;">XML job scraping automation by <a href="https://yubhub.co">YubHub</a></p>]]></Description>
      <Jobtype>full-time</Jobtype>
      <Experiencelevel>senior</Experiencelevel>
      <Workarrangement>onsite</Workarrangement>
      <Salaryrange>$180,000-250,000</Salaryrange>
      <Skills>Kubernetes, Infrastructure-as-code, Linux networking, Container networking, CI/CD systems, GitOps workflows, Python, Go, Bash, Logging, Monitoring, Alerting, GPU and AI/ML workloads, Kernel-based monitoring and routing, Security tooling, Bare metal Kubernetes networking, Distributed storage systems</Skills>
      <Category>Engineering</Category>
      <Industry>Technology</Industry>
      <Employername>Fal</Employername>
      <Employerlogo>https://logos.yubhub.co/fal.com.png</Employerlogo>
      <Employerdescription>Fal is a technology company that operates in the San Francisco area.</Employerdescription>
      <Employerwebsite>https://fal.com</Employerwebsite>
      <Compensationcurrency></Compensationcurrency>
      <Compensationmin></Compensationmin>
      <Compensationmax></Compensationmax>
      <Applyto>https://job-boards.greenhouse.io/fal/jobs/4146019009</Applyto>
      <Location>San Francisco</Location>
      <Country></Country>
      <Postedate>2026-04-24</Postedate>
    </job>
  </jobs>
</source>