{"version":"0.1","company":{"name":"YubHub","url":"https://yubhub.co","jobsUrl":"https://yubhub.co/jobs/skill/observability-tooling"},"x-facet":{"type":"skill","slug":"observability-tooling","display":"Observability Tooling","count":15},"x-feed-size-limit":100,"x-feed-sort":"enriched_at desc","x-feed-notice":"This feed contains at most 100 jobs (the most recently enriched). For the full corpus, use the paginated /stats/by-facet endpoint or /search.","x-generator":"yubhub-xml-generator","x-rights":"Free to redistribute with attribution: \"Data by YubHub (https://yubhub.co)\"","x-schema":"Each entry in `jobs` follows https://schema.org/JobPosting. YubHub-native raw fields carry `x-` prefix.","jobs":[{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_70e2591f-d7d"},"title":"Technical Program Manager, Infrastructure","description":"<p>As a Technical Program Manager for Infrastructure, you&#39;ll work across multiple infrastructure domains to coordinate complex programs that have broad organisational impact. You&#39;ll be solving novel scaling challenges at the frontier of what&#39;s possible, all while maintaining the security and reliability our mission demands.</p>\n<p>Developer Productivity &amp; Tooling</p>\n<ul>\n<li>Drive cross-functional programs to improve developer environments, CI/CD infrastructure, and release processes that enable rapid innovation while maintaining high security standards</li>\n</ul>\n<ul>\n<li>Coordinate large-scale migrations and platform modernization efforts across engineering teams</li>\n</ul>\n<ul>\n<li>Partner with teams to measure and improve developer productivity metrics, identifying bottlenecks and driving systematic improvements</li>\n</ul>\n<ul>\n<li>Lead initiatives to integrate AI tools into development workflows, helping Anthropic be at the forefront of AI-assisted research and engineering</li>\n</ul>\n<p>Infrastructure Reliability &amp; Operations</p>\n<ul>\n<li>Drive programs to establish and achieve reliability targets across training infrastructure and production services</li>\n</ul>\n<ul>\n<li>Coordinate incident response improvements, post-mortem processes, and on-call rotations that help teams operate effectively</li>\n</ul>\n<ul>\n<li>Establish metrics and dashboards to track infrastructure health, capacity utilisation, and operational excellence</li>\n</ul>\n<p>Cross-functional Coordination</p>\n<ul>\n<li>Serve as the critical bridge between infrastructure teams, research, and product, translating technical complexities into clear updates for a variety of audiences</li>\n</ul>\n<ul>\n<li>Consult with stakeholders to deeply understand infrastructure, data, and compute needs, identifying solutions to support frontier research and product development</li>\n</ul>\n<ul>\n<li>Drive alignment on priorities and timelines across teams with competing constraints</li>\n</ul>\n<p>You&#39;ll be a good fit if you have 5+ years of technical program management experience, with a track record of successfully delivering complex infrastructure programs in ML/AI systems or large-scale distributed systems. You&#39;ll also need a deep technical understanding of infrastructure systems, strong stakeholder management skills, and the ability to navigate competing priorities-confirming data-driven technical decisions.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_70e2591f-d7d","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5111783008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$290,000-$365,000 USD","x-skills-required":["Kubernetes","Cloud platforms (AWS, GCP, Azure)","ML infrastructure (GPU/TPU/Trainium clusters)","Developer productivity initiatives","CI/CD systems","Infrastructure scaling"],"x-skills-preferred":["Observability tooling and practices","AI tools to improve engineering productivity","Research teams and translating their needs into concrete technical requirements"],"datePosted":"2026-04-18T15:57:52.097Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY | Seattle, WA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Kubernetes, Cloud platforms (AWS, GCP, Azure), ML infrastructure (GPU/TPU/Trainium clusters), Developer productivity initiatives, CI/CD systems, Infrastructure scaling, Observability tooling and practices, AI tools to improve engineering productivity, Research teams and translating their needs into concrete technical requirements","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":290000,"maxValue":365000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_f4cd384f-6ed"},"title":"Senior Software Engineer, Release Engineering","description":"<p>We are seeking a Senior Software Engineer to join our Release Engineering team, focused on building and improving the systems that enable automated, reliable, and scalable software delivery across Temporal&#39;s platform.</p>\n<p>In this role, you will participate in the full software lifecycle , from design and implementation to deployment and long-term operation , and will collaborate with engineering teams to evolve release automation, improve tooling, and reduce manual steps in how we build and ship Temporal.</p>\n<p>Key responsibilities include designing, building, and maintaining tools and systems that support release automation and deployment workflows, writing clean, reliable, and concurrent code that supports distributed systems, collaborating with cross-functional teams to understand and improve release quality and developer productivity, documenting technical designs, deployment practices, and operational procedures, and participating in small-team design reviews and contributing practical engineering solutions.</p>\n<p>As a Senior Software Engineer, you will have the opportunity to explore new ways to use Temporal to power the release and deployment lifecycle, deepen your understanding of Temporal&#39;s architecture and service interactions, and experiment with new automation patterns, testing strategies, and workflow designs that increase release confidence.</p>\n<p>To be successful in this role, you will need strong coding ability, especially in languages used at Temporal (e.g., Go, Java, or similar), a solid understanding of concurrency, distributed systems, and multi-threaded programming, experience contributing to backend systems, tooling, infrastructure, or developer workflows, a track record of solving moderately complex problems with reliable, maintainable solutions, and the ability to collaborate effectively in a remote, fast-paced environment.</p>\n<p>Additionally, you will have familiarity with release automation concepts, CI/CD pipelines, build tools, or deployment orchestration, experience with cloud environments (AWS, GCP) and container tooling, and exposure to distributed systems orchestration, observability tooling, or platform engineering.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_f4cd384f-6ed","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Temporal","sameAs":"https://temporal.io/","logo":"https://logos.yubhub.co/temporal.io.png"},"x-apply-url":"https://job-boards.greenhouse.io/temporaltechnologies/jobs/5090613007","x-work-arrangement":"remote","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$176,000 - $237,600","x-skills-required":["Go","Java","Concurrency","Distributed Systems","Multi-threaded Programming","Backend Systems","Tooling","Infrastructure","Developer Workflows","Release Automation","CI/CD Pipelines","Build Tools","Deployment Orchestration","Cloud Environments","Container Tooling","Distributed Systems Orchestration","Observability Tooling","Platform Engineering"],"x-skills-preferred":[],"datePosted":"2026-04-18T15:57:07.513Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"United States - Remote Opportunity"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Go, Java, Concurrency, Distributed Systems, Multi-threaded Programming, Backend Systems, Tooling, Infrastructure, Developer Workflows, Release Automation, CI/CD Pipelines, Build Tools, Deployment Orchestration, Cloud Environments, Container Tooling, Distributed Systems Orchestration, Observability Tooling, Platform Engineering","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":176000,"maxValue":237600,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_baad2598-8bc"},"title":"Staff / Senior Software Engineer, Compute Capacity","description":"<p><strong>About the Role</strong></p>\n<p>Anthropic&#39;s Accelerator Capacity Engineering (ACE) team manages one of the largest and fastest-growing accelerator fleets in the industry. As an engineer on ACE, you will build the production systems that power this work: data pipelines that ingest and normalize telemetry from heterogeneous cloud environments, observability tooling that gives the org real-time visibility into fleet health, and performance instrumentation that measures how efficiently every major workload uses the hardware it’s running on.</p>\n<p><strong>What This Team Owns</strong></p>\n<p>The team’s work spans three functional areas: data infrastructure, fleet observability, and compute efficiency. Depending on your background and interests, you’ll focus primarily in one, but the boundaries are fluid and the problems overlap:</p>\n<p><strong>Data Infrastructure</strong></p>\n<p>Collecting, normalizing, and serving the fleet-wide data that powers everything else. This means building pipelines that ingest occupancy and utilization telemetry from Kubernetes clusters, normalizing billing and usage data across cloud providers, and maintaining the BigQuery layer that the rest of the org queries against.</p>\n<p><strong>Fleet Observability</strong></p>\n<p>Making the state of the accelerator fleet legible and actionable in real time. This means building cluster health tooling, capacity planning platforms, alerting on occupancy drops and allocation problems, and driving systemic improvements to scheduling and fragmentation.</p>\n<p><strong>Compute Efficiency</strong></p>\n<p>Measuring and improving how effectively every major workload uses the hardware it’s running on. This means instrumenting utilization metrics across training, inference, and eval systems, building benchmarking infrastructure, establishing per-config baselines, and collaborating directly with system-owning teams to close efficiency gaps.</p>\n<p><strong>What You’ll Do</strong></p>\n<ul>\n<li>Build and operate data pipelines that ingest accelerator occupancy, utilization, and cost data from multiple cloud providers into BigQuery.</li>\n<li>Develop and maintain observability infrastructure , Prometheus recording rules, Grafana dashboards, and alerting systems , that surface actionable signals about fleet health, occupancy, and efficiency.</li>\n<li>Instrument and analyze compute efficiency metrics across training, inference, and eval workloads.</li>\n<li>Build internal tooling and platforms that enable capacity planning, workload attribution, and cluster debugging.</li>\n<li>Operate Kubernetes-native systems at scale , deploying data collection agents, managing workload labeling infrastructure, and understanding how taints, reservations, and scheduling affect capacity.</li>\n<li>Normalize and reconcile data across heterogeneous sources , including AWS, GCP, and Azure billing exports, vendor-specific telemetry formats, and internal systems with different schemas and billing arrangements.</li>\n</ul>\n<p><strong>You May Be a Good Fit If You Have</strong></p>\n<ul>\n<li>5+ years of software engineering experience with a strong track record building and operating production systems.</li>\n<li>Kubernetes fluency at operational depth , you’ve operated production K8s at meaningful scale, not just written manifests.</li>\n<li>Data pipeline engineering experience , designing, building, and owning the full lifecycle of production data pipelines.</li>\n<li>Observability tooling experience , Prometheus, PromQL, and Grafana are in the critical path for this team.</li>\n<li>Python and SQL at production quality.</li>\n<li>Familiarity with at least one major cloud provider (AWS, GCP, or Azure) at the infrastructure level , compute, billing, usage APIs, cost management tooling.</li>\n</ul>\n<p><strong>Strong Candidates May Also Have</strong></p>\n<ul>\n<li>Multi-cloud data ingestion experience , especially working with AWS and GCP APIs, billing exports, or vendor-specific telemetry formats.</li>\n<li>Accelerator infrastructure familiarity , GPU metrics (DCGM), TPU utilization, Trainium power and utilization metrics, or experience working with ML training/inference systems at the hardware level.</li>\n<li>Performance engineering and benchmarking experience , building benchmark harnesses, establishing baselines, reasoning about compute efficiency (FLOPs utilization, memory bandwidth, interconnect throughput), and working with system teams to diagnose and improve performance.</li>\n<li>Data-as-product thinking , experience building internal data products with self-service access, schema contracts, API serving, documentation,</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_baad2598-8bc","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.co/","logo":"https://logos.yubhub.co/anthropic.co.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5126702008","x-work-arrangement":"onsite","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["Kubernetes","Python","SQL","Prometheus","Grafana","BigQuery","Cloud computing","Data pipeline engineering","Observability tooling"],"x-skills-preferred":["Multi-cloud data ingestion","Accelerator infrastructure","Performance engineering","Data-as-product thinking"],"datePosted":"2026-04-18T15:56:02.706Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Kubernetes, Python, SQL, Prometheus, Grafana, BigQuery, Cloud computing, Data pipeline engineering, Observability tooling, Multi-cloud data ingestion, Accelerator infrastructure, Performance engineering, Data-as-product thinking"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_067a9092-157"},"title":"Manager, Software Engineering - Observability","description":"<p>We are seeking a Manager, Software Engineering - Observability to lead our team of engineers responsible for the reliability, scalability, and evolution of Figma&#39;s observability and cost engineering platforms.</p>\n<p>As a key member of our engineering team, you will own and operate Figma&#39;s core observability stack, including vendor platforms such as Datadog, ensuring high availability, strong data quality, and effective signal-to-noise across metrics, logs, and traces.</p>\n<p>You will define and drive the technical strategy for instrumentation standards, observability libraries, agents, and operators used to monitor internal and external facing services. You will also explore and implement innovative, AI-driven approaches to anomaly detection, root cause analysis, signal correlation, and operational automation.</p>\n<p>In addition, you will establish clear frameworks for cost attribution, budgeting, forecasting, and alerting across infrastructure and observability spend, enabling teams to make informed tradeoffs.</p>\n<p>You will partner with infrastructure, product engineering, finance, and security teams to improve visibility into system health and cost efficiency at scale.</p>\n<p>You will lead initiatives to optimize observability footprint and spend, balancing depth of insight with performance and cost considerations.</p>\n<p>You will coach and mentor engineers through career development, performance feedback, and technical leadership, fostering a culture of ownership, collaboration, and high-quality execution.</p>\n<p>We are looking for someone with 4+ years of experience leading infrastructure, observability, or platform engineering teams, with a track record of delivering highly reliable production systems.</p>\n<p>You should have deep hands-on experience with modern observability platforms (e.g., Datadog, OpenTelemetry) across metrics, logs, and distributed tracing.</p>\n<p>You should have a strong understanding of distributed systems, instrumentation best practices, SLO design, and incident response workflows.</p>\n<p>Experience driving cost transparency and accountability initiatives, including cost attribution, budgeting, forecasting, and alerting in cloud environments is also required.</p>\n<p>Preferred skills include experience designing or evolving company-wide observability standards, shared libraries, and agent/operator-based integrations, background in cost optimization for infrastructure or observability tooling, including vendor negotiations and usage modeling, and experience applying AI or machine learning techniques to anomaly detection, root cause analysis, or operational automation.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_067a9092-157","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Figma","sameAs":"https://www.figma.com/","logo":"https://logos.yubhub.co/figma.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/figma/jobs/5807963004","x-work-arrangement":"remote","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$258,000-$376,000 USD","x-skills-required":["observability","datadog","opentelemetry","distributed systems","instrumentation best practices","slo design","incident response workflows","cost transparency","accountability initiatives","cost attribution","budgeting","forecasting","alerting"],"x-skills-preferred":["designing or evolving company-wide observability standards","shared libraries","agent/operator-based integrations","cost optimization for infrastructure or observability tooling","vendor negotiations","usage modeling","applying ai or machine learning techniques to anomaly detection","root cause analysis","operational automation"],"datePosted":"2026-04-18T15:55:20.408Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA • New York, NY • United States"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"observability, datadog, opentelemetry, distributed systems, instrumentation best practices, slo design, incident response workflows, cost transparency, accountability initiatives, cost attribution, budgeting, forecasting, alerting, designing or evolving company-wide observability standards, shared libraries, agent/operator-based integrations, cost optimization for infrastructure or observability tooling, vendor negotiations, usage modeling, applying ai or machine learning techniques to anomaly detection, root cause analysis, operational automation","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":258000,"maxValue":376000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_b2637f59-e14"},"title":"Full-Stack Software Engineer, Reinforcement Learning","description":"<p>As a Full-Stack Software Engineer in RL, you&#39;ll build the platforms, tools, and interfaces that power environment creation, data collection, and training observability. The quality of Claude&#39;s next generation depends on the quality of the data we train it on , and the systems you build are what make that data possible. You&#39;ll own product surfaces end-to-end , from backend services and APIs to the web UIs that researchers, external vendors, and thousands of data labelers use every day.\\n\\nYou don&#39;t need a background in ML research. What matters is that you can take an ambiguous, high-stakes problem and ship a polished, reliable product against it, fast. This team moves very quickly. Claude writes a lot of the code we commit, which means the bottleneck isn&#39;t typing , it&#39;s judgment, taste, and the ability to react to what researchers need next.\\n\\nYou&#39;ll iterate on data collection strategies to distill the knowledge of thousands of human experts around the world into our models, and you&#39;ll do it in a loop that closes in hours and days, not quarters or months.\\n\\nAnthropic&#39;s Reinforcement Learning organization leads the research and development that trains Claude to be capable, reliable, and safe. We&#39;ve contributed to every Claude model, with significant impact on the autonomy and coding capabilities of our most advanced models.\\n\\nOur work spans teaching models to use computers effectively, advancing code generation through RL, pioneering fundamental RL research for large language models, and building the scalable training methodologies behind our frontier production models.\\n\\nThe RL org is organized around four goals: solving the science of long-horizon tasks and continual learning, scaling RL data and environments to be comprehensive and diverse, automating software engineering end-to-end, and training the frontier production model.\\n\\nOur engineering teams build the environments, evaluation systems, data pipelines, and tooling that make all of this possible , from realistic agentic training environments and scalable code data generation to human data collection platforms and production training operations.\\n\\n### Responsibilities\\n\\n<em>   Build and extend web platforms for RL environment creation, management, and quality review , including environment configuration, versioning, and validation workflows\\n</em>   Develop vendor-facing interfaces and tooling that let external partners create, submit, and iterate on training environments with minimal friction\\n<em>   Design and implement platforms for human data collection at scale, including labeling workflows, quality assurance systems, and feedback mechanisms that surface reward signal integrity issues early\\n</em>   Build evaluation dashboards and observability UIs that give researchers real-time insight into environment quality, training run health, and reward hacking\\n<em>   Create backend services and APIs that connect environment authoring tools, data collection systems, and RL training infrastructure\\n</em>   Build and expand scalable code data generation pipelines, producing diverse programming tasks with robust reward signals across languages and difficulty levels\\n<em>   Develop onboarding automation and documentation tooling so new vendors and internal users ramp up in hours, not weeks\\n</em>   Partner closely with RL researchers, data operations, and vendor management to translate ambiguous requirements into well-scoped, well-designed products\\n\\n### Requirements\\n\\n<em>   Strong software engineering fundamentals and real full-stack range , you&#39;re comfortable owning a surface from database schema to frontend\\n</em>   Proficient in Python and a modern web stack (React, TypeScript, or similar)\\n<em>   Track record of shipping systems that solved a hard problem, not just shipped on time , e.g. you built the thing that made your team 10x faster, or the internal tool nobody thought was possible\\n</em>   Operate with high agency: you identify what needs to be done and drive it forward without waiting for a ticket\\n<em>   Found yourself wondering &quot;why isn&#39;t this moving faster?&quot; in previous roles , and then have done something about it\\n</em>   Care about UX and can build interfaces that are intuitive for both technical researchers and non-technical labelers\\n<em>   Communicate clearly with researchers, operations teams, and engineers, and can turn vague asks into well-scoped work\\n</em>   Thrive in a fast-moving environment where priorities shift, Claude is your pair programmer, and the next problem is often one nobody has solved before\\n<em>   Care about Anthropic&#39;s mission to build safe, beneficial AI and want your work to contribute directly to it\\n\\n### Nice to Have\\n\\n</em>   Built data collection, labeling, or annotation platforms , ideally ones that had to scale across many vendors or many task types\\n<em>   Background building multi-tenant platforms with role-based access, audit trails, and vendor management workflows\\n</em>   Experience with cloud infrastructure (GCP or AWS), Docker, and CI/CD pipelines\\n<em>   Familiarity with LLM training, fine-tuning, or evaluation workflows\\n</em>   Experience with async Python (Trio, asyncio) or high-throughput API design\\n<em>   Background in dashboards, monitoring, or observability tooling\\n</em>   Experience working directly with external vendors or partners on technical integrations\\n<em>   A background that isn&#39;t a straight line , e.g. math or physics into SWE, competitive programming, research into engineering, or a side project that outgrew its scope\\n\\n### Representative Projects\\n\\n</em>   Building a unified platform for human data collection that integrates labeling workflows, vendor management, and QA for complex agentic tasks\\n<em>   Developing vendor onboarding automation that handles Docker registry access, API token management, and environment validation\\n</em>   Creating evaluation and observability dashboards that catch reward hacks, measure environment difficulty, and give real-time feedback during production training\\n<em>   Building environment quality review workflows that let researchers browse, grade, and provide feedback on training environments\\n</em>   Developing automated environment quality pipelines that validate correctness and difficulty calibration before environments hit production training\\n*   Building internal tools for browsing and analyzing training run results, environment statistics, and data collection progress</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_b2637f59-e14","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5186067008","x-work-arrangement":"hybrid","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$300,000-$405,000 USD","x-skills-required":["Python","Modern web stack","React","TypeScript","Strong software engineering fundamentals","Full-stack range","Database schema","Frontend","Cloud infrastructure","Docker","CI/CD pipelines","LLM training","Fine-tuning","Evaluation workflows","Async Python","High-throughput API design","Dashboards","Monitoring","Observability tooling"],"x-skills-preferred":["Data collection","Labeling","Annotation platforms","Multi-tenant platforms","Role-based access","Audit trails","Vendor management workflows"],"datePosted":"2026-04-18T15:54:27.784Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Python, Modern web stack, React, TypeScript, Strong software engineering fundamentals, Full-stack range, Database schema, Frontend, Cloud infrastructure, Docker, CI/CD pipelines, LLM training, Fine-tuning, Evaluation workflows, Async Python, High-throughput API design, Dashboards, Monitoring, Observability tooling, Data collection, Labeling, Annotation platforms, Multi-tenant platforms, Role-based access, Audit trails, Vendor management workflows","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":300000,"maxValue":405000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_984a9e19-247"},"title":"Product Operations Associate, Air Defense","description":"<p>As a Product Operations Associate on the Air Defence team, your mission is to maintain the health of our deployed Air Defence technology. You will be responsible for identifying, triaging, escalating and managing resolution of all incidents across our deployed fleet of systems.</p>\n<p>You should have an aptitude for debugging and an appetite for real-time response, rapid resolution and root-causing complex issues on electromechanical systems. If you are passionate about ground-breaking technology, contributing to the national security mission, interacting alongside professionals that span a wide-range of disciplines, and providing best-in-class product operations oversight, Anduril is interested in speaking with you.</p>\n<p>Your duties will include sustaining Anduril&#39;s Air Defence deployments by combining an understanding of our customers&#39; missions with deep knowledge of our products and integrations. You will also triage, diagnose and conduct root cause analysis of product incidents; drive post-mortem actions including providing status visibility through resolution.</p>\n<p>Additionally, you will collect, organise and analyse system failure data to define trends, drive proactive sustainment processes and support resource allocation. You will consistently assess and ratchet up the quality of the fleet&#39;s observability and telemetry in partnership with product teams.</p>\n<p>You will communicate and coordinate technical and non-technical efforts across multiple business, engineering and sustainment functions, influencing decision making and driving action to maximise capability availability for end users.</p>\n<p>You will support Anduril&#39;s global customers through proactive communications and detail-oriented execution. You will perform maintenance on Anduril hardware through use of Linux command line function.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_984a9e19-247","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anduril Industries","sameAs":"https://www.andurilindustries.com/","logo":"https://logos.yubhub.co/andurilindustries.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/andurilindustries/jobs/4972456007","x-work-arrangement":"onsite","x-experience-level":"mid","x-job-type":"full-time","x-salary-range":"$77,000-$102,000 USD","x-skills-required":["Technical support experience","Incident driven workflows","On-call support operations","Linux command line function","Observability tooling","Software development tooling"],"x-skills-preferred":["BA or BS degree","Applicable industry certifications","DOD, Law Enforcement, or other Government agency experience","Demonstrated experience as a self-starter","Strong aptitude for problem solving"],"datePosted":"2026-04-18T15:48:03.345Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Irvine, California, United States"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Technical support experience, Incident driven workflows, On-call support operations, Linux command line function, Observability tooling, Software development tooling, BA or BS degree, Applicable industry certifications, DOD, Law Enforcement, or other Government agency experience, Demonstrated experience as a self-starter, Strong aptitude for problem solving","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":77000,"maxValue":102000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_1b773e5c-b51"},"title":"IT Systems Engineer, Corporate Systems & Infrastructure","description":"<p>About the role ---------------- The Corporate Infrastructure team builds and operates the platform layer the rest of IT Engineering runs on , cloud infrastructure hosting our internal services, the CI/CD that ships IT&#39;s own code, the observability stack across the corporate environment, and the cross-system automation that wires together tools never designed to talk to each other.</p>\n<p>You&#39;ll build deployment pipelines and internal tooling that let IT Engineering ship like a product team. You&#39;ll define SLOs for corporate services, build the monitoring to know when we&#39;re missing them, and run on-call for the things you deploy. You&#39;ll partner with our network and AV engineers as their infrastructure counterpart , automating physical-world systems, building the telemetry that tells us an office is degraded before someone files a ticket. The scope is broad and the team is deliberately small, which means you&#39;ll need depth across cloud, CI, and observability, strong judgment about where to invest, and a bias toward infrastructure-as-code over heroic manual fixes.</p>\n<p>Responsibilities ---------------</p>\n<ul>\n<li>Build and operate the cloud infrastructure that hosts IT&#39;s internal services</li>\n<li>Design CI/CD pipelines that let IT Engineering ship through code review and automated testing</li>\n<li>Own observability for corporate infrastructure , monitoring, alerting, dashboards, and SLOs</li>\n<li>Write cross-system automation to integrate third-party systems and internal services</li>\n<li>Partner with network, audiovisual, and physical security to deliver robust infrastructure solutions</li>\n<li>Build internal tools , CLIs, bots, dashboards , that make other IT engineers faster</li>\n<li>Run on-call for corporate infrastructure with post-incident reviews that drive durable fixes</li>\n<li>Deploy infrastructure as code</li>\n</ul>\n<p>Requirements ------------</p>\n<ul>\n<li>8+ years building secure IT systems in complex environments</li>\n<li>Excel at solving ambiguous problems with multiple stakeholders</li>\n<li>Communicate technical concepts clearly to any audience</li>\n<li>View IT Engineering as requiring product engineering rigor</li>\n<li>Successfully deliver complex projects from conception to production</li>\n<li>Write clear documentation as a natural part of your workflow</li>\n<li>Have shipped Infrastructure as Code in production , Terraform or similar, with modules and state you maintained</li>\n<li>Have run services with SLOs, on-call rotations, and post-incident reviews</li>\n<li>Have built internal platforms or tooling that other engineers depend on</li>\n</ul>\n<p>Strong candidates may also -------------------------------</p>\n<ul>\n<li>Have transformed traditional IT operations into engineering-driven organizations</li>\n<li>Have built strong partnerships with Security and Engineering teams</li>\n<li>Practice modern development methods (code reviews, testing, CI/CD)</li>\n<li>Work effectively in distributed teams</li>\n<li>Have experience with ECS, Kubernetes or other container orchestration for internal services</li>\n<li>Have automated physical-world infrastructure deployment (e.g., network configuration, office technology, physical security systems)</li>\n<li>Have worked with enterprise integration or workflow automation platforms (e.g., Workato, n8n, Tines, or equivalents)</li>\n</ul>\n<p>Technical Skills ----------------</p>\n<ul>\n<li>Python, golang, etc</li>\n<li>Terraform and Infrastructure as Code</li>\n<li>Cloud platforms (AWS, GCP, Azure)</li>\n<li>CI/CD pipeline design</li>\n<li>Observability tooling (e.g., Prometheus, Grafana, Datadog, Honeycomb, or equivalent)</li>\n<li>Linux systems administration</li>\n<li>Strong networking skills</li>\n<li>Configuration management</li>\n</ul>\n<p>Experience Level: senior Employment Type: full-time Workplace Type: remote Category: Engineering Industry: Technology Salary Range: $275,000-$325,000 USD Required Skills:</p>\n<ul>\n<li>Python</li>\n<li>Terraform</li>\n<li>Cloud platforms</li>\n<li>CI/CD pipeline design</li>\n<li>Observability tooling</li>\n<li>Linux systems administration</li>\n<li>Strong networking skills</li>\n<li>Configuration management</li>\n</ul>\n<p>Preferred Skills:</p>\n<ul>\n<li>golang</li>\n<li>ECS</li>\n<li>Kubernetes</li>\n<li>Enterprise integration or workflow automation platforms</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_1b773e5c-b51","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/4887952008","x-work-arrangement":"remote","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$275,000-$325,000 USD","x-skills-required":["Python","Terraform","Cloud platforms","CI/CD pipeline design","Observability tooling","Linux systems administration","Strong networking skills","Configuration management"],"x-skills-preferred":["golang","ECS","Kubernetes","Enterprise integration or workflow automation platforms"],"datePosted":"2026-04-18T15:40:30.321Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Remote-Friendly (Travel-Required) | San Francisco, CA | Seattle, WA | New York City, NY"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Python, Terraform, Cloud platforms, CI/CD pipeline design, Observability tooling, Linux systems administration, Strong networking skills, Configuration management, golang, ECS, Kubernetes, Enterprise integration or workflow automation platforms","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":275000,"maxValue":325000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_76fd624c-e23"},"title":"Full-Stack Software Engineer, Reinforcement Learning","description":"<p>As a Full-Stack Software Engineer in RL, you&#39;ll build the platforms, tools, and interfaces that power environment creation, data collection, and training observability. The quality of Claude&#39;s next generation depends on the quality of the data we train it on , and the systems you build are what make that data possible. You&#39;ll own product surfaces end-to-end , from backend services and APIs to the web UIs that researchers, external vendors, and thousands of data labelers use every day. You don&#39;t need a background in ML research. What matters is that you can take an ambiguous, high-stakes problem and ship a polished, reliable product against it, fast.</p>\n<p>This team moves very quickly. Claude writes a lot of the code we commit, which means the bottleneck isn&#39;t typing , it&#39;s judgment, taste, and the ability to react to what researchers need next. You&#39;ll iterate on data collection strategies to distill the knowledge of thousands of human experts around the world into our models, and you&#39;ll do it in a loop that closes in hours and days, not quarters or months.</p>\n<p>Our work spans teaching models to use computers effectively, advancing code generation through RL, pioneering fundamental RL research for large language models, and building the scalable training methodologies behind our frontier production models. The RL org is organized around four goals: solving the science of long-horizon tasks and continual learning, scaling RL data and environments to be comprehensive and diverse, automating software engineering end-to-end, and training the frontier production model.</p>\n<p>Our engineering teams build the environments, evaluation systems, data pipelines, and tooling that make all of this possible , from realistic agentic training environments and scalable code data generation to human data collection platforms and production training operations.</p>\n<p>Responsibilities:</p>\n<ul>\n<li>Build and extend web platforms for RL environment creation, management, and quality review , including environment configuration, versioning, and validation workflows</li>\n<li>Develop vendor-facing interfaces and tooling that let external partners create, submit, and iterate on training environments with minimal friction</li>\n<li>Design and implement platforms for human data collection at scale, including labeling workflows, quality assurance systems, and feedback mechanisms that surface reward signal integrity issues early</li>\n<li>Build evaluation dashboards and observability UIs that give researchers real-time insight into environment quality, training run health, and reward hacking</li>\n<li>Create backend services and APIs that connect environment authoring tools, data collection systems, and RL training infrastructure</li>\n<li>Build and expand scalable code data generation pipelines, producing diverse programming tasks with robust reward signals across languages and difficulty levels</li>\n<li>Develop onboarding automation and documentation tooling so new vendors and internal users ramp up in hours, not weeks</li>\n<li>Partner closely with RL researchers, data operations, and vendor management to translate ambiguous requirements into well-scoped, well-designed products</li>\n</ul>\n<p>You May Be a Good Fit If You:</p>\n<ul>\n<li>Have strong software engineering fundamentals and real full-stack range , you&#39;re comfortable owning a surface from database schema to frontend</li>\n<li>Are proficient in Python and a modern web stack (React, TypeScript, or similar)</li>\n<li>Have a track record of shipping systems that solved a hard problem, not just shipped on time , e.g. you built the thing that made your team 10x faster, or the internal tool nobody thought was possible</li>\n<li>Operate with high agency: you identify what needs to be done and drive it forward without waiting for a ticket</li>\n<li>Have found yourself wondering &quot;why isn&#39;t this moving faster?&quot; in previous roles , and then have done something about it</li>\n<li>Care about UX and can build interfaces that are intuitive for both technical researchers and non-technical labelers</li>\n<li>Communicate clearly with researchers, operations teams, and engineers, and can turn vague asks into well-scoped work</li>\n<li>Thrive in a fast-moving environment where priorities shift, Claude is your pair programmer, and the next problem is often one nobody has solved before</li>\n<li>Care about Anthropic&#39;s mission to build safe, beneficial AI and want your work to contribute directly to it</li>\n</ul>\n<p>Strong Candidates May Also Have:</p>\n<ul>\n<li>Built data collection, labeling, or annotation platforms , ideally ones that had to scale across many vendors or many task types</li>\n<li>Background building multi-tenant platforms with role-based access, audit trails, and vendor management workflows</li>\n<li>Experience with cloud infrastructure (GCP or AWS), Docker, and CI/CD pipelines</li>\n<li>Familiarity with LLM training, fine-tuning, or evaluation workflows</li>\n<li>Experience with async Python (Trio, asyncio) or high-throughput API design</li>\n<li>Background in dashboards, monitoring, or observability tooling</li>\n<li>Experience working directly with external vendors or partners on technical integrations</li>\n<li>A background that isn&#39;t a straight line , e.g. math or physics into SWE, competitive programming, research into engineering, or a side project that outgrew its scope</li>\n</ul>\n<p>Representative Projects:</p>\n<ul>\n<li>Building a unified platform for human data collection that integrates labeling workflows, vendor management, and QA for complex agentic tasks</li>\n<li>Developing vendor onboarding automation that handles Docker registry access, API token management, and environment validation</li>\n<li>Creating evaluation and observability dashboards that catch reward hacks, measure environment difficulty, and give real-time feedback during production training</li>\n<li>Building environment quality review workflows that let researchers browse, grade, and provide feedback on training environments</li>\n<li>Developing automated environment quality pipelines that validate correctness and difficulty calibration before environments hit production training</li>\n<li>Building internal tools for browsing and analyzing training run results, environment statistics, and data collection progress</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_76fd624c-e23","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5186067008","x-work-arrangement":"hybrid","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$300,000-$405,000 USD","x-skills-required":["Python","Modern web stack","React","TypeScript","Cloud infrastructure","Docker","CI/CD pipelines","LLM training","Fine-tuning","Evaluation workflows","Async Python","High-throughput API design","Dashboards","Monitoring","Observability tooling"],"x-skills-preferred":["Data collection","Labeling","Annotation","Multi-tenant platforms","Role-based access","Audit trails","Vendor management workflows"],"datePosted":"2026-04-18T15:39:16.596Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Python, Modern web stack, React, TypeScript, Cloud infrastructure, Docker, CI/CD pipelines, LLM training, Fine-tuning, Evaluation workflows, Async Python, High-throughput API design, Dashboards, Monitoring, Observability tooling, Data collection, Labeling, Annotation, Multi-tenant platforms, Role-based access, Audit trails, Vendor management workflows","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":300000,"maxValue":405000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_5579e8fb-227"},"title":"Senior AI Engineer","description":"<p>We&#39;re looking for a Senior AI Engineer who is obsessed with building AI systems that actually work in production: reliable, observable, cost-efficient, and genuinely useful. This is not a research role. You will ship AI-powered features that process real financial data for real businesses.</p>\n<p>LLM &amp; AI Pipeline Engineering - Design, build, and maintain production-grade LLM integration pipelines , including retrieval-augmented generation (RAG), prompt engineering, output parsing, and chain orchestration.</p>\n<p>Develop and operate AI features within Jeeves&#39;s core financial products: spend categorization, document extraction, anomaly detection, financial Q&amp;A, and automated reconciliation.</p>\n<p>Implement structured output validation, fallback handling, and confidence scoring to ensure AI decisions meet reliability standards for financial use cases.</p>\n<p>Evaluate and integrate AI frameworks and tools (LangChain, LlamaIndex, OpenAI API, Anthropic API, HuggingFace, vector databases) and advocate for the right tool for the job.</p>\n<p>Establish prompt versioning and evaluation practices to ensure AI outputs remain accurate and consistent as models and data evolve.</p>\n<p>Retrieval &amp; Vector Search - Design and maintain vector search pipelines using databases such as Pinecone, Weaviate, or pgvector to power semantic search and RAG-based features.</p>\n<p>Build document ingestion and chunking pipelines for Jeeves&#39;s financial data , processing invoices, receipts, policy documents, and transaction records.</p>\n<p>Optimize retrieval quality through embedding model selection, chunk strategy, metadata filtering, and re-ranking techniques.</p>\n<p>ML Model Serving &amp; Operations - Collaborate with data scientists to take trained ML models from experimental notebooks to production serving infrastructure.</p>\n<p>Build and maintain model serving endpoints with appropriate latency SLOs, input validation, and output monitoring.</p>\n<p>Implement model performance monitoring and data drift detection to ensure production models remain accurate over time.</p>\n<p>Support model retraining workflows by designing clean data pipelines and feature engineering that can be continuously updated.</p>\n<p>Backend Integration &amp; Reliability - Integrate AI services cleanly with Jeeves&#39;s backend microservices , designing clear API contracts, circuit breakers, and graceful degradation patterns.</p>\n<p>Write high-quality, testable backend code in Python or Go/Node.js to power AI-integrated features.</p>\n<p>Instrument AI components with structured logging, distributed tracing, latency dashboards, and alerting to ensure operational visibility.</p>\n<p>Build human-in-the-loop review workflows for AI decisions that require oversight , particularly for high-value financial actions.</p>\n<p>Collaboration &amp; Growth - Partner with Product, Backend Engineering, and Data Science to define the AI roadmap and translate requirements into reliable systems.</p>\n<p>Contribute to a culture of quality by writing design docs, reviewing peers&#39; AI system designs, and sharing learnings openly.</p>\n<p>Help grow the AI engineering practice at Jeeves by establishing patterns, tooling, and best practices that the broader team can build on.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_5579e8fb-227","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Jeeves","sameAs":"https://www.jeeves.com/","logo":"https://logos.yubhub.co/jeeves.com.png"},"x-apply-url":"https://jobs.lever.co/tryjeeves/2f00206f-6091-4eed-8b5f-1325afdbfe30","x-work-arrangement":"remote","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["LLM pipeline engineering","RAG architecture","ML system operation","Python programming","AI orchestration framework","ML model serving infrastructure","Observability tooling"],"x-skills-preferred":["Fintech experience","Prompt evaluation frameworks","ML lifecycle management tools","Real-time data streaming"],"datePosted":"2026-04-17T12:38:27.085Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Brazil"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Finance","skills":"LLM pipeline engineering, RAG architecture, ML system operation, Python programming, AI orchestration framework, ML model serving infrastructure, Observability tooling, Fintech experience, Prompt evaluation frameworks, ML lifecycle management tools, Real-time data streaming"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_076e80d8-7d1"},"title":"Senior/Staff Software Engineer- Application Platform","description":"<p>About the Role\nWe are hiring engineers to build and own Flow&#39;s Application Platform,the layer that sits between product engineering and infrastructure. The team is focused on staying AI first, this role will expect you to be at the forefront of utilizing AI agents and tools to accomplish tasks.</p>\n<p>Responsibilities</p>\n<ul>\n<li>Build and maintain service frameworks and shared libraries</li>\n<li>Drive AI adoption, championing AI tools, harnesses, and agentic software development, and establishing standards for APIs, background jobs, and event-driven systems.</li>\n<li>Improve developer workflows (local development, testing, CI/CD, service scaffolding)</li>\n<li>Partner with infrastructure teams to help define how services run in production</li>\n<li>Ensure services are observable, reliable, and production-ready by default</li>\n<li>Drive adoption of platform standards across the organization</li>\n<li>Contribute to system design and architecture across multiple domains</li>\n</ul>\n<p>Qualifications</p>\n<ul>\n<li>A minimum 10 years in software engineering, site reliability engineering, or platform engineering</li>\n<li>Advocate for the integration of AI throughout the SDLC to accelerate and improve delivery velocity.</li>\n<li>Strong experience building platforms and paved paths for systems in production environments</li>\n<li>Experience with distributed systems and service-oriented architectures</li>\n<li>Proficiency in modern programming languages such as, Kotlin, Typescript, Go, Python</li>\n<li>Familiarity with cloud infrastructure and containerized environments (e.g., GCP, Kubernetes, Docker)</li>\n<li>Experience building shared libraries, frameworks, or internal tooling</li>\n<li>Ability to work across teams and influence technical direction</li>\n</ul>\n<p>Nice to Have</p>\n<ul>\n<li>Familiarity with observability tooling (metrics, logging, tracing)</li>\n<li>Experience with CI/CD systems and developer tooling</li>\n<li>Exposure to high-scale or multi-service environments</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_076e80d8-7d1","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Flow","sameAs":"https://flow.com/","logo":"https://logos.yubhub.co/flow.com.png"},"x-apply-url":"https://jobs.lever.co/flowlife/78f65d23-1b87-4b3f-b78b-55de6ef5f010","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$220,000-270,000 per year","x-skills-required":["Kotlin","Typescript","Go","Python","Cloud infrastructure","Containerized environments","Distributed systems","Service-oriented architectures","Shared libraries","Frameworks","Internal tooling"],"x-skills-preferred":["Observability tooling","CI/CD systems","Developer tooling","High-scale or multi-service environments"],"datePosted":"2026-04-17T12:34:40.435Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"New York"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Kotlin, Typescript, Go, Python, Cloud infrastructure, Containerized environments, Distributed systems, Service-oriented architectures, Shared libraries, Frameworks, Internal tooling, Observability tooling, CI/CD systems, Developer tooling, High-scale or multi-service environments","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":220000,"maxValue":270000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_2a88ee59-dc6"},"title":"Full Stack Engineer (Serverless)","description":"<p>We&#39;re building the fastest and most scalable infrastructure for AI inference. As a Full Stack Engineer on Serverless, you will build the core product across frontend and backend that powers our Serverless platform. This is a deeply product-focused role where you will work side-by-side with Product and Infrastructure to design and ship reusable, scalable systems that enterprise customers rely on in production every day.</p>\n<p>You will be a foundational technical owner of our Serverless product as it scales to thousands of enterprise customers, with real responsibility, autonomy, and impact. This is a chance to help build a new product vertical from the ground up inside a company that is already scaling at rocket-ship speed.</p>\n<p>Your responsibilities will include:</p>\n<ul>\n<li>Building and maintaining core Serverless UI features (dashboards, logs, observability, configuration, usage)</li>\n<li>Designing and implementing backend APIs that power the Serverless product experience</li>\n<li>Improving performance, reliability, and scalability of customer-facing systems</li>\n<li>Working closely with Infrastructure to ensure product features align with platform capabilities</li>\n<li>Owning features end-to-end, from design through production and iteration</li>\n</ul>\n<p>We&#39;re looking for a strong experience working across both frontend and backend, proficiency with TypeScript, Python, Postgres, and Next.js, and experience owning features end-to-end in production systems. Ability to context switch between UI, backend, and performance work, product-minded engineer who values clean abstractions and long-term maintainability, comfortable working in a fast-moving, low-process environment.</p>\n<p>Nice to have experience building developer platforms or infrastructure-adjacent products, familiarity with observability tooling (logging, metrics, tracing) in production environments, background in distributed systems, container orchestration, or cloud-native architectures, experience with real-time systems, streaming logs, or high-throughput data pipelines, exposure to technologies such as Kubernetes, Prometheus, Datadog, gRPC, or similar systems, entrepreneurial mindset and strong ownership mentality.</p>\n<p>We offer interesting and challenging work, competitive salary and equity, a lot of learning and growth opportunities, visa sponsorship and relocation assistance, health, dental, and vision insurance, regular team events and offsite.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_2a88ee59-dc6","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Fal","sameAs":"https://www.fal.com/","logo":"https://logos.yubhub.co/fal.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/fal/jobs/4112697009","x-work-arrangement":"onsite","x-experience-level":"mid","x-job-type":"full-time","x-salary-range":"$150,000 - $230,000 + equity + comprehensive benefits package","x-skills-required":["TypeScript","Python","Postgres","Next.js","serverless","backend APIs","frontend development"],"x-skills-preferred":["observability tooling","distributed systems","container orchestration","cloud-native architectures","real-time systems","streaming logs","high-throughput data pipelines","Kubernetes","Prometheus","Datadog","gRPC"],"datePosted":"2026-04-17T12:32:02.355Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"TypeScript, Python, Postgres, Next.js, serverless, backend APIs, frontend development, observability tooling, distributed systems, container orchestration, cloud-native architectures, real-time systems, streaming logs, high-throughput data pipelines, Kubernetes, Prometheus, Datadog, gRPC","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":150000,"maxValue":230000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_981e6f7e-ede"},"title":"Production Readiness Lead - Game Developer Experience (GDX)","description":"<p>Electronic Arts creates next-level entertainment experiences that inspire players and fans around the world. Here, everyone is part of the story. Part of a community that connects across the globe. A place where creativity thrives, new perspectives are invited, and ideas matter. A team where everyone makes play happen.</p>\n<p>The Electronic Arts Information Technology (EAIT) organization works as a global team to empower EA&#39;s employees and business operations to be creative, collaborative, and productive. As a digital entertainment company, EA&#39;s enterprise technology needs are diverse and span across game development, workforce collaboration, marketing, publishing, player experience, security, and corporate activities. Our mission is to bring creative technology services to each of these areas, working across the company to ensure better play.</p>\n<p>As part of the Game Developer Experience (GDX) organization, the Engineering and Operations team is building a structured, scalable operational lifecycle across GameKit. In this role, you will play a central part in shaping how operational excellence is embedded into product delivery from concept through launch and beyond.</p>\n<p>As the Product Readiness Lead, you will integrate operational standards directly into the Product Development Lifecycle (PDLC), ensuring that reliability, scalability, and support readiness are designed in, not added later. You will collaborate closely with Engineering, Product Management, Site Reliability Engineering (SRE), Customer Support, and Operations partners to help teams meet clearly defined expectations for observability, automation, documentation, and launch readiness.</p>\n<p>This is a hybrid role (3 days per week in the office) based in Vancouver, reporting to the Director of Operations and partnering broadly across the GameKit ecosystem to establish a repeatable, sustainable operational lifecycle model.</p>\n<p>Responsibilities:</p>\n<ul>\n<li>Enable a digital-first, automation-forward support strategy by ensuring products are designed with operational readiness from Day 0.</li>\n<li>Partner with product and engineering teams to embed automation, AI-enabled support capabilities, and agentic workflows into product designs before launch.</li>\n<li>Define and integrate standards for alerting, instrumentation, observability, runbooks, and workflow automation into the PDLC.</li>\n<li>Establish lifecycle checkpoints and measurable readiness indicators (e.g., MTTR, signal coverage, operational maturity).</li>\n<li>Lead structured operational readiness reviews and provide clear, actionable recommendations to support successful launches.</li>\n<li>Be the connector across teams, aligning technical and operational partners around shared reliability and support outcomes.</li>\n</ul>\n<p>Qualifications:</p>\n<ul>\n<li>8+ years of experience in Operations, Site Reliability Engineering (SRE), Technical Program Management, Platform Operations, or a related discipline.</li>\n<li>Demonstrated hands-on experience with Service Level Agreements (SLAs)/Service Level Objectives(SLOs), incident management, observability tooling, dashboards, and automation systems in large-scale, multi-product environments.</li>\n<li>Strong collaboration and influence skills, with the ability to work effectively across engineering, product, and operational teams.</li>\n<li>Experience driving operational consistency and continuous improvement in dynamic, technology-driven organizations.</li>\n</ul>\n<p>Pay Transparency - North America</p>\n<p>COMPENSATION AND BENEFITS</p>\n<p>The ranges listed below are what EA in good faith expects to pay applicants for this role in these locations at the time of this posting. If you reside in a different location, a recruiter will advise on the applicable range and benefits. Pay offered will be determined based on a number of relevant business and candidate factors (e.g. education, qualifications, certifications, experience, skills, geographic location, or business needs).</p>\n<p>PAY RANGES</p>\n<p>• British Columbia (depending on location e.g. Vancouver vs. Victoria) $130,800 - $183,000 CAD</p>\n<p>Pay is just one part of the overall compensation at EA.</p>\n<p>For Canada, we offer a package of benefits including vacation (3 weeks per year to start), 10 days per year of sick time, paid top-up to EI/QPIP benefits up to 100% of base salary when you welcome a new child (12 weeks for maternity, and 4 weeks for parental/adoption leave), extended health/dental/vision coverage, life insurance, disability insurance, retirement plan to regular full-time employees. Certain roles may also be eligible for bonus and equity.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_981e6f7e-ede","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Electronic Arts","sameAs":"https://jobs.ea.com","logo":"https://logos.yubhub.co/jobs.ea.com.png"},"x-apply-url":"https://jobs.ea.com/en_US/careers/JobDetail/Production-Readiness-Lead-Game-Developer-Experience-GDX/212677","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$130,800 - $183,000 CAD","x-skills-required":["Service Level Agreements (SLAs)","Service Level Objectives (SLOs)","incident management","observability tooling","dashboards","automation systems"],"x-skills-preferred":[],"datePosted":"2026-03-10T12:18:03.330Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Vancouver"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Service Level Agreements (SLAs), Service Level Objectives (SLOs), incident management, observability tooling, dashboards, automation systems","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":130800,"maxValue":183000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_817bc7b0-6a7"},"title":"Test Engineer (Product Integration)","description":"<p>Our Product Integration Test Team is looking for 2 Test Engineers (an Intermediate and a Senior) to expand the breadth and depth of testing Vista&#39;s cutting-edge software, across multiple technologies.</p>\n<p>This is a unique role that bridges the gap between QA and DevOps. You&#39;ll help:</p>\n<ul>\n<li>Coordinate and support teams performing system test functions</li>\n<li>Design and execute test solutions that cut across squads and technologies</li>\n<li>Create and execute Integration Test Plans</li>\n<li>Contribute to Integration test automation suites</li>\n<li>Execute cloud-based multi-server deployments using tools like Octopus</li>\n<li>Monitor product observability using tools such as DataDog</li>\n<li>Perform static analysis to identify risks in missions</li>\n</ul>\n<p>About you</p>\n<ul>\n<li>A quality champion with strong ownership</li>\n<li>Expertise in manual and automation testing across multiple technologies</li>\n<li>Strong communication skills</li>\n<li>A collaborative mindset, able to build and maintain good professional relationships across the company</li>\n<li>Proven experience with automation tools, including exposure to C#/.NET, Selenium, etc...</li>\n<li>Basic understanding of SQL</li>\n<li>Knowledge of defect tracking and test management systems</li>\n<li>Familiarity with observability tooling such as Prometheus, DataDog, etc...</li>\n<li>Basic project management skills</li>\n<li>Exposure to DevOps in a cloud environment will be advantageous</li>\n<li>Curiosity, passion, and energy</li>\n</ul>\n<p>This is a hybrid role with a home / office-based split, requiring regular (1-2 days per week) attendance in the Cape Town office. We are only considering applicants with an existing right to work in South Africa, without the need for employer sponsorship.</p>\n<p>Benefits</p>\n<ul>\n<li>Rest &amp; Relax Fridays</li>\n<li>Finish at lunch time on Friday but get paid for the full day</li>\n<li>Annual volunteer day</li>\n<li>Employee Rewards and Benefits with Perkbox</li>\n<li>Compulsory Defined Contribution Company Pension Scheme</li>\n<li>Medical Insurance / Medical Aid (after qualifying period)</li>\n<li>Employee Assistance Programme Service</li>\n<li>Paid Sick leave</li>\n<li>5 days bereavement leave per year</li>\n<li>On-Site Breakfast Bar</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_817bc7b0-6a7","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Vista","sameAs":"https://apply.workable.com","logo":"https://logos.yubhub.co/j.com.png"},"x-apply-url":"https://apply.workable.com/j/532E13FABC","x-work-arrangement":"hybrid","x-experience-level":"mid|senior","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["manual and automation testing","C#/.NET","Selenium","SQL","defect tracking and test management systems","observability tooling","project management"],"x-skills-preferred":["DevOps in a cloud environment"],"datePosted":"2026-03-09T16:20:02.732Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Cape Town"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"manual and automation testing, C#/.NET, Selenium, SQL, defect tracking and test management systems, observability tooling, project management, DevOps in a cloud environment"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_dd79f871-8f1"},"title":"Technical Program Manager, Infrastructure","description":"<p><strong>About the Role</strong></p>\n<p>Anthropic&#39;s Infrastructure organisation is the engine that powers our mission. Every breakthrough in AI safety research and every interaction users have with Claude depends on the systems we build and operate: massive clusters for training frontier models, production infrastructure serving millions of users reliably, and developer platforms that help engineers move fast without breaking things.</p>\n<p>As a Technical Program Manager for Infrastructure, you&#39;ll work across multiple infrastructure domains to coordinate complex programs that have broad organisational impact. You&#39;ll be solving novel scaling challenges at the frontier of what&#39;s possible, all while maintaining the security and reliability our mission demands.</p>\n<p>This role is ideal for someone who thrives in ambiguity and believes their job is to make everyone around them more effective. You&#39;ll partner closely with engineering leadership to drive strategic initiatives while ensuring seamless coordination between research, engineering, and product teams.</p>\n<p><strong>What you&#39;ll do:</strong></p>\n<p><strong>Developer Productivity &amp; Tooling</strong></p>\n<ul>\n<li>Drive cross-functional programs to improve developer environments, CI/CD infrastructure, and release processes that enable rapid innovation while maintaining high security standards</li>\n</ul>\n<ul>\n<li>Coordinate large-scale migrations and platform modernization efforts across engineering teams</li>\n</ul>\n<ul>\n<li>Partner with teams to measure and improve developer productivity metrics, identifying bottlenecks and driving systematic improvements</li>\n</ul>\n<ul>\n<li>Lead initiatives to integrate AI tools into development workflows, helping Anthropic be at the forefront of AI-assisted research and engineering</li>\n</ul>\n<p><strong>Infrastructure Reliability &amp; Operations</strong></p>\n<ul>\n<li>Drive programs to establish and achieve reliability targets across training infrastructure and production services</li>\n</ul>\n<ul>\n<li>Coordinate incident response improvements, post-mortem processes, and on-call rotations that help teams operate effectively</li>\n</ul>\n<ul>\n<li>Establish metrics and dashboards to track infrastructure health, capacity utilisation, and operational excellence</li>\n</ul>\n<p><strong>Cross-functional Coordination</strong></p>\n<ul>\n<li>Serve as the critical bridge between infrastructure teams, research, and product, translating technical complexities into clear updates for a variety of audiences</li>\n</ul>\n<ul>\n<li>Consult with stakeholders to deeply understand infrastructure, data, and compute needs, identifying solutions to support frontier research and product development</li>\n</ul>\n<ul>\n<li>Drive alignment on priorities and timelines across teams with competing constraints</li>\n</ul>\n<p><strong>You May Be a Good Fit If You</strong></p>\n<ul>\n<li>Have 5+ years of technical program management experience, with a track record of successfully delivering complex infrastructure programs in ML/AI systems or large-scale distributed systems</li>\n</ul>\n<ul>\n<li>Have deep technical understanding of infrastructure systems—enough to engage substantively with engineers, identify technical risks, and add value beyond project tracking</li>\n</ul>\n<ul>\n<li>Excel at creating structure and processes in ambiguous environments, bringing clarity to complex cross-team initiatives</li>\n</ul>\n<ul>\n<li>Have strong stakeholder management skills and can build trust with both technical and non-technical partners</li>\n</ul>\n<ul>\n<li>Are comfortable navigating competing priorities and using data to drive technical decisions</li>\n</ul>\n<ul>\n<li>Have experience with developer productivity initiatives, CI/CD systems, or infrastructure scaling</li>\n</ul>\n<ul>\n<li>Thrive in fast-paced environments and can balance strategic planning with tactical execution</li>\n</ul>\n<ul>\n<li>Are obsessed with reliability, scalability, security, and continuous improvement</li>\n</ul>\n<ul>\n<li>Have a passion for supporting internal partners like research to understand their unique needs</li>\n</ul>\n<ul>\n<li>Are passionate about AI infrastructure and understand the unique challenges of building and operating systems at frontier scale</li>\n</ul>\n<ul>\n<li>Experience with Kubernetes, cloud platforms (AWS, GCP, Azure), and ML infrastructure (GPU/TPU/Trainium clusters)</li>\n</ul>\n<ul>\n<li>Background working with research teams and translating their needs into concrete technical requirements</li>\n</ul>\n<ul>\n<li>Experience driving adoption of AI tools to improve engineering productivity</li>\n</ul>\n<ul>\n<li>Familiarity with observability tooling and practices</li>\n</ul>\n<p><strong>Deadline to Apply:</strong></p>\n<p>None, applications will be received on a rolling basis.</p>\n<p><strong>Logistics</strong></p>\n<p><strong>Education requirements:</strong></p>\n<p>We require at least a Bachelor&#39;s degree in a related field or equivalent experience.</p>\n<p><strong>Location-based hybrid policy:</strong></p>\n<p>Currently, we expect all staff to be in one of our offices at least 25% of the time. However, some roles may require more time in our offices.</p>\n<p><strong>Visa sponsorship:</strong></p>\n<p>We do sponsor visas! However, we aren&#39;t able to successfully sponsor visas for every role and every candidate. But if we make you an offer, we will make every reasonable effort to get you a visa, and we retain an immigration lawyer to help with this.</p>\n<p><strong>We encourage you to apply even if you do not believe you meet every single qualification.</strong></p>\n<p>Not all strong candidates will meet every single qualification as listed. Research shows that people who identify as being from underrepresented groups are more prone to experiencing imposter syndrome and doubting the strength of their candidacy, so we urge you not to exclude yourself prematurely and to submit an application.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_dd79f871-8f1","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://job-boards.greenhouse.io","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5111783008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$290,000 - $365,000 USD","x-skills-required":["Technical Program Management","Infrastructure","ML/AI systems","Distributed systems","Kubernetes","Cloud platforms","ML infrastructure"],"x-skills-preferred":["Developer productivity initiatives","CI/CD systems","Infrastructure scaling","Observability tooling and practices"],"datePosted":"2026-03-08T13:49:30.383Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY | Seattle, WA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Technical Program Management, Infrastructure, ML/AI systems, Distributed systems, Kubernetes, Cloud platforms, ML infrastructure, Developer productivity initiatives, CI/CD systems, Infrastructure scaling, Observability tooling and practices","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":290000,"maxValue":365000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_1d808aa6-75a"},"title":"Full Stack Engineer, Fleet Scheduling","description":"<p><strong>Full Stack Engineer, Fleet Scheduling</strong></p>\n<p><strong>Location</strong></p>\n<p>San Francisco</p>\n<p><strong>Employment Type</strong></p>\n<p>Full time</p>\n<p><strong>Department</strong></p>\n<p>Scaling</p>\n<p><strong>Compensation</strong></p>\n<ul>\n<li>$230K – $490K • Offers Equity</li>\n</ul>\n<p>The base pay offered may vary depending on multiple individualized factors, including market location, job-related knowledge, skills, and experience. If the role is non-exempt, overtime pay will be provided consistent with applicable laws. In addition to the salary range listed above, total compensation also includes generous equity, performance-related bonus(es) for eligible employees, and the following benefits.</p>\n<ul>\n<li>Medical, dental, and vision insurance for you and your family, with employer contributions to Health Savings Accounts</li>\n</ul>\n<ul>\n<li>Pre-tax accounts for Health FSA, Dependent Care FSA, and commuter expenses (parking and transit)</li>\n</ul>\n<ul>\n<li>401(k) retirement plan with employer match</li>\n</ul>\n<ul>\n<li>Paid parental leave (up to 24 weeks for birth parents and 20 weeks for non-birthing parents), plus paid medical and caregiver leave (up to 8 weeks)</li>\n</ul>\n<ul>\n<li>Paid time off: flexible PTO for exempt employees and up to 15 days annually for non-exempt employees</li>\n</ul>\n<ul>\n<li>13+ paid company holidays, and multiple paid coordinated company office closures throughout the year for focus and recharge, plus paid sick or safe time (1 hour per 30 hours worked, or more, as required by applicable state or local law)</li>\n</ul>\n<ul>\n<li>Mental health and wellness support</li>\n</ul>\n<ul>\n<li>Employer-paid basic life and disability coverage</li>\n</ul>\n<ul>\n<li>Annual learning and development stipend to fuel your professional growth</li>\n</ul>\n<ul>\n<li>Daily meals in our offices, and meal delivery credits as eligible</li>\n</ul>\n<ul>\n<li>Relocation support for eligible employees</li>\n</ul>\n<ul>\n<li>Additional taxable fringe benefits, such as charitable donation matching and wellness stipends, may also be provided.</li>\n</ul>\n<p>More details about our benefits are available to candidates during the hiring process.</p>\n<p>This role is at-will and OpenAI reserves the right to modify base pay and other compensation components at any time based on individual performance, team or company results, or market conditions.</p>\n<p><strong>About the Team</strong> Full Stack engineers within the Fleet Scheduling team are dedicated to building intuitive and scalable interfaces that empower researchers to efficiently manage AI workloads across some of the largest supercomputers in the world. Our focus is on developing robust, high-performance systems that provide real-time insights, resource tracking, and seamless interaction with complex infrastructure. We aim to optimize resource allocation, minimize operational overhead, and create user-friendly tools that enhance researcher productivity and system transparency.</p>\n<p><strong>About the Role</strong> You will design, develop, and operate web-based systems that provide a powerful and intuitive interface to OpenAI’s supercomputing clusters. You will collaborate closely with researcher, product and infrastructure teams to deliver scalable solutions that enable seamless monitoring, job scheduling, and resource management. This is an opportunity to work at the cutting edge of AI infrastructure, designing tools that scale to exascale workloads while maintaining usability and performance.</p>\n<p>This role is based in <strong>San Francisco, CA.</strong> We use a hybrid work model of <strong>3 days in the office per week</strong> and offer relocation assistance to new employees.</p>\n<p><strong>In this role, you will:</strong></p>\n<ul>\n<li>Design and develop full-stack web applications to track, monitor, and manage large-scale AI workloads in real time.</li>\n</ul>\n<ul>\n<li>Collaborate with researchers and infrastructure teams to translate complex operational needs into intuitive UIs and scalable backends.</li>\n</ul>\n<ul>\n<li>Build data visualization tools (e.g., Gantt charts, dashboards) to provide insights into job scheduling and resource allocation.</li>\n</ul>\n<ul>\n<li>Optimize backend services to handle massive data throughput while ensuring low-latency performance and high availability.</li>\n</ul>\n<ul>\n<li>Implement frontend components that provide seamless interactions with scheduling, storage, and compute systems.</li>\n</ul>\n<ul>\n<li>Ensure system security, reliability, and scalability across globally distributed supercomputing infrastructure.</li>\n</ul>\n<p><strong>You might thrive in this role if you:</strong></p>\n<ul>\n<li>Significant experience in full-stack development, with expertise in modern frontend frameworks (React, Vue, or Angular) and backend technologies (Python, Go, or Node.js).</li>\n</ul>\n<ul>\n<li>Experienced in building scalable, high-performance web applications for complex distributed systems.</li>\n</ul>\n<ul>\n<li>Strong understanding of RESTful and GraphQL APIs, distributed databases, and cloud infrastructure (especially Azure).</li>\n</ul>\n<ul>\n<li>Execution-focused with a keen eye for usability, performance, and scalability in enterprise-scale systems.</li>\n</ul>\n<ul>\n<li>Comfortable working in fast-paced, highly collaborative environments with tight timelines and evolving priorities.</li>\n</ul>\n<p><strong>Bonus points if you:</strong></p>\n<ul>\n<li>Have experience working with Kubernetes, Docker, and cloud-native application deployment.</li>\n</ul>\n<ul>\n<li>Understand AI/ML workload scheduling and orchestration challenges.</li>\n</ul>\n<ul>\n<li>Have experience with real-time data processing, visualization libraries, and observability tooling.</li>\n</ul>\n<p><strong>About OpenAI</strong> OpenAI is an AI research and deployment company dedicated to ensuring that general-purpose artificial intelligence benefits all of humanity. We push the boundaries of the capabilities of AI systems and seek to safely deploy them to the world through our products. AI is an extremely powerful tool that must be created with safety and human needs at its core, and to achieve our mission, we must encompass and value the many different perspectives, voices, and experiences that form the full spectrum of humanity.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_1d808aa6-75a","directApply":true,"hiringOrganization":{"@type":"Organization","name":"OpenAI","sameAs":"https://jobs.ashbyhq.com","logo":"https://logos.yubhub.co/openai.com.png"},"x-apply-url":"https://jobs.ashbyhq.com/openai/9d11e1d8-af1d-413b-873f-d8fac2bdee99","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$230K – $490K • Offers Equity","x-skills-required":["full-stack development","modern frontend frameworks","backend technologies","RESTful and GraphQL APIs","distributed databases","cloud infrastructure","Kubernetes","Docker","cloud-native application deployment","AI/ML workload scheduling","orchestration challenges","real-time data processing","visualization libraries","observability tooling"],"x-skills-preferred":["React","Vue","Angular","Python","Go","Node.js","Azure","Gantt charts","dashboards"],"datePosted":"2026-03-06T18:40:33.689Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"full-stack development, modern frontend frameworks, backend technologies, RESTful and GraphQL APIs, distributed databases, cloud infrastructure, Kubernetes, Docker, cloud-native application deployment, AI/ML workload scheduling, orchestration challenges, real-time data processing, visualization libraries, observability tooling, React, Vue, Angular, Python, Go, Node.js, Azure, Gantt charts, dashboards","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":230000,"maxValue":490000,"unitText":"YEAR"}}}]}