{"version":"0.1","company":{"name":"YubHub","url":"https://yubhub.co","jobsUrl":"https://yubhub.co/jobs/skill/distributed-storage-systems"},"x-facet":{"type":"skill","slug":"distributed-storage-systems","display":"Distributed Storage Systems","count":8},"x-feed-size-limit":100,"x-feed-sort":"enriched_at desc","x-feed-notice":"This feed contains at most 100 jobs (the most recently enriched). For the full corpus, use the paginated /stats/by-facet endpoint or /search.","x-generator":"yubhub-xml-generator","x-rights":"Free to redistribute with attribution: \"Data by YubHub (https://yubhub.co)\"","x-schema":"Each entry in `jobs` follows https://schema.org/JobPosting. YubHub-native raw fields carry `x-` prefix.","jobs":[{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_db7b7a31-c41"},"title":"Staff Software Engineer, Storage","description":"<p>We&#39;re seeking a Staff Software Engineer to join our BE Platform team. As a Staff Software Engineer, you will play a key role in building and evolving control and data planes, improving underlying systems, and writing software that implements critical workflows to automate and enhance the operation of our large-scale storage infrastructure.</p>\n<p>Key responsibilities include:</p>\n<ul>\n<li>Design, write, and deliver software to improve the availability, scalability, latency, and efficiency of Reddit&#39;s products in Go, C++, and sometimes Python.</li>\n<li>Dive deep into the codebase of supported storage systems to understand system internals.</li>\n<li>Make system-level improvements, enhancements, and implement complex code modifications.</li>\n<li>Engage actively with the open-source community to implement and upstream changes to the OSS codebase.</li>\n<li>Contribute to the design and implementation of high-performance, large-scale distributed storage systems to power various use cases at Reddit.</li>\n<li>Collaborate closely with engineering teams and 
stakeholders to integrate storage capabilities into broader storage infrastructure and use cases across Reddit.</li>\n</ul>\n<p>Requirements include:</p>\n<ul>\n<li>7+ years of experience building internet-scale software, preferably with a focus on machine learning storage infrastructure.</li>\n<li>Software development experience in one or more general-purpose programming languages; Golang, Python, C++, Java.</li>\n<li>Hands-on experience implementing features, optimizations, and bug fixes to distributed storage systems.</li>\n<li>Experience contributing code improvements, features, and bug fixes to open-source (OSS) projects.</li>\n<li>Prior experience with operating a large-scale critical infrastructure system with a focus on automation and workflow development is a plus, especially in a role where they were required to be on call.</li>\n<li>Excellent communication skills to collaborate with a service-oriented team and company.</li>\n</ul>\n<p>Benefits include:</p>\n<ul>\n<li>Comprehensive healthcare benefits and income replacement programs.</li>\n<li>401k match.</li>\n<li>Family planning support.</li>\n<li>Gender-affirming care.</li>\n<li>Mental health and coaching benefits.</li>\n<li>Flexible vacation and Reddit global days off.</li>\n<li>Generous paid parental leave.</li>\n<li>Paid volunteer time off.</li>\n</ul>\n<p>This job posting may span more than one career level. 
In addition to base salary, this job is eligible to receive equity in the form of restricted stock units, and depending on the position offered, it may also be eligible to receive a commission.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_db7b7a31-c41","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Reddit","sameAs":"https://www.redditinc.com","logo":"https://logos.yubhub.co/redditinc.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/reddit/jobs/7511698","x-work-arrangement":"remote","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$217,000-$303,900 USD","x-skills-required":["Go","C++","Python","Golang","Java","Distributed storage systems","Open-source community engagement","System-level improvements","High-performance storage systems"],"x-skills-preferred":[],"datePosted":"2026-04-18T15:57:43.541Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Seattle, WA"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Go, C++, Python, Golang, Java, Distributed storage systems, Open-source community engagement, System-level improvements, High-performance storage systems","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":217000,"maxValue":303900,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_3a40dbfa-d00"},"title":"Staff Software Engineer, Non-Human Identity","description":"<p>Secure Every Identity, from AI to Human</p>\n<p>Identity is the key to unlocking the potential of AI. 
Okta secures AI by building the trusted, neutral infrastructure that enables organisations to safely embrace this new era.</p>\n<p>We are looking for builders and owners who operate with speed and urgency and execute with excellence. This is an opportunity to do career-defining work. We&#39;re all in on this mission. If you are too, let&#39;s talk.</p>\n<p>The Team</p>\n<p>The Okta Privileged Access Management (PAM) team is building the future of identity for machines, services, and applications. We are seeking a world-class Staff Engineer to help us architect and build the high-performance core of our non-human identity platform.</p>\n<p>Your work, in close collaboration with our principal engineers and architects, will be the foundation of our strategy for managing privileged access in the modern enterprise. If you are a systems programmer who thrives on influencing the design of high-performance, concurrent, and resilient security software, this is the role for you.</p>\n<p>What you’ll be doing</p>\n<ul>\n<li>Contribute to Core Architecture:</li>\n<li>Partner with principal engineers and architects to design and implement a low-latency, high-throughput secrets engine for non-human identities</li>\n<li>Solve for Massive Scale:</li>\n<li>Write highly concurrent, performance-critical code capable of handling millions of machine-to-machine authentication and authorization requests</li>\n<li>Shape Technical Strategy:</li>\n<li>Play a key role in defining the long-term technical roadmap for scalability and performance, ensuring our platform can meet the demands of the largest enterprises</li>\n<li>Mentor and Elevate:</li>\n<li>As a senior engineer on the team, you will work with junior engineers to help them advance their SDLC expertise.</li>\n<li>On-Call:</li>\n<li>Participate in the rotational on-call activities with SRE and product development team</li>\n</ul>\n<p>What you’ll bring to the role</p>\n<ul>\n<li>Required Experience:</li>\n<li>8+ years of professional 
software engineering experience, with a heavy focus on backend or systems-level development</li>\n<li>Bachelor’s or Master’s degree in Computer Science, or equivalent practical experience</li>\n<li>Core Technical Expertise:</li>\n<li>Deep, hands-on expertise in multi-platform Go development and building high-performance, concurrent applications</li>\n<li>Experience designing or operating distributed systems</li>\n<li>Experience with secure systems (authn/authz, encryption, TLS, token handling, PKI, CAs, diagnosing TLS issues)</li>\n<li>Deep expertise in distributed storage systems, with a focus on replication, backup, and restore, and data management. (Postgres, etc.)</li>\n<li>Direct experience designing, building, or contributing to a secrets management, service mesh, or machine identity platform</li>\n<li>Expert-level at ergonomic API design (gRPC/openAPI), and building for reliability at scale</li>\n<li>Deep knowledge of cloud-native infrastructure</li>\n<li>Key Attributes:</li>\n<li>You are driven by the challenge of optimizing systems for performance, latency, and throughput, with a proven ability to diagnose complex, multi-system issues</li>\n<li>You have a proven track record of making significant contributions to the architecture of complex, mission-critical systems</li>\n<li>You thrive in an environment where you can focus on deep technical problems</li>\n<li>Bonus Points:</li>\n<li>Experience at a leading Cybersecurity or Infrastructure-as-Code company</li>\n<li>Contributions to open-source projects in the identity, security, or infrastructure space</li>\n</ul>\n<p>And extra credit if you have experience in any of the following!</p>\n<ul>\n<li>Deep expertise in backend systems engineering</li>\n<li>Experience building and scaling beyond standard three-tier monolithic architectures, with a focus on modern distributed systems</li>\n<li>Have worked on projects with complex, established systems</li>\n<li>Possess significant, hands-on experience in a 
Linux/Unix environment</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_3a40dbfa-d00","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Okta","sameAs":"https://www.okta.com/","logo":"https://logos.yubhub.co/okta.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/okta/jobs/7674829","x-work-arrangement":"hybrid","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$194,000-$267,000 USD","x-skills-required":["Go development","Distributed systems","Secure systems","Distributed storage systems","Secrets management","Service mesh","Machine identity platform","Ergonomic API design","Cloud-native infrastructure"],"x-skills-preferred":[],"datePosted":"2026-04-18T15:47:43.090Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, California"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Go development, Distributed systems, Secure systems, Distributed storage systems, Secrets management, Service mesh, Machine identity platform, Ergonomic API design, Cloud-native infrastructure","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":194000,"maxValue":267000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_7fef2a53-3e7"},"title":"Staff Software Engineer, Storage","description":"<p>We&#39;re seeking a Staff Software Engineer to join our BE Platform team. 
As a Staff Software Engineer, you will play a key role in building and evolving control and data planes, improving underlying systems, and writing software that implements critical workflows to automate and enhance the operation of our large-scale storage infrastructure.</p>\n<p>Your responsibilities will include designing, writing, and delivering software to improve the availability, scalability, latency, and efficiency of Reddit&#39;s products in Go, C++, and sometimes Python. You will dive deep into the codebase of supported storage systems to understand system internals, make system-level improvements, enhancements, and implement complex code modifications. You will engage actively with the open-source community to implement and upstream changes to the OSS codebase.</p>\n<p>You will contribute to the design and implementation of high-performance, large-scale distributed storage systems to power various use cases at Reddit. You will collaborate closely with engineering teams and stakeholders to integrate storage capabilities into broader storage infrastructure and use cases across Reddit. You will mentor and guide other engineers on how to design, build, and evangelize vector storage services across Reddit.</p>\n<p>The ideal candidate will have 7+ years of experience building internet-scale software, preferably with a focus on machine learning storage infrastructure. They will have software development experience in one or more general-purpose programming languages, such as Golang, Python, C++, or Java. They will have hands-on experience implementing features, optimizations, and bug fixes to distributed storage systems. They will have experience contributing code improvements, features, and bug fixes to open-source projects.</p>\n<p>Additionally, the ideal candidate will have excellent communication skills to collaborate with a service-oriented team and company. 
They will be able to work effectively in a fast-paced environment and prioritize tasks to meet deadlines.</p>\n<p>We offer a comprehensive benefits package, including comprehensive healthcare benefits, income replacement programs, 401(k) match, family planning support, gender-affirming care, mental health and coaching benefits, flexible vacation, Reddit global days off, generous paid parental leave, and paid volunteer time off.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_7fef2a53-3e7","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Reddit","sameAs":"https://www.redditinc.com","logo":"https://logos.yubhub.co/redditinc.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/reddit/jobs/7511679","x-work-arrangement":"remote","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$217,000 - $303,900 USD","x-skills-required":["Go","C++","Python","mongoose","Golang","Java","Distributed storage systems","High-performance computing","Large-scale systems"],"x-skills-preferred":[],"datePosted":"2026-04-18T15:47:00.406Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Remote - United States"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Go, C++, Python, mongoose, Golang, Java, Distributed storage systems, High-performance computing, Large-scale systems","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":217000,"maxValue":303900,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_18ae1499-b22"},"title":"Research Engineer, Discovery","description":"<p>As a Research Engineer on our team, you will work end-to-end across the whole model stack, 
identifying and addressing key infra blockers on the path to scientific AGI. Strong candidates should have familiarity with elements of language model training, evaluation, and inference and eagerness to quickly dive and get up to speed in areas they are not yet an expert on.</p>\n<p>Responsibilities:</p>\n<ul>\n<li>Design and implement large-scale infrastructure systems to support AI scientist training, evaluation, and deployment across distributed environments</li>\n<li>Identify and resolve infrastructure bottlenecks impeding progress toward scientific capabilities</li>\n<li>Develop robust and reliable evaluation frameworks for measuring progress towards scientific AGI</li>\n<li>Build scalable and performant VM/sandboxing/container architectures to safely execute long-horizon AI tasks and scientific workflows</li>\n<li>Collaborate to translate experimental requirements into production-ready infrastructure</li>\n<li>Develop large scale data pipelines to handle advanced language model training requirements</li>\n<li>Optimize large scale training and inference pipelines for stable and efficient reinforcement learning</li>\n</ul>\n<p>You may be a good fit if you:</p>\n<ul>\n<li>Have 6+ years of highly-relevant experience in infrastructure engineering with demonstrated expertise in large-scale distributed systems</li>\n<li>Are a strong communicator and enjoy working collaboratively</li>\n<li>Possess deep knowledge of performance optimization techniques and system architectures for high-throughput ML workloads</li>\n<li>Have experience with containerization technologies (Docker, Kubernetes) and orchestration at scale</li>\n<li>Have proven track record of building large-scale data pipelines and distributed storage systems</li>\n<li>Excel at diagnosing and resolving complex infrastructure challenges in production environments</li>\n<li>Can work effectively across the full ML stack from data pipelines to performance optimization</li>\n<li>Have experience collaborating 
with other researchers to scale experimental ideas</li>\n<li>Thrive in fast-paced environments and can rapidly iterate from experimentation to production</li>\n</ul>\n<p>Strong candidates may also have:</p>\n<ul>\n<li>Experience with language model training infrastructure and distributed ML frameworks (PyTorch, JAX, etc.)</li>\n<li>Background in building infrastructure for AI research labs or large-scale ML organizations</li>\n<li>Knowledge of GPU/TPU architectures and language model inference optimization</li>\n<li>Experience with cloud platforms (AWS, GCP) at enterprise scale</li>\n<li>Familiarity with VM and container orchestration</li>\n<li>Experience with workflow orchestration tools and experiment management systems</li>\n<li>History working with large scale reinforcement learning</li>\n<li>Comfort with large scale data pipelines (Beam, Spark, Dask, …)</li>\n</ul>\n<p>The annual compensation range for this role is $350,000-$850,000 USD.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_18ae1499-b22","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/4669581008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$350,000-$850,000 USD","x-skills-required":["large-scale distributed systems","containerization technologies (Docker, Kubernetes)","performance optimization techniques","system architectures for high-throughput ML workloads","data pipelines","distributed storage systems","ML frameworks (PyTorch, JAX, etc.)","GPU/TPU architectures","cloud platforms (AWS, GCP)","VM and container orchestration","workflow orchestration tools","experiment management systems","reinforcement learning","large scale data 
pipelines (Beam, Spark, Dask, …)"],"x-skills-preferred":[],"datePosted":"2026-04-18T15:41:42.408Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"large-scale distributed systems, containerization technologies (Docker, Kubernetes), performance optimization techniques, system architectures for high-throughput ML workloads, data pipelines, distributed storage systems, ML frameworks (PyTorch, JAX, etc.), GPU/TPU architectures, cloud platforms (AWS, GCP), VM and container orchestration, workflow orchestration tools, experiment management systems, reinforcement learning, large scale data pipelines (Beam, Spark, Dask, …)","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":350000,"maxValue":850000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_8aa2a018-294"},"title":"Sr. Staff Software Engineer - Distributed System Development","description":"<p>As a Sr. 
Staff Software Engineer – Distributed Systems at Alluxio, you will lead the end-to-end architecture and technical evolution of our next-generation distributed data platform.</p>\n<p>You will drive system-level design decisions that enable Alluxio to scale to thousands of nodes and exabytes of data, while maintaining performance, reliability, and simplicity for users.</p>\n<p>In this role, you will operate as a technical architect and hands-on engineering leader, partnering closely with engineering teams and product management to translate complex requirements into scalable distributed system designs.</p>\n<p><strong>Responsibilities</strong></p>\n<ul>\n<li>Lead the end-to-end architecture and design of large-scale distributed systems powering the Alluxio platform.</li>\n<li>Drive technical strategy and architectural direction across multiple teams and components.</li>\n<li>Design systems that support high scalability, fault tolerance, performance optimization, and data durability.</li>\n<li>Provide hands-on development and deep technical guidance in critical areas of the system.</li>\n<li>Lead complex system design reviews and mentor senior engineers on distributed systems design.</li>\n<li>Identify and resolve system-level performance bottlenecks and reliability challenges.</li>\n<li>Collaborate with product management and engineering leadership to translate product goals into technical solutions.</li>\n<li>Influence the broader technical ecosystem through open-source contributions and architectural thought leadership.</li>\n</ul>\n<p><strong>Requirements</strong></p>\n<ul>\n<li>Master or BS degree in Computer Science or related technical field, or equivalent practical experience.</li>\n<li>Proven experience of 2+ years in a technical leadership or architect role, driving system-level design and guiding engineering teams.</li>\n<li>Strong hands-on software development experience in one or more general-purpose programming languages, including but not limited to 
Java, C/C++, or Go.</li>\n<li>Deep architecting expertise in at least two of the following areas:</li>\n<li>Distributed and parallel systems</li>\n<li>Distributed storage systems</li>\n<li>Architecting large-scale software systems</li>\n<li>Demonstrated ability to design and implement high-quality, stable, and scalable end-to-end system architectures in production environments.</li>\n<li>Strong analytical thinking and complex problem-solving skills.</li>\n<li>Excellent communication skills and ability to influence technical direction across teams.</li>\n</ul>\n<p><strong>Nice to Have</strong></p>\n<ul>\n<li>PhD in Computer Science, Distributed Systems, or related fields.</li>\n<li>Deep understanding of consensus algorithms, storage engines, or large-scale data systems.</li>\n<li>Experience building or operating cloud-native infrastructure platforms.</li>\n<li>Experience contributing to or maintaining open-source distributed systems projects.</li>\n<li>Track record of designing systems that operate at massive scale (thousands of nodes or higher).</li>\n<li>Passion for building high-performance infrastructure software.</li>\n<li>Contributions to Alluxio open-source community.</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_8aa2a018-294","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Alluxio","sameAs":"https://www.alluxio.com/","logo":"https://logos.yubhub.co/alluxio.com.png"},"x-apply-url":"https://jobs.lever.co/alluxio/f997ed6c-941f-4873-b308-a1f33b6b78ef","x-work-arrangement":"onsite","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["Java","C/C++","Go","Distributed and parallel systems","Distributed storage systems","Architecting large-scale software 
systems"],"x-skills-preferred":[],"datePosted":"2026-04-17T12:23:20.395Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Beijing"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Java, C/C++, Go, Distributed and parallel systems, Distributed storage systems, Architecting large-scale software systems"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_94b47f45-76d"},"title":"Distributed Systems Engineer","description":"<p>Are you interested in joining a group of highly talented engineers working on an open source project that is solving challenging problems across big data analytics, machine learning and artificial intelligence?</p>\n<p>As a distributed systems engineer at Alluxio, you will be responsible for evolving the state-of-the-art Alluxio project. The work would involve solving challenging problems in the area of Distributed Data Services, Memory and data structure efficiency, Thread concurrency and locking optimizations, process coordination and caching policies and implementation.</p>\n<p>The role would include developing innovative solutions for scaling systems to thousands of nodes and providing Data Durability and High Availability.</p>\n<p>You will be part of a team that includes leaders, innovators, explorers, and risk-takers with extensive industry experience from top tech companies including Google, Palantir and VMWare and alumni from top computer science programs including CMU, Stanford and UC Berkeley.</p>\n<p>We are looking for someone with a BS degree in Computer Science, similar technical field of study or equivalent practical experience. 
You should have software development experience in one or more general purpose programming languages including but not limited to: Java, C/C++, or Go.</p>\n<p>Experience working with two or more from the following is a must: distributed and parallel systems, distributed storage systems, architecting large scale software systems, and/or security software development.</p>\n<p>Excellent analytical and problem solving skills are required. Working proficiency and communication skills in verbal and written English are also necessary.</p>\n<p>Preferred qualifications include a Master’s, PhD degree, further education or equivalent practical experience in engineering, computer science or other technical related field. Experience designing, developing, and deploying Kubernetes applications is also desirable.</p>\n<p>If you are interested in contributing to an open source project and want to work in a fast-paced, collaborative and iterative programming environment, please apply.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_94b47f45-76d","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Alluxio","sameAs":"https://www.alluxio.com/","logo":"https://logos.yubhub.co/alluxio.com.png"},"x-apply-url":"https://jobs.lever.co/alluxio/ad547017-b276-4c99-ae4e-4c5a073daf93","x-work-arrangement":"onsite","x-experience-level":"mid","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["Java","C/C++","Go","Distributed systems","Parallel systems","Distributed storage systems","Architecting large scale software systems","Security software development"],"x-skills-preferred":["Kubernetes","Master’s, PhD degree, further education or equivalent practical experience in engineering, computer science or other technical related 
field"],"datePosted":"2026-04-17T12:23:11.376Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Java, C/C++, Go, Distributed systems, Parallel systems, Distributed storage systems, Architecting large scale software systems, Security software development, Kubernetes, Master’s, PhD degree, further education or equivalent practical experience in engineering, computer science or other technical related field"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_37049070-1d7"},"title":"Software Engineer, Compute Infrastructure","description":"<p>About Mistral AI\nAt Mistral AI, we believe in the power of AI to simplify tasks, save time, and enhance learning and creativity.</p>\n<p>Our technology is designed to integrate seamlessly into daily working life. We democratize AI through high-performance, optimized, open-source and cutting-edge models, products and solutions. Our comprehensive AI platform is designed to meet enterprise needs, whether on-premises or in cloud environments.</p>\n<p>We are a team passionate about AI and its potential to transform society. Our diverse workforce thrives in competitive environments and is committed to driving innovation. Our teams are distributed between France, USA, UK, Germany and Singapore.</p>\n<p>Role Summary\nWe are building one of Europe&#39;s largest AI infrastructure offerings that will provide our customers a private and integrated stack in every form factor they may need — from bare-metal servers to fully-managed PaaS.</p>\n<p>You will join a fast-growing team to help build, scale and automate our computing management stack. 
You will be responsible for building fault-tolerant and reliable infrastructure to support both our internal processes and customer platform.</p>\n<p>Location: France and UK as primary locations. Remote in Europe can be considered under conditions.</p>\n<p>Key Responsibilities:\n• Design, build, and operate a scalable Kubernetes-based platform to host large-scale AI and HPC workloads, ensuring high performance, reliability, and security.\n• Own the full lifecycle of cluster management, from bootstrapping and provisioning to global operations, by integrating and developing the necessary software components—including automation, monitoring, and orchestration tools.\n• Drive infrastructure innovation by designing workflows, tooling (scripts, APIs, dashboards), and CI/CD pipelines to optimize system reliability, availability, and observability.\n• Champion a zero-trust security model, strengthening IAM, networking (VPC), and access controls to safeguard the platform.\n• Develop user-centric features that simplify operations for both sysadmins and end customers, reducing friction in daily workflows.\n• Lead incident resolution with rigorous root-cause analysis to prevent recurrence and improve system resilience.</p>\n<p>About you\n• Strong proficiency in software development (preferably Golang) and knowledge of software development best practices\n• Successful experience in an Infrastructure Engineering role (SWE, Platform, DevOps, Cloud...)\n• Deep understanding of Kubernetes internals and hands-on experience with containerization and orchestration tools (Docker, Kubernetes, Openstack...)\n• Familiarity with infrastructure-as-code tools like Terraform or CloudFormation\n• Knowledge of monitoring, logging, alerting and observability tools (Prometheus, Grafana, ELK, Datadog...)\n• Exposure to highly available distributed systems and site reliability issues in critical environments (issue root cause analysis, in-production troubleshooting, on-call rotations...)\n• 
Experience working against reliability KPIs (observability, alerting, SLAs)\n• Excellent problem-solving and communication skills\n• Self-motivation and ability to thrive in a fast-paced startup environment</p>\n<p>Now, it would be ideal if you also had:\n• Experience with HPC workload managers (Slurm) and distributed storage systems (Lustre, Ceph)\n• Demonstrated history of contributing to open-source projects (e.g., code, documentation, bug fixes, feature development, or community support).</p>\n<p>Additional Information\nLocation &amp; Remote\nThis role is primarily based in one of our European offices — Paris, France and London, UK. We will prioritize candidates who either reside there or are open to relocating. We strongly believe in the value of in-person collaboration to foster strong relationships and seamless communication within our team.</p>\n<p>In certain specific situations, we will also consider remote candidates based in one of the countries listed in this job posting — currently France, UK, Germany, Belgium, Netherlands, Spain and Italy.</p>\n<p>In any case, we ask all new hires to visit our Paris HQ office:\n• for the first week of their onboarding (accommodation and travelling covered)\n• then at least 2 days per month</p>\n<p>What we offer\nCompetitive salary and equity\nHealth insurance\nTransportation allowance\nSport allowance\nMeal vouchers\nPrivate pension plan\nGenerous parental leave policy\nVisa sponsorship</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_37049070-1d7","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Mistral AI","sameAs":"https://mistral.ai"},"x-apply-url":"https://jobs.lever.co/mistral/d60f6c60-ad5e-4753-af8a-56365b7db8b8","x-work-arrangement":"remote","x-experience-level":"mid","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["software 
development","Golang","Kubernetes","containerization","orchestration","infrastructure-as-code","Terraform","CloudFormation","monitoring","logging","alerting","observability","Prometheus","Grafana","ELK","Datadog"],"x-skills-preferred":["HPC workload managers","distributed storage systems","open-source projects"],"datePosted":"2026-03-10T11:35:56.693Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Paris"}},"jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"software development, Golang, Kubernetes, containerization, orchestration, infrastructure-as-code, Terraform, CloudFormation, monitoring, logging, alerting, observability, Prometheus, Grafana, ELK, Datadog, HPC workload managers, distributed storage systems, open-source projects"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_da726093-b19"},"title":"Research Engineer, Discovery","description":"<p><strong>About the Role</strong></p>\n<p>As a Research Engineer on our team, you will work end to end across the whole model stack, identifying and addressing key infra blockers on the path to scientific AGI. Strong candidates should have familiarity with elements of language model training, evaluation, and inference and eagerness to quickly dive and get up to speed in areas they are not yet an expert on. 
This may include performance optimization, distributed systems, VM/sandboxing/container deployment, and large scale data pipelines.</p>\n<p><strong>Responsibilities:</strong></p>\n<ul>\n<li>Design and implement large-scale infrastructure systems to support AI scientist training, evaluation, and deployment across distributed environments</li>\n<li>Identify and resolve infrastructure bottlenecks impeding progress toward scientific capabilities</li>\n<li>Develop robust and reliable evaluation frameworks for measuring progress towards scientific AGI.</li>\n<li>Build scalable and performant VM/sandboxing/container architectures to safely execute long-horizon AI tasks and scientific workflows</li>\n<li>Collaborate to translate experimental requirements into production-ready infrastructure</li>\n<li>Develop large scale data pipelines to handle advanced language model training requirements</li>\n<li>Optimize large scale training and inference pipelines for stable and efficient reinforcement learning</li>\n</ul>\n<p><strong>You may be a good fit if you:</strong></p>\n<ul>\n<li>Have 6+ years of highly-relevant experience in infrastructure engineering with demonstrated expertise in large-scale distributed systems</li>\n<li>Are a strong communicator and enjoy working collaboratively</li>\n<li>Possess deep knowledge of performance optimization techniques and system architectures for high-throughput ML workloads</li>\n<li>Have experience with containerization technologies (Docker, Kubernetes) and orchestration at scale</li>\n<li>Have proven track record of building large-scale data pipelines and distributed storage systems</li>\n<li>Excel at diagnosing and resolving complex infrastructure challenges in production environments</li>\n<li>Can work effectively across the full ML stack from data pipelines to performance optimization</li>\n<li>Have experience collaborating with other researchers to scale experimental ideas</li>\n<li>Thrive in fast-paced environments and can rapidly 
iterate from experimentation to production</li>\n</ul>\n<p><strong>Strong candidates may also have:</strong></p>\n<ul>\n<li>Experience with language model training infrastructure and distributed ML frameworks (PyTorch, JAX, etc.)</li>\n<li>Background in building infrastructure for AI research labs or large-scale ML organizations</li>\n<li>Knowledge of GPU/TPU architectures and language model inference optimization</li>\n<li>Experience with cloud platforms (AWS, GCP) at enterprise scale</li>\n<li>Familiarity with VM and container orchestration.</li>\n<li>Experience with workflow orchestration tools and experiment management systems</li>\n<li>History working with large scale reinforcement learning</li>\n<li>Comfort with large scale data pipelines (Beam, Spark, Dask, …)</li>\n</ul>\n<p><strong>Logistics</strong></p>\n<ul>\n<li>Education requirements: We require at least a Bachelor&#39;s degree in a related field or equivalent experience.</li>\n<li>Location-based hybrid policy: Currently, we expect all staff to be in one of our offices at least 25% of the time. However, some roles may require more time in our offices.</li>\n<li>Visa sponsorship: We do sponsor visas! However, we aren&#39;t able to successfully sponsor visas for every role and every candidate. But if we make you an offer, we will make every reasonable effort to get you a visa, and we retain an immigration lawyer to help with this.</li>\n</ul>\n<p><strong>We encourage you to apply even if you do not believe you meet every single qualification. Not all strong candidates will meet every single qualification as listed. Research shows that people who identify as being from underrepresented groups are more prone to experiencing imposter syndrome and doubting the strength of their candidacy, so we urge you not to exclude yourself prematurely and to submit an application if you&#39;re interested in this work.</strong></p>\n<p><strong>Your safety matters to us. 
To protect yourself from potential scams, remember that Anthropic recruiters only contact you from @anthropic.com email addresses. In some cases, we may partner with vetted recruiting agencies who will identify themselves as working on behalf of Anthropic. Be cautious of emails from other domains. Legitimate Anthropic recruiters will never ask for money, fees, or banking information before your first day. If you&#39;re ever unsure about a communication, don&#39;t click any links—visit anthropic.com/careers directly for confirmed position openings.</strong></p>\n<p><strong>How we&#39;re different</strong></p>\n<p>We believe that the highest-impact AI research will be big science. At Anthropic we work as a single cohesive team on just a few large-scale projects, and we&#39;re committed to making a positive impact on the world.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_da726093-b19","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://job-boards.greenhouse.io","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/4669581008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$350,000 - $850,000 USD","x-skills-required":["infrastructure engineering","large-scale distributed systems","performance optimization","containerization technologies","orchestration at scale","data pipelines","distributed storage systems","complex infrastructure challenges","ML stack","workflow orchestration tools","experiment management systems","reinforcement learning","large scale data pipelines"],"x-skills-preferred":["language model training infrastructure","distributed ML frameworks","GPU/TPU architectures","language model inference optimization","cloud platforms","VM and container 
orchestration","workflow orchestration tools","experiment management systems","large scale reinforcement learning","large scale data pipelines"],"datePosted":"2026-03-08T13:46:32.661Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"infrastructure engineering, large-scale distributed systems, performance optimization, containerization technologies, orchestration at scale, data pipelines, distributed storage systems, complex infrastructure challenges, ML stack, workflow orchestration tools, experiment management systems, reinforcement learning, large scale data pipelines, language model training infrastructure, distributed ML frameworks, GPU/TPU architectures, language model inference optimization, cloud platforms, VM and container orchestration, workflow orchestration tools, experiment management systems, large scale reinforcement learning, large scale data pipelines","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":350000,"maxValue":850000,"unitText":"YEAR"}}}]}