{"version":"0.1","company":{"name":"YubHub","url":"https://yubhub.co","jobsUrl":"https://yubhub.co/jobs/skill/gpu-systems"},"x-facet":{"type":"skill","slug":"gpu-systems","display":"GPU Systems","count":3},"x-feed-size-limit":100,"x-feed-sort":"enriched_at desc","x-feed-notice":"This feed contains at most 100 jobs (the most recently enriched). For the full corpus, use the paginated /stats/by-facet endpoint or /search.","x-generator":"yubhub-xml-generator","x-rights":"Free to redistribute with attribution: \"Data by YubHub (https://yubhub.co)\"","x-schema":"Each entry in `jobs` follows https://schema.org/JobPosting. YubHub-native raw fields carry `x-` prefix.","jobs":[{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_07256a9e-2a8"},"title":"Senior Electrical Engineer","description":"<p>We&#39;re seeking a skilled Senior Electrical Engineer to join our team. As a senior member of our engineering team, you will design servers, develop and review board designs, collaborate with exceptional engineers developing cutting-edge AI/ML hardware, and review JDM&#39;s high-speed design. You will also conduct schematics, board design, and power design reviews, take design from concept to mass production, and work closely with manufacturing teams. To be successful in this role, you should have at least 5 years of internal design experience developing complex hardware systems with high-speed design interfaces, strong skills in electrical board design, and solid experience with high-speed interfaces. 
You should also be able to negotiate and reach consensus with developers and fellow colleagues from interdisciplinary teams, as well as have excellent documentation skills.</p>\n<p>In addition to a competitive salary, we offer a variety of benefits to support your needs, including medical, dental, and vision insurance, company-paid life insurance, voluntary supplemental life insurance, short and long-term disability insurance, flexible spending account, health savings account, tuition reimbursement, ability to participate in employee stock purchase program (ESPP), mental wellness benefits through Spring Health, family-forming support provided by Carrot, paid parental leave, flexible, full-service childcare support with Kinside, 401(k) with a generous employer match, flexible PTO, catered lunch each day in our office and data center locations, and a casual work environment.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_07256a9e-2a8","directApply":true,"hiringOrganization":{"@type":"Organization","name":"CoreWeave","sameAs":"https://www.coreweave.com","logo":"https://logos.yubhub.co/coreweave.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/coreweave/jobs/4606485006","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$165,000 to $242,000","x-skills-required":["electrical board design","high-speed interfaces","server design","power design","thermal design","mechanical design","signal integrity","PCB design","PCBA design","system assembly manufacturing","testing","design for mass manufacturing","reliability"],"x-skills-preferred":["hyperscaler space","GPU systems","ODM/JDM design model","high-speed SI simulation tools"],"datePosted":"2026-04-18T15:50:03.750Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"New York, NY / Sunnyvale, CA / Bellevue, 
WA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"electrical board design, high-speed interfaces, server design, power design, thermal design, mechanical design, signal integrity, PCB design, PCBA design, system assembly manufacturing, testing, design for mass manufacturing, reliability, hyperscaler space, GPU systems, ODM/JDM design model, high-speed SI simulation tools","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":165000,"maxValue":242000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_b151fcc2-2fb"},"title":"Member of Technical Staff, High Performance Computing Engineer","description":"<p>We are looking for experienced Member of Technical Staff, High Performance Computing Engineers to help build and scale the infrastructure that trains our frontier models and powers the next evolution of our personal AI, Copilot.</p>\n<p>This role offers the unique opportunity to work on some of the largest scale supercomputers in the world – a rare chance to operate at such a significant scale.</p>\n<p><strong>Responsibilities</strong></p>\n<p>Design, operate, and maintain large-scale HPC environments, drawing on hands-on engineering experience in production settings.</p>\n<p>Own the deployment, configuration, and day-to-day operation of HPC schedulers (e.g., SLURM, Kubernetes), ensuring reliable and efficient job scheduling at scale.</p>\n<p>Serve as a technical owner for at least one core HPC domain (GPU compute, high-performance storage, networking, or similar), including ongoing maintenance, performance tuning, and troubleshooting of massive clusters.</p>\n<p>Develop and maintain automation and tooling using Bash and/or Python to improve cluster reliability, observability, and operational efficiency.</p>\n<p>Partner closely with researchers and engineers to support 
their workloads, troubleshoot cluster usage issues, and triage failed or underperforming jobs to resolution.</p>\n<p>Drive work forward independently by navigating ambiguity and technical roadblocks, delivering incremental improvements that get capabilities into users’ hands quickly.</p>\n<p><strong>Qualifications</strong></p>\n<p>Bachelor’s degree in computer science or related technical field AND 4+ years technical engineering experience with deploying or operating on-premise or cloud high-performance clusters, AND 4+ years experience working with high-scale training clusters (ex. working with frameworks/tools such as nvidia InfiniBand clusters, SLURM, Kubernetes, Ray, etc.), AND 4+ years experience building scalable services on top of public cloud infrastructure like Azure, AWS, or GCP, OR equivalent experience.</p>\n<p><strong>Preferred Qualifications</strong></p>\n<p>Master’s Degree in Computer Science or related technical field AND 6+ years technical engineering experience with deploying or operating on-premise or cloud high-performance clusters, AND 6+ years experience working with high-scale training clusters (ex. 
working with frameworks/tools such as nvidia InfiniBand clusters, SLURM, Kubernetes, Ray, etc.), AND 6+ years experience building scalable services on top of public cloud infrastructure like Azure, AWS, or GCP, OR equivalent experience.</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_b151fcc2-2fb","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Microsoft AI","sameAs":"https://microsoft.ai","logo":"https://logos.yubhub.co/microsoft.ai.png"},"x-apply-url":"https://microsoft.ai/job/member-of-technical-staff-high-performance-computing-engineer-mai-superintelligence-team-3/","x-work-arrangement":"onsite","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["HPC","SLURM","Kubernetes","GPU compute","high-performance storage","networking","Bash","Python","nvidia InfiniBand clusters","Ray"],"x-skills-preferred":["LLM training clusters","AI platforms","Machine Learning frameworks","large-scale HPC or GPU systems"],"datePosted":"2026-03-08T22:15:08.170Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Zürich"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"HPC, SLURM, Kubernetes, GPU compute, high-performance storage, networking, Bash, Python, nvidia InfiniBand clusters, Ray, LLM training clusters, AI platforms, Machine Learning frameworks, large-scale HPC or GPU systems"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_8a34364f-8c5"},"title":"Member of Technical Staff, Hardware Health","description":"<p><strong>Summary</strong></p>\n<p>Microsoft AI are looking for a talented Member of Technical Staff, Hardware Health, to ensure these systems deliver sustained reliability, performance, and availability across exascale-class 
deployments.</p>\n<p><strong>About the Role</strong></p>\n<p>We work closely with research, hardware, datacenter, and platform engineering teams to develop predictive health models, failure detection frameworks, and autonomous remediation systems that keep our AI clusters operating at frontier scale. Our team is responsible for Copilot, Bing, Edge, and generative AI research.</p>\n<p><strong>Accountabilities</strong></p>\n<ul>\n<li>Design and develop next-generation hardware health monitoring and diagnostic frameworks for large GPU clusters (NVL16/NVL72/GB200+ scale).</li>\n<li>Build predictive analytics pipelines leveraging telemetry, power, and thermal data to anticipate hardware degradation and systemic issues.</li>\n<li>Collaborate with silicon, firmware, and datacenter engineers to identify root causes and remediate large-scale hardware anomalies.</li>\n<li>Define system health KPIs (e.g., NIS/RIS, MTBF, failure domain analysis) and integrate them into real-time observability platforms.</li>\n<li>Lead incident triage for high-impact GPU, network, and cooling issues across distributed clusters.</li>\n<li>Drive automation in health management to reduce manual intervention to the top 5% of anomalies.</li>\n<li>Partner with cross-functional teams to influence hardware design for reliability, thermal efficiency, and serviceability.</li>\n</ul>\n<p><strong>The Candidate we&#39;re looking for</strong></p>\n<p><strong>Experience:</strong></p>\n<ul>\n<li>Bachelor&#39;s Degree in Computer Science or related technical field AND 6+ years technical engineering experience with coding in languages including, but not limited to, C, C++, C#, Java, JavaScript, or Python OR equivalent experience.</li>\n</ul>\n<p><strong>Technical skills:</strong></p>\n<ul>\n<li>Experience working with large-scale HPC or GPU systems (NVIDIA H100/GB200 or equivalent).</li>\n<li>Deep understanding of GPU architecture, high-speed interconnects (NVLink, InfiniBand, RoCE), and large datacenter 
topologies.</li>\n<li>Proficiency in hardware telemetry, diagnostics, or failure analysis tools.</li>\n</ul>\n<p><strong>Personal attributes:</strong></p>\n<ul>\n<li>Strong analytical and problem-solving skills.</li>\n<li>Excellent communication and collaboration skills.</li>\n</ul>\n<p><strong>Benefits</strong></p>\n<ul>\n<li>Competitive salary.</li>\n<li>Comprehensive benefits package.</li>\n<li>Opportunities for professional growth and development.</li>\n<li>Collaborative and dynamic work environment.</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_8a34364f-8c5","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Microsoft AI","sameAs":"https://microsoft.ai","logo":"https://logos.yubhub.co/microsoft.ai.png"},"x-apply-url":"https://microsoft.ai/job/member-of-technical-staff-hardware-health-mai-superintelligence-team-5/","x-work-arrangement":"onsite","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"USD $139,900 – $274,800 per year","x-skills-required":["C","C++","C#","Java","JavaScript","Python","GPU architecture","high-speed interconnects","hardware telemetry","diagnostics","failure analysis tools"],"x-skills-preferred":["experience working with large-scale HPC or GPU systems","deep understanding of GPU architecture","proficiency in hardware telemetry","diagnostics","failure analysis tools"],"datePosted":"2026-03-06T07:33:03.791Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"New York"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"C, C++, C#, Java, JavaScript, Python, GPU architecture, high-speed interconnects, hardware telemetry, diagnostics, failure analysis tools, experience working with large-scale HPC or GPU systems, deep understanding of GPU architecture, proficiency in hardware telemetry, 
diagnostics, failure analysis tools","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":139900,"maxValue":274800,"unitText":"YEAR"}}}]}