{"version":"0.1","company":{"name":"YubHub","url":"https://yubhub.co","jobsUrl":"https://yubhub.co/jobs/skill/exascale-class-systems"},"x-facet":{"type":"skill","slug":"exascale-class-systems","display":"exascale-class systems","count":1},"x-feed-size-limit":100,"x-feed-sort":"enriched_at desc","x-feed-notice":"This feed contains at most 100 jobs (the most recently enriched). For the full corpus, use the paginated /stats/by-facet endpoint or /search.","x-generator":"yubhub-xml-generator","x-rights":"Free to redistribute with attribution: \"Data by YubHub (https://yubhub.co)\"","x-schema":"Each entry in `jobs` follows https://schema.org/JobPosting. YubHub-native raw fields carry `x-` prefix.","jobs":[{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_0bf1fd91-846"},"title":"Member of Technical Staff, Hardware Health","description":"<p><strong>Summary</strong></p>\n<p>Microsoft AI are looking for a talented Member of Technical Staff, Hardware Health, to ensure these systems deliver sustained reliability, performance, and availability across exascale-class deployments.</p>\n<p><strong>About the Role</strong></p>\n<p>We work closely with research, hardware, datacenter, and platform engineering teams to develop predictive health models, failure detection frameworks, and autonomous remediation systems that keep our AI clusters operating at frontier scale. Our team is responsible for Copilot, Bing, Edge, and generative AI research. Join us and help shape the future of personal computing.</p>\n<p><strong>Accountabilities</strong></p>\n<ul>\n<li>Design and develop next-generation hardware health monitoring and diagnostic frameworks for large GPU clusters (NVL16/NVL72/GB200+ scale).</li>\n<li>Build predictive analytics pipelines leveraging telemetry, power, and thermal data to anticipate hardware degradation and systemic issues.</li>\n</ul>\n<p><strong>The Candidate we&#39;re looking for</strong></p>\n<p><strong>Experience:</strong></p>\n<ul>\n<li>Bachelor&#39;s Degree in Computer Science or related technical field AND 6+ years technical engineering experience with coding in languages including, but not limited to, C, C++, C#, Java, JavaScript, or Python OR equivalent experience.</li>\n</ul>\n<p><strong>Technical skills:</strong></p>\n<ul>\n<li>Proficiency in hardware telemetry, diagnostics, or failure analysis tools.</li>\n<li>Experience with exascale-class systems or cloud-scale AI clusters.</li>\n</ul>\n<p><strong>Personal attributes:</strong></p>\n<ul>\n<li>Strong analytical and problem-solving skills.</li>\n<li>Excellent communication and collaboration skills.</li>\n</ul>\n<p><strong>Benefits</strong></p>\n<ul>\n<li>Competitive salary range: $139,900 - $274,800 per year.</li>\n<li>Comprehensive benefits package, including health insurance, retirement plan, and paid time off.</li>\n<li>Opportunities for professional growth and development.</li>\n<li>Collaborative and dynamic work environment.</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_0bf1fd91-846","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Microsoft AI","sameAs":"https://microsoft.ai","logo":"https://logos.yubhub.co/microsoft.ai.png"},"x-apply-url":"https://microsoft.ai/job/member-of-technical-staff-hardware-health-mai-superintelligence-team-3/?utm_source=yubhub.co&utm_medium=jobs_feed&utm_campaign=apply","x-work-arrangement":"onsite","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$139,900 - $274,800 per year","x-skills-required":["C","C++","C#","Java","JavaScript","Python","hardware telemetry","diagnostics","failure analysis tools","exascale-class systems","cloud-scale AI clusters"],"x-skills-preferred":["machine learning","data analysis","problem-solving"],"datePosted":"2026-03-06T07:32:20.374Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Mountain View"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"C, C++, C#, Java, JavaScript, Python, hardware telemetry, diagnostics, failure analysis tools, exascale-class systems, cloud-scale AI clusters, machine learning, data analysis, problem-solving","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":139900,"maxValue":274800,"unitText":"YEAR"}}}]}