{"version":"0.1","company":{"name":"YubHub","url":"https://yubhub.co","jobsUrl":"https://yubhub.co/jobs/skill/inference-serving"},"x-facet":{"type":"skill","slug":"inference-serving","display":"Inference Serving","count":7},"x-feed-size-limit":100,"x-feed-sort":"enriched_at desc","x-feed-notice":"This feed contains at most 100 jobs (the most recently enriched). For the full corpus, use the paginated /stats/by-facet endpoint or /search.","x-generator":"yubhub-xml-generator","x-rights":"Free to redistribute with attribution: \"Data by YubHub (https://yubhub.co)\"","x-schema":"Each entry in `jobs` follows https://schema.org/JobPosting. YubHub-native raw fields carry `x-` prefix.","jobs":[{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_e5554df1-95d"},"title":"Software Engineer","description":"<p><strong>About the Role</strong></p>\n<p>As a Software Engineer on the Model Routing &amp; Inference team at Cursor, you&#39;ll build the inference platform that powers every AI interaction in the product.</p>\n<p>This team owns the full inference path: making Cursor&#39;s AI faster, more reliable, and more cost-effective at a scale few teams in the world get to operate at. Every agent session, every tab completion, and every chat message flows through your stack.</p>\n<p><strong>Example Projects Include...</strong></p>\n<ul>\n<li>Building and evolving our inference gateway, a single abstraction over every provider&#39;s API semantics, so model onboarding becomes a config change.</li>\n<li>Designing intelligent cross-provider failover so no single provider outage causes user-visible degradation.</li>\n<li>Designing routing backpressure and admission control so traffic spikes don&#39;t cascade into providers.</li>\n</ul>\n<p><strong>You May Be a Fit If...</strong></p>\n<ul>\n<li>You have deep experience building high-throughput, low-latency distributed systems, especially in inference serving, traffic routing, or real-time data pipelines.</li>\n<li>You&#39;re comfortable reasoning about cost/performance tradeoffs at scale (GPU utilization, provider economics, capacity planning).</li>\n<li>You have strong software engineering fundamentals and enjoy shipping production systems that handle millions of requests.</li>\n<li>You make good calls in the gray area: weighing reliability, cost, latency, and user experience when there isn&#39;t a single &#39;right&#39; answer.</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_e5554df1-95d","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Cursor","sameAs":"https://cursor.com","logo":"https://logos.yubhub.co/cursor.com.png"},"x-apply-url":"https://cursor.com/careers/software-engineer-model-routing-inference","x-work-arrangement":"remote","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":null,"x-skills-required":["high-throughput distributed systems","inference serving","traffic routing","real-time data pipelines","cost/performance tradeoffs"],"x-skills-preferred":[],"datePosted":"2026-04-24T14:17:45.329Z","jobLocationType":"TELECOMMUTE","employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"high-throughput distributed systems, inference serving, traffic routing, real-time data pipelines, cost/performance 
tradeoffs"},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_5e20ca92-993"},"title":"Principal Software Engineer","description":"<p>Monetization Engineering is responsible for building a unified, intelligent, and resilient monetization platform that drives revenue across Microsoft’s AI-native surfaces, including Copilot, Search, MSN, Shopping, and both first-party and third-party ecosystems.</p>\n<p>Our mission is to enhance advertiser value, optimize platform performance, and achieve long-term revenue growth through large-scale systems, machine learning-driven optimization, experimentation, and cross-surface innovation.</p>\n<p>We are seeking an experienced professional with expertise in GPU inference optimization and a deep understanding of LLM/SLM architecture to join our team.</p>\n<p>This is a unique opportunity to contribute to cutting-edge advancements in AI and deep learning while driving impactful solutions for Microsoft’s advertising and monetization platforms.</p>\n<p>Microsoft’s mission is to empower every person and every organization on the planet to achieve more.</p>\n<p>As employees we come together with a growth mindset, innovate to empower others, and collaborate to realize our shared goals.</p>\n<p>Each day we build on our values of respect, integrity, and accountability to create a culture of inclusion where everyone can thrive at work and beyond.</p>\n<p>Starting January 26, 2026, Microsoft AI (MAI) employees who live within a 50-mile commute of a designated Microsoft office in the U.S. or 25-mile commute of a non-U.S., country-specific location are expected to work from the office at least four days per week.</p>\n<p>This expectation is subject to local law and may vary by jurisdiction.</p>\n<p>Responsibilities:</p>\n<p>Serves as the technological core of Microsoft’s rapidly expanding digital advertising business.</p>\n<p>Focus on accelerating Microsoft’s large-scale deep learning inference for Ads, Shopping, Copilot, and other surfaces, including both offline and online applications that support OpenAI LLM models and next-generation LLMs/SLMs.</p>\n<p>Play a pivotal role in bridging state-of-the-art GPU and deep learning technologies with critical business applications.</p>\n<p>Qualifications:</p>\n<p>Required Qualifications:</p>\n<p>Bachelor’s Degree in Computer Science or related technical field AND 8+ years technical engineering experience with coding in languages including, but not limited to, C, C++, C#, Java, JavaScript, or Python OR equivalent experience.</p>\n<p>Ability to meet Microsoft, customer and/or government security screening requirements are required for this role.</p>\n<p>These requirements include but are not limited to the following specialized security screenings:</p>\n<p>Microsoft Cloud Background Check: This position will be required to pass the Microsoft Cloud background check upon hire/transfer and every two years thereafter.</p>\n<p>Preferred Qualifications:</p>\n<p>Master’s Degree in Computer Science or related technical field AND 12+ years technical engineering experience with coding in languages including, but not limited to, C, C++, C#, Java, JavaScript, or Python OR Bachelor’s Degree in Computer Science or related technical field AND 15+ years technical engineering experience with coding in languages including, but not limited to, C, C++, C#, Java, JavaScript, or Python OR equivalent experience.</p>\n<p>Solid experience in GPU inference optimization (CUDA, TensorRT, 
Triton, or custom GPU kernels).</p>\n<p>Proficiency in profiling tools (Nsight, TensorBoard, PyTorch profiler) and ability to identify CPU/GPU bottlenecks.</p>\n<p>Deep understanding of LLM/SLM architectures (attention, embeddings, MoE, decoders).</p>\n<p>Experience optimizing latency-critical online services.</p>\n<p>Experience with model compression (quantization, distillation, SVD, low-rank methods).</p>\n<p>Experience in building high-throughput inference serving stacks (continuous batching, KV-cache optimizations, routing).</p>\n<p>Familiarity with Microsoft’s DLIS, Talon routing, Triton/TensorRT-LLM stack, and Azure/H100/A100 GPU environments.</p>\n<p>Publications, competition wins, or real-world deployments related to model efficiency.</p>\n<p>#MicrosoftAI</p>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_5e20ca92-993","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Microsoft","sameAs":"https://microsoft.ai","logo":"https://logos.yubhub.co/microsoft.ai.png"},"x-apply-url":"https://microsoft.ai/job/principal-software-engineer-47/","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$163,000 - $296,400 per year","x-skills-required":["GPU inference optimization","LLM/SLM architecture","C","C++","C#","Java","JavaScript","Python","CUDA","TensorRT","Triton","custom GPU kernels","profiling tools","CPU/GPU bottlenecks","model compression","high-throughput inference serving stacks","DLIS","Talon routing","Triton/TensorRT-LLM stack","Azure/H100/A100 GPU environments"],"x-skills-preferred":[],"datePosted":"2026-04-24T12:10:41.636Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Redmond"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"GPU inference optimization, LLM/SLM architecture, C, C++, C#, Java, JavaScript, Python, CUDA, TensorRT, Triton, custom GPU kernels, profiling tools, CPU/GPU bottlenecks, model compression, high-throughput inference serving stacks, DLIS, Talon routing, Triton/TensorRT-LLM stack, Azure/H100/A100 GPU environments","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":163000,"maxValue":296400,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_ac45e205-e7d"},"title":"Engineering Manager, Inference Routing and Performance","description":"<p><strong>About the role</strong></p>\n<p>Every request that hits Claude , from claude.ai, the API, our cloud partners, or internal research , passes through a routing decision. Not a generic load balancer round-robin, but a decision that accounts for what&#39;s already cached where, which accelerator the request runs best on, and what else is in flight across the fleet.</p>\n<p>The Inference Routing team owns this layer. 
We build the cluster-level routing and coordination plane for Anthropic&#39;s inference fleet: the system that sits between the API surface and the inference engines themselves, making fleet-wide efficiency decisions in real time.</p>\n<p><strong>Representative work:</strong></p>\n<p>Things the Inference Routing EM actually spends time on:</p>\n<ul>\n<li>Deciding whether a proposed routing algorithm change is worth the deploy risk, given the modeled throughput gain and the blast radius if it regresses</li>\n<li>Sequencing a quarter where KV-cache offload, a new coordination protocol, and two model launches all compete for the same engineers</li>\n<li>Working through a persistent tail-latency regression with the team, walking down from fleet-level metrics to per-replica behavior to a root cause in the networking stack</li>\n<li>Building the case (with numbers) to peer teams for why a cross-team protocol change unlocks the next efficiency win</li>\n<li>Running the post-incident review after a cache-eviction bug caused a capacity event, and turning it into process changes that stick</li>\n<li>Interviewing a candidate who has built schedulers at supercomputing scale, and deciding whether they&#39;d be additive to a team that already goes deep</li>\n</ul>\n<p><strong>Drive system-level performance</strong></p>\n<ul>\n<li>Own the technical roadmap for cluster-level inference efficiency: routing decisions, cache placement and eviction, cross-replica coordination, and the protocols that keep routing and inference engines in sync</li>\n<li>Partner with the inference engine, kernels, and performance teams to identify fleet-level throughput and latency wins, then turn those into shipped improvements with measurable results</li>\n<li>Build the team&#39;s habit of quantitative performance modeling: claim a win only when you can measure it, and know before you ship what the expected effect is</li>\n</ul>\n<p><strong>Deliver reliably and operate cleanly</strong></p>\n<ul>\n<li>Set technical strategy for how routing evolves across heterogeneous hardware (GPUs, TPUs, Trainium) and across all our serving surfaces</li>\n<li>Run the team&#39;s operational backbone (on-call rotation, incident response, postmortem review, deploy safety) so the team can ship aggressively without the system becoming fragile</li>\n<li>Create clarity at a seam: Inference Routing sits between the API surface, the inference engines, and the cloud deployment teams. You&#39;ll make sure commitments are realistic, dependencies are understood, and nobody is surprised</li>\n</ul>\n<p><strong>Build and grow the team</strong></p>\n<ul>\n<li>Develop and retain a strong existing team, and hire against the bar described above: people who can go to the OS and framework level when the problem demands it, and who care about production reliability</li>\n<li>Coach engineers through a roadmap where priorities shift with model launches, new hardware, and scaling demands. We pair a lot here; you&#39;ll help make that collaboration pattern productive</li>\n<li>Pick up slack when it matters. This is a small team in a critical path; sometimes the EM is the one unblocking a stuck deploy or synthesizing a design debate</li>\n</ul>\n<p><strong>You may be a good fit if you:</strong></p>\n<ul>\n<li>Have 5+ years of engineering management experience, ideally with at least part of that leading teams on critical-path production infrastructure at scale</li>\n<li>Have a deep systems background: load balancing, scheduling, cache-coherent distributed state, high-performance networking, or similar. You need enough depth to make architectural calls about routing and efficiency, and to evaluate candidates who go to the kernel and framework level</li>\n<li>Have shipped performance improvements in large-scale systems and can explain, with numbers, what the impact was</li>\n<li>Have run production infrastructure with real operational stakes: on-call, incident response, capacity events, deploy discipline</li>\n<li>Are results-oriented with a bias toward impact, and comfortable working in a space where throughput, latency, stability, and feature velocity all pull in different directions</li>\n<li>Build strong relationships across team boundaries; this is a seam role, and much of the job is making sure other teams can rely on yours</li>\n<li>Are curious about machine learning systems. You don&#39;t need an ML research background, but you should want to learn how transformer inference actually works and how that shapes the systems problems</li>\n</ul>\n<p><strong>Strong candidates may also have:</strong></p>\n<ul>\n<li>Experience with LLM inference serving: KV caching, continuous batching, request scheduling, prefill/decode disaggregation</li>\n<li>Background in cluster schedulers, load balancers, service meshes, or coordination planes at scale</li>\n<li>Familiarity with heterogeneous accelerator fleets (GPU/TPU/Trainium) and how hardware differences affect workload placement</li>\n<li>Experience with GPU/accelerator programming, ML framework internals, or OS-level performance debugging, enough to follow and evaluate the technical work, not necessarily to do it daily</li>\n<li>Led teams at supercomputing or hyperscaler infrastructure scale</li>\n<li>Led teams through rapid-growth periods where hiring and onboarding competed with roadmap delivery</li>\n</ul>\n<p>The annual compensation range for this role is listed below.</p>\n<p>For sales roles, the range provided is the role’s On Target Earnings (“OTE”) range, meaning that the range includes both the sales commissions/sales bonuses target and annual base salary for the role.</p>\n<p>Annual Salary: $405,000-$485,000 USD</p>","url":"https://yubhub.co/jobs/job_ac45e205-e7d","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5155391008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"Annual Salary: $405,000-$485,000 USD","x-skills-required":["engineering management","inference routing","cluster-level routing","cache placement and eviction","cross-replica coordination","protocols","heterogeneous hardware","GPUs","TPUs","Trainium","machine
learning systems","transformer inference","LLM inference serving","KV caching","continuous batching","request scheduling","prefill/decode disaggregation","cluster schedulers","load balancers","service meshes","coordination planes","GPU/accelerator programming","ML framework internals","OS-level performance debugging"],"x-skills-preferred":[],"datePosted":"2026-04-24T11:25:04.722Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"engineering management, inference routing, cluster-level routing, cache placement and eviction, cross-replica coordination, protocols, heterogeneous hardware, GPUs, TPUs, Trainium, machine learning systems, transformer inference, LLM inference serving, KV caching, continuous batching, request scheduling, prefill/decode disaggregation, cluster schedulers, load balancers, service meshes, coordination planes, GPU/accelerator programming, ML framework internals, OS-level performance debugging","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":405000,"maxValue":485000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_460d00aa-b48"},"title":"Senior / Staff+ Software Engineer, Voice Platform","description":"<p>About the role</p>\n<p>We&#39;re building the infrastructure that lets people talk to Claude,real-time, bidirectional voice conversations that feel natural, responsive, and safe. This is foundational work for how millions of people will interact with AI.</p>\n<p>The Voice Platform team designs and operates the serving systems, streaming pipelines, and APIs that bring Anthropic&#39;s audio models from research into production across Claude.ai, our mobile apps, and the Anthropic API. You&#39;ll work at the intersection of real-time media, low-latency inference, and distributed systems,building infrastructure where every millisecond of latency is felt by the user.</p>\n<p>We partner closely with the Audio research team, who train the speech understanding and generation models, and with product teams shipping voice experiences to users. 
Your job is to make those models fast, reliable, and delightful to talk to at scale.</p>\n<p>Responsibilities</p>\n<ul>\n<li>Design and build the real-time streaming infrastructure that powers voice conversations with Claude,ingesting microphone audio, orchestrating model inference, and streaming synthesized speech back with minimal latency</li>\n</ul>\n<ul>\n<li>Build low-latency serving systems for speech models, optimizing time-to-first-audio and end-to-end conversational responsiveness</li>\n</ul>\n<ul>\n<li>Develop the public and internal APIs that expose voice capabilities to Claude.ai, mobile clients, and third-party developers</li>\n</ul>\n<ul>\n<li>Own the audio transport layer,codecs, jitter buffers, adaptive bitrate, packet loss recovery,so conversations stay smooth across unreliable networks</li>\n</ul>\n<ul>\n<li>Build observability and quality-measurement systems for voice: latency distributions, audio quality metrics, interruption handling, and turn-taking accuracy</li>\n</ul>\n<ul>\n<li>Partner with Audio research to move new model architectures from experiment to production, and feed real-world performance data back into research</li>\n</ul>\n<ul>\n<li>Collaborate with mobile and product engineering on client-side audio capture, playback, and the end-to-end user experience</li>\n</ul>\n<p>You may be a good fit if you</p>\n<ul>\n<li>Have 6+ years of experience building distributed systems, real-time infrastructure, or platform services at scale</li>\n</ul>\n<ul>\n<li>Have shipped production systems where latency is measured in tens of milliseconds and users notice when you miss</li>\n</ul>\n<ul>\n<li>Are comfortable working across the stack,from transport protocols and serving infrastructure up to the APIs product teams build on</li>\n</ul>\n<ul>\n<li>Are results-oriented, with a bias toward flexibility and impact</li>\n</ul>\n<ul>\n<li>Pick up slack, even if it goes outside your job description</li>\n</ul>\n<ul>\n<li>Enjoy pair programming (we love to pair!)</li>\n</ul>\n<ul>\n<li>Care about the societal impacts of voice AI and want to help shape how these systems are developed responsibly</li>\n</ul>\n<ul>\n<li>Are comfortable with ambiguity,voice is a fast-moving space, and you&#39;ll help define the architecture as we learn what works</li>\n</ul>\n<p>Strong candidates may also have experience with</p>\n<ul>\n<li>Real-time media protocols and stacks: WebRTC, RTP, gRPC bidirectional streaming, or WebSockets at scale</li>\n</ul>\n<ul>\n<li>Audio engineering fundamentals: codecs (Opus, AAC), voice activity detection, echo cancellation, jitter buffering, or audio DSP</li>\n</ul>\n<ul>\n<li>Low-latency ML inference serving, streaming model outputs, or GPU-based serving infrastructure</li>\n</ul>\n<ul>\n<li>Telephony, live streaming, video conferencing, or voice assistant platforms</li>\n</ul>\n<ul>\n<li>Mobile audio pipelines on iOS (AVAudioEngine, AudioUnits) or Android (Oboe, AAudio)</li>\n</ul>\n<ul>\n<li>Working alongside ML researchers to productionize models,speech experience is a plus but not required</li>\n</ul>\n<p>Representative projects</p>\n<ul>\n<li>Driving time-to-first-audio below human perceptual thresholds by co-designing the serving pipeline with the Audio research team</li>\n</ul>\n<ul>\n<li>Building a streaming inference orchestrator that interleaves speech recognition, LLM reasoning, and speech synthesis with overlapping execution</li>\n</ul>\n<ul>\n<li>Designing the voice mode API surface for the Anthropic API so developers can build their own voice 
agents on Claude</li>\n</ul>\n<ul>\n<li>Implementing graceful barge-in and interruption handling so users can cut Claude off mid-sentence naturally</li>\n</ul>\n<ul>\n<li>Instrumenting end-to-end audio quality metrics and building dashboards that catch regressions before users do</li>\n</ul>\n<p style=\"margin-top:24px;font-size:13px;color:#666;\">XML job scraping automation by <a href=\"https://yubhub.co\">YubHub</a></p>","url":"https://yubhub.co/jobs/job_460d00aa-b48","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5172245008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$320,000-$485,000 USD","x-skills-required":["Real-time media protocols and stacks","Audio engineering fundamentals","Low-latency ML inference serving","Distributed systems","Streaming pipelines","APIs"],"x-skills-preferred":["WebRTC","RTP","gRPC bidirectional streaming","WebSockets","Opus","AAC","Voice activity detection","Echo cancellation","Jitter buffering","Audio DSP","GPU-based serving infrastructure","Telephony","Live streaming","Video conferencing","Voice assistant platforms","Mobile audio pipelines on iOS","Android","Working alongside ML researchers"],"datePosted":"2026-04-18T15:59:54.712Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY | Seattle, WA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Real-time media protocols and stacks, Audio engineering fundamentals, Low-latency ML inference serving, Distributed systems, Streaming pipelines, APIs, WebRTC, RTP, gRPC bidirectional streaming, WebSockets, Opus, AAC, Voice activity detection, Echo cancellation, Jitter buffering, Audio DSP, GPU-based serving infrastructure, Telephony, Live streaming, Video conferencing, Voice assistant platforms, Mobile audio pipelines on iOS, Android, Working alongside ML researchers","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":320000,"maxValue":485000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_8a3caae4-044"},"title":"Member of Technical Staff - Imagine Model","description":"<p>As a Member of Technical Staff on the Imagine Model Team, you will develop cutting-edge AI experiences beyond text, with a strong focus on enabling high-fidelity understanding and generation across image and video modalities. Responsibilities span data curation, modeling, training, inference serving, and product integration, covering both pretraining and post-training phases. 
You will collaborate closely with product teams to push model frontiers and deliver exceptional end-to-end user experiences.</p>\n<p>Key responsibilities include:</p>\n<ul>\n<li>Creating and driving engineering agendas to advance multimodal capabilities</li>\n<li>Improving data quality through annotation, filtering, augmentation, synthetic generation, captioning, and in-depth data studies</li>\n<li>Designing evaluation frameworks, metrics, benchmarks, evals, and reward models tailored to image/video/audio quality and coherence</li>\n<li>Implementing efficient algorithms for state-of-the-art model performance</li>\n<li>Developing scalable data collection and processing pipelines for multimodal (primarily image/video-focused) datasets</li>\n</ul>\n<p>The ideal candidate will have:</p>\n<ul>\n<li>A track record of leading studies that significantly improve neural network capabilities and performance through better data or modeling</li>\n<li>Experience in data-driven experiment design, systematic analysis, and iterative model debugging</li>\n<li>Experience developing or working with large-scale distributed machine learning systems</li>\n<li>The ability to deliver optimal end-to-end user experiences</li>\n</ul>","url":"https://yubhub.co/jobs/job_8a3caae4-044","directApply":true,"hiringOrganization":{"@type":"Organization","name":"xAI","sameAs":"https://www.xai.com/","logo":"https://logos.yubhub.co/xai.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/xai/jobs/5051985007","x-work-arrangement":"hybrid","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$180,000 - $440,000 USD","x-skills-required":["data curation","modeling","training","inference serving","product integration","large-scale distributed machine learning systems"],"x-skills-preferred":["SFT","RL","evals","human/synthetic data collection","agentic systems","Python","JAX/XLA","PyTorch","Rust/C++","Spark","Ray"],"datePosted":"2026-04-18T15:58:43.641Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Palo Alto, CA; Seattle, WA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"data curation, modeling, training, inference serving, product integration, large-scale distributed machine learning systems, SFT, RL, evals, human/synthetic data collection, agentic systems, Python, JAX/XLA, PyTorch, Rust/C++, Spark, Ray","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":180000,"maxValue":440000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_ce9f3d34-c8a"},"title":"Senior / Staff+ Software Engineer, Voice Platform","description":"<p>We&#39;re building the infrastructure that lets people talk to Claude: real-time, bidirectional voice conversations that feel natural, responsive, and safe. This is foundational work for how millions of people will interact with AI.</p>\n<p>The Voice Platform team designs and operates the serving systems, streaming pipelines, and APIs that bring Anthropic&#39;s audio models from research into production across Claude.ai, our mobile apps, and the Anthropic API. You&#39;ll work at the intersection of real-time media, low-latency inference, and distributed systems, building infrastructure where every millisecond of latency is felt by the user.</p>\n<p>We partner closely with the Audio research team, who train the speech understanding and generation models, and with product teams shipping voice experiences to users. Your job is to make those models fast, reliable, and delightful to talk to at scale.</p>\n<p>Responsibilities:</p>\n<ul>\n<li>Design and build the real-time streaming infrastructure that powers voice conversations with Claude: ingesting microphone audio, orchestrating model inference, and streaming synthesized speech back with minimal latency</li>\n<li>Build low-latency serving systems for speech models, optimizing time-to-first-audio and end-to-end conversational responsiveness</li>\n<li>Develop the public and internal APIs that expose voice capabilities to Claude.ai, mobile clients, and third-party developers</li>\n<li>Own the audio transport layer (codecs, jitter buffers, adaptive bitrate, packet loss recovery) so conversations stay smooth across unreliable networks</li>\n<li>Build observability and quality-measurement systems for voice: latency distributions, audio quality metrics, interruption handling, and turn-taking accuracy</li>\n<li>Partner with Audio research to move new model architectures from experiment to production, and feed real-world performance data back into research</li>\n<li>Collaborate with mobile and product engineering on client-side audio capture, playback, and the end-to-end user experience</li>\n</ul>\n<p>You may be a good fit if you</p>\n<ul>\n<li>Have 6+ years of experience building distributed systems, real-time infrastructure, or platform services at scale</li>\n<li>Have shipped production systems where latency is measured in tens of milliseconds and users notice when you miss</li>\n<li>Are comfortable working across the stack, from transport protocols and serving infrastructure up to the APIs product teams build on</li>\n<li>Are results-oriented, with a bias toward flexibility and impact</li>\n<li>Pick up slack, even if it goes outside your job description</li>\n<li>Enjoy pair programming (we love to pair!)</li>\n<li>Care about the societal impacts of voice AI and want to help shape how these systems are developed responsibly</li>\n<li>Are comfortable with ambiguity: voice is a fast-moving space, and you&#39;ll help define the architecture as we learn what works</li>\n</ul>\n<p>Strong candidates may also have experience with</p>\n<ul>\n<li>Real-time media protocols and stacks: WebRTC, RTP, gRPC bidirectional streaming, or WebSockets at scale</li>\n<li>Audio engineering fundamentals: codecs (Opus, AAC), voice activity detection, echo cancellation, jitter buffering, or audio DSP</li>\n<li>Low-latency ML inference serving, streaming model outputs, or GPU-based serving infrastructure</li>\n<li>Telephony, live streaming, video conferencing, or voice assistant platforms</li>\n<li>Mobile audio pipelines on iOS (AVAudioEngine, AudioUnits) or Android (Oboe, AAudio)</li>\n<li>Working alongside ML researchers to productionize models; speech experience is a plus but not required</li>\n</ul>\n<p>Representative projects</p>\n<ul>\n<li>Driving time-to-first-audio below human perceptual thresholds by co-designing the serving pipeline with the Audio research team</li>\n<li>Building a streaming inference orchestrator that interleaves speech recognition, LLM reasoning, and speech synthesis with overlapping execution</li>\n<li>Designing the voice mode API surface for the Anthropic API so developers can build their own voice agents on Claude</li>\n<li>Implementing graceful barge-in and interruption handling so users can cut Claude off mid-sentence naturally</li>\n<li>Instrumenting end-to-end audio quality metrics and building dashboards that catch regressions before users do</li>\n</ul>","url":"https://yubhub.co/jobs/job_ce9f3d34-c8a","directApply":true,"hiringOrganization":{"@type":"Organization","name":"Anthropic","sameAs":"https://www.anthropic.com/","logo":"https://logos.yubhub.co/anthropic.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/anthropic/jobs/5172245008","x-work-arrangement":"hybrid","x-experience-level":"senior","x-job-type":"full-time","x-salary-range":"$320,000-$485,000 USD","x-skills-required":["Real-time media protocols and stacks","Audio engineering fundamentals","Low-latency ML inference serving","Distributed systems","API design"],"x-skills-preferred":["WebRTC","RTP","gRPC bidirectional streaming","WebSockets","Opus","AAC","voice activity detection","echo cancellation","jitter buffering","audio DSP","GPU-based serving infrastructure","telephony","live streaming","video conferencing","voice assistant platforms","mobile audio pipelines on iOS","Android","pair programming"],"datePosted":"2026-04-18T15:55:09.622Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"San Francisco, CA | New York City, NY | Seattle, WA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Real-time media protocols and stacks, Audio engineering fundamentals, Low-latency ML inference serving, Distributed systems, API design, WebRTC, RTP, gRPC bidirectional streaming, WebSockets, Opus, AAC, voice activity detection, echo cancellation, jitter buffering, audio DSP, GPU-based serving infrastructure, telephony, live streaming, video conferencing, voice assistant platforms, mobile audio pipelines on iOS, Android, pair programming","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":320000,"maxValue":485000,"unitText":"YEAR"}}},{"@context":"https://schema.org","@type":"JobPosting","identifier":{"@type":"PropertyValue","name":"YubHub","value":"job_3ac0b2f4-6c9"},"title":"Member of Technical Staff - Imagine Product","description":"<p><strong>About the Role</strong></p>\n<p>The Imagine Product team is redefining AI-driven media experiences for Grok users worldwide. You&#39;ll build and scale robust, high-performance systems that power immersive, multi-modal media interactions, leveraging cutting-edge AI to enable seamless generation, processing, and delivery of images, video, audio, and beyond.</p>\n<p>Your work will drive engaging, real-time user experiences that captivate and delight millions, turning advanced multimodal models into production-grade features.
If you&#39;re a driven problem-solver passionate about AI, media technologies, and creating scalable solutions that shape the future of consumer AI, this is your opportunity to make a lasting impact.</p>\n<p><strong>Responsibilities</strong></p>\n<ul>\n<li>Design and implement scalable systems to support Grok&#39;s AI-driven media experiences, ensuring high performance, reliability, and low latency at global scale.</li>\n<li>Architect robust infrastructure for real-time multi-modal interactions, including handling generation requests, media processing, and seamless integration with frontend and model serving layers.</li>\n<li>Build and optimise large-scale data pipelines to ingest, process, and analyse multi-modal data (images, video, audio), fueling continuous improvement and personalisation of Grok&#39;s media capabilities.</li>\n<li>Collaborate closely with frontend engineers, AI researchers, and product teams to deliver captivating, media-rich features and end-to-end user experiences.</li>\n<li>Own full-cycle development of solutions: from system design and prototyping to deployment, monitoring, observability, and iterative refinement.</li>\n<li>Deliver production-ready, maintainable code that powers features reaching hundreds of millions of users.</li>\n</ul>\n<p><strong>Basic Qualifications</strong></p>\n<ul>\n<li>Proficiency in Python or Rust, with a strong track record of writing clean, efficient, maintainable, and scalable code.</li>\n<li>Experience designing and building systems for consumer-facing products, with emphasis on performance, reliability, and handling high-throughput workloads.</li>\n<li>Hands-on expertise in large-scale data infrastructure and pipelines, particularly for multi-modal or media-heavy AI applications.</li>\n<li>Proven ability to deliver robust, production-grade solutions to millions of users while maintaining high standards of quality and uptime.</li>\n<li>Strong problem-solving skills and a passion for turning innovative ideas into high-impact, scalable realities.</li>\n<li>Deep enthusiasm for AI and media technologies, with a commitment to building user-focused products that inspire and engage.</li>\n</ul>\n<p><strong>Preferred Skills and Experience</strong></p>\n<ul>\n<li>Experience with real-time systems, inference serving, or multi-modal data processing at scale.</li>\n<li>Familiarity with distributed systems, containerisation (e.g., Kubernetes), observability tools, or performance tuning for AI workloads.</li>\n<li>Background in AI-driven consumer products or media generation technologies.</li>\n<li>Track record of collaborating across engineering, research, and product teams to ship delightful features quickly.</li>\n</ul>\n<p><strong>Compensation and Benefits</strong></p>\n<p>$180,000 - $440,000 USD</p>\n<p>Base salary is just one part of our total rewards package at xAI, which also includes equity, comprehensive medical, vision, and dental coverage, access to a 401(k) retirement plan, short &amp; long-term disability insurance, life insurance, and various other discounts and perks.</p>","url":"https://yubhub.co/jobs/job_3ac0b2f4-6c9","directApply":true,"hiringOrganization":{"@type":"Organization","name":"xAI","sameAs":"https://xAI.com","logo":"https://logos.yubhub.co/xai.com.png"},"x-apply-url":"https://job-boards.greenhouse.io/xai/jobs/5052027007","x-work-arrangement":"onsite","x-experience-level":"staff","x-job-type":"full-time","x-salary-range":"$180,000 - $440,000 USD","x-skills-required":["Python","Rust","clean, efficient, maintainable, and scalable code","large-scale data infrastructure and pipelines","multi-modal or media-heavy AI applications","production-grade solutions","quality and uptime"],"x-skills-preferred":["real-time systems","inference serving","multi-modal data processing at scale","distributed systems","containerisation","observability tools","performance tuning for AI workloads","AI-driven consumer products","media generation technologies"],"datePosted":"2026-04-18T15:41:51.975Z","jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Palo Alto, CA"}},"employmentType":"FULL_TIME","occupationalCategory":"Engineering","industry":"Technology","skills":"Python, Rust, clean, efficient, maintainable, and scalable code, large-scale data infrastructure and pipelines, multi-modal or media-heavy AI applications, production-grade solutions, quality and uptime, real-time systems, inference serving, multi-modal data processing at scale, distributed systems, containerisation, observability tools, performance tuning for AI workloads, AI-driven consumer products, media generation technologies","baseSalary":{"@type":"MonetaryAmount","currency":"USD","value":{"@type":"QuantitativeValue","minValue":180000,"maxValue":440000,"unitText":"YEAR"}}}]}