<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <!-- Feed-level metadata: identity, title and last-updated timestamp first,
       then the author, the HTML (alternate) and Atom (self) links, and the
       generator that produced this document. -->
  <id>https://www.akselwithai.xyz/</id>
  <title>Aksel with AI</title>
  <updated>2026-04-17T12:00:00.000Z</updated>
  <author>
    <name>Aksel Aghajanyan</name>
  </author>
  <link rel="alternate" type="text/html" href="https://www.akselwithai.xyz/"/>
  <link rel="self" type="application/atom+xml" href="https://www.akselwithai.xyz/feed.xml"/>
  <generator uri="https://www.akselwithai.xyz" version="1.0">build-rss</generator>
  <!-- Entry "gpu-memory": id and alternate link share the same fragment URL;
       published and updated carry the same timestamp. -->
  <entry>
    <title type="text">GPU Memory Bottlenecks in Large Language Model Inference: Understanding the Real Limits of Real-Time AI</title>
    <summary type="text">KV cache, memory bandwidth vs capacity, and mitigation strategies for real-time LLM serving — full text below; PDF still available.</summary>

    <!-- Identity and location -->
    <id>https://www.akselwithai.xyz/#gpu-memory</id>
    <link rel="alternate" type="text/html" href="https://www.akselwithai.xyz/#gpu-memory"/>

    <!-- Timestamps (UTC) -->
    <published>2026-04-17T12:00:00.000Z</published>
    <updated>2026-04-17T12:00:00.000Z</updated>

    <!-- Topic tags -->
    <category term="gpu"/>
    <category term="kv-cache"/>
    <category term="inference"/>
    <category term="llm"/>
  </entry>
  <!-- Entry "serving-notes": id and alternate link share the same fragment URL;
       published and updated carry the same timestamp. -->
  <entry>
    <title type="text">Serving LLMs in Production: Latency, Batching, and a Few Lines of Python</title>
    <summary type="text">Part 2 of the inference mini-series: practical knobs for batching, streaming, and measuring what users actually feel.</summary>

    <!-- Identity and location -->
    <id>https://www.akselwithai.xyz/#serving-notes</id>
    <link rel="alternate" type="text/html" href="https://www.akselwithai.xyz/#serving-notes"/>

    <!-- Timestamps (UTC) -->
    <published>2026-04-10T12:00:00.000Z</published>
    <updated>2026-04-10T12:00:00.000Z</updated>

    <!-- Topic tags -->
    <category term="inference"/>
    <category term="latency"/>
    <category term="python"/>
  </entry>
</feed>
