api7 · membphis · May 26, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the
 
 ## Status
 
-Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
+Initial implementation complete: scalar, AVX2/PCLMUL, and ARM64 NEON/PMULL structural scanners (runtime-dispatched); root-path and cursor APIs; escape-decoded strings; integer/float/bool/typeof/len accessors; FFI panic barrier; and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson on x86_64; ARM64 NEON/PMULL is correctness-tested via the scanner cross-check suite, with a parse + access benchmark on Apple M4 reported in `docs/benchmarks.md` (cjson comparison only).
 
 ## Building
 

diff --git a/benches/arm_bench.lua b/benches/arm_bench.lua
@@ -0,0 +1,152 @@
+-- ARM64 NEON benchmark: qjson vs lua-cjson (parse + access only)
+-- Run from worktree root:
+--   LUA_PATH='./lua/?.lua;;' DYLD_LIBRARY_PATH=./target/release \
+--     luajit benches/arm_bench.lua
+--
+-- qjson loads its native lib via `ffi.load` (honors DYLD_LIBRARY_PATH /
+-- LD_LIBRARY_PATH), so only cjson needs a package.cpath entry below.
+
+package.cpath = "./vendor/lua-cjson/?.so;" .. package.cpath
+
+local qjson  = require("qjson")
+local cjson  = require("cjson")
+-- Build a deterministic 64 KiB string of base64-alphabet characters.
+-- Each character is picked by a multiplicative congruential generator
+-- (multiplier 48271, modulus 2147483647) started from a fixed seed, so every
+-- call returns the same string.
+local function make_b64_block()
+    local alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+    local total = 64 * 1024
+    local state = 12345
+    local pieces = {}
+    local filled = 0
+    while filled < total do
+        state = (state * 48271) % 2147483647
+        local pos = state % 64 + 1
+        filled = filled + 1
+        pieces[filled] = string.sub(alphabet, pos, pos)
+    end
+    return table.concat(pieces)
+end
+
+-- Shared filler block, generated once at load time, and its byte length.
+local B64_BLOCK = make_b64_block()
+local B64_BLOCK_LEN = #B64_BLOCK
+
+-- Return the first `size` bytes of B64_BLOCK repeated end to end.
+-- Requests that fit within one block are sliced directly; larger ones repeat
+-- the block enough times to cover `size` and then trim the excess.
+local function make_b64(size)
+    if size <= B64_BLOCK_LEN then
+        return string.sub(B64_BLOCK, 1, size)
+    end
+    local copies = math.ceil(size / B64_BLOCK_LEN)
+    local repeated = string.rep(B64_BLOCK, copies)
+    return string.sub(repeated, 1, size)
+end
+
+-- Build a synthetic chat-completion style JSON payload of roughly
+-- `target_bytes` bytes.
+--
+-- The payload holds one message per started MiB of `target_bytes` (at least
+-- one). Each message carries a 256-byte text part and an image_url part whose
+-- data URL is padded with filler from make_b64() so the total lands near the
+-- target. The filler is never shorter than 1024 bytes, so small targets can
+-- overshoot. Roles alternate, starting with "user".
+--
+-- @param target_bytes  desired payload size in bytes (approximate)
+-- @return the payload as a JSON string
+local function make_payload(target_bytes)
+    -- The envelope is split into head/tail so the size estimate and the
+    -- returned string are built from the same literals.
+    local head = '{"model":"gpt-4-vision","temperature":0.7,"messages":['
+    local tail = ']}'
+    local message_count = math.max(1, math.ceil(target_bytes / (1024 * 1024)))
+    local text_part = '{"type":"text","text":"' .. string.rep("Q", 256) .. '"}'
+    local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
+    local image_suffix = '"}}'
+
+    -- Fixed bytes per message besides the image filler (sized for "user").
+    local message_overhead = #('{"role":"user","content":[,]}') + #text_part
+        + #image_prefix + #image_suffix
+    local remaining = target_bytes - (#head + #tail)
+        - (message_count * message_overhead)
+    local image_size = math.max(1024, math.floor(remaining / message_count))
+
+    -- Every message shares the same content array and only the role differs,
+    -- so build the filler and the content tail once instead of per message.
+    local content_tail = '","content":[' .. text_part .. ","
+        .. image_prefix .. make_b64(image_size) .. image_suffix .. ']}'
+
+    local messages = {}
+    for i = 1, message_count do
+        local role = i % 2 == 1 and "user" or "assistant"
+        messages[i] = '{"role":"' .. role .. content_tail
+    end
+
+    return head .. table.concat(messages, ",") .. tail
+end
+
+-- Number of timed rounds per benchmark; the median round is reported.
+local ROUNDS = 5
+
+-- Time `fn` and return its median throughput in calls per os.clock() second.
+-- Runs max(50, iters / 5) untimed warm-up calls and a full GC cycle, then
+-- ROUNDS timed rounds of `iters` calls each.
+-- `name` is accepted so call sites can label the run; it is not used here.
+local function bench(name, iters, fn)
+    local warmup_calls = math.max(50, math.floor(iters / 5))
+    local done = 0
+    while done < warmup_calls do
+        fn()
+        done = done + 1
+    end
+
+    collectgarbage("collect")
+
+    local rates = {}
+    for round = 1, ROUNDS do
+        local started = os.clock()
+        for _ = 1, iters do fn() end
+        local elapsed = os.clock() - started
+        rates[round] = iters / elapsed
+    end
+
+    -- Sorted ascending, the middle entry is the median round.
+    table.sort(rates)
+    local middle = math.ceil(ROUNDS / 2)
+    return rates[middle]
+end
+
+-- Path lists already built, keyed by message count.
+local content_paths_cache = {}
+
+-- Return the list {"messages[0].content", ..., "messages[n-1].content"}.
+-- The list is built once per distinct `n`; later calls with the same `n`
+-- return the same table.
+local function content_paths(n)
+    local cached = content_paths_cache[n]
+    if not cached then
+        cached = {}
+        for i = 0, n - 1 do
+            cached[#cached + 1] = "messages[" .. i .. "].content"
+        end
+        content_paths_cache[n] = cached
+    end
+    return cached
+end
+
+-- Benchmark scenarios: display name, target payload size in bytes, and the
+-- number of timed iterations per round.
+local scenarios = {
+    { name = "small",  target = 2 * 1024,         iters = 5000 },
+    { name = "medium", target = 60 * 1024,        iters = 500 },
+    { name = "100k",   target = 100 * 1024,       iters = 200 },
+    { name = "1m",     target = 1024 * 1024,      iters = 50 },
+    { name = "10m",    target = 10 * 1024 * 1024, iters = 5 },
+}
+
+-- Build every payload before any timing starts, reporting progress as each
+-- one completes.
+io.write("Generating payloads...")
+io.flush()
+local payloads = {}
+for idx = 1, #scenarios do
+    local scenario = scenarios[idx]
+    payloads[scenario.name] = make_payload(scenario.target)
+    io.write(" ", scenario.name)
+    io.flush()
+end
+print(" done.")
+print("")
+
+-- Results table: one row per scenario with the median ops/s of each decoder
+-- and the qjson/cjson ratio.
+local header_fmt = "%-10s %-10s %-12s %-12s %-10s"
+print(string.format(header_fmt, "Scenario", "Size", "cjson", "qjson.parse", "speedup"))
+print(string.rep("-", 58))
+
+for _, s in ipairs(scenarios) do
+    local payload = payloads[s.name]
+    local size_kb = #payload / 1024
+    local size_label
+    if size_kb >= 1024 then
+        size_label = string.format("%.1f MB", size_kb / 1024)
+    else
+        size_label = string.format("%.0f KB", size_kb)
+    end
+
+    -- cjson: decode the whole document into Lua tables, then read the
+    -- top-level fields and each message's content.
+    local cjson_ops = bench("cjson " .. s.name, s.iters, function()
+        local obj = cjson.decode(payload)
+        local _ = obj.model
+        local _ = obj.temperature
+        if obj.messages then
+            for _, msg in ipairs(obj.messages) do
+                local _ = msg.content
+            end
+        end
+    end)
+
+    -- qjson: parse, read the same top-level fields by path, then query the
+    -- type of each message's content via precomputed path strings.
+    local qjson_ops = bench("qjson " .. s.name, s.iters, function()
+        local doc = qjson.parse(payload)
+        local _ = doc:get_str("model")
+        local _ = doc:get_f64("temperature")
+        local n = doc:len("messages") or 0
+        local paths = content_paths(n)
+        for i = 1, n do
+            local _ = doc:typeof(paths[i])
+        end
+    end)
+
+    -- Attach the "x" suffix before padding: "%-10.1fx" pads the number to ten
+    -- columns first and prints e.g. "9.5       x".
+    local speedup = string.format("%.1fx", qjson_ops / cjson_ops)
+    print(string.format("%-10s %-10s %-12.0f %-12.0f %-10s",
+        s.name, size_label, cjson_ops, qjson_ops, speedup))
+end
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
@@ -22,6 +22,10 @@ Lua-table baselines.
 | `lua-cjson` | vendored `openresty/lua-cjson` |
 | `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib |
 
+> **Platform scope:** x86_64 benchmarks include simdjson and modify+encode
+> scenarios. ARM64 NEON benchmarks cover parse + access (cjson comparison
+> only).
+
 ## Methodology
 
 The harness lives at `benches/lua_bench.lua`. For each scenario:
@@ -87,7 +91,7 @@ harness prints a skip message and omits the simdjson rows.
 
 Numbers below come from one such run.
 
-## Results — throughput (median ops/s)
+## Results — throughput (x86_64, median ops/s)
 
 Each row is "parse + access request fields" on the named payload.
 
@@ -105,6 +109,29 @@ Each row is "parse + access request fields" on the named payload.
 | 10m        | 10.00 MB |      51 |     363 |   1,830 |   1,783 |   1,749 |
 | interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 |
 
+## Results — throughput (ARM64 NEON, median ops/s)
+
+Each row is "parse + access request fields" on the named payload: the same
+workload as the x86_64 table above, with `simdjson` omitted. Numbers below
+come from a single `benches/arm_bench.lua` run on Apple M4.
+
+| Scenario | Size | cjson | `qjson.parse` | speedup vs. cjson |
+|---|---:|---:|---:|---:|
+| small   |   2 KB  | 493,827 | 906,618 |  1.8× |
+| medium  |  60 KB  |  24,847 | 215,146 |  8.7× |
+| 100k    | 100 KB  |  15,475 | 146,413 |  9.5× |
+| 1m      | 1.0 MB  |   1,468 |  20,251 | 13.8× |
+| 10m     | 10.0 MB |     150 |   2,058 | 13.8× |
+
+> **Environment:** Apple M4 (ARM64), 16 GB, macOS 15.x. LuaJIT 2.1.1774896198
+> (Homebrew). `qjson` release build, NEON + PMULL scanner active.
+> `lua-cjson` from vendored `openresty/lua-cjson`. Reproduce with:
+> ```sh
+> cargo build --release
+> LUA_PATH='./lua/?.lua;;' DYLD_LIBRARY_PATH=./target/release \
+>   luajit benches/arm_bench.lua
+> ```
+
 ### Modify + encode throughput (PR #54)
 
 One-shot modify-then-encode benchmarks. Exercises the decode → mutate →
@@ -128,7 +155,7 @@ fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2).
 For a before/after comparison against the pre-#54 baseline, see the
 [PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361).
 
-### Speed-up vs. baselines
+### Speed-up vs. baselines (x86_64)
 
 | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
 |---|---:|---:|---:|---:|
@@ -204,10 +231,17 @@ key into the Lua table heap.
    redundant tree walks and array/object re-scans inside the encoder.
    Large payloads (≥5 MB) are dominated by the root-container
    materialization cost, which copies all fields into a plain table.
-8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
-   interference between payload sizes. Each size now runs in its own
-   `resty` process, eliminating the systemic cross-scenario variance
-   observed in earlier benchmark runs.
+ 8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
+    interference between payload sizes. Each size now runs in its own
+    `resty` process, eliminating the systemic cross-scenario variance
+    observed in earlier benchmark runs.
+ 9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
+    workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
- 8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
-    interference between payload sizes. Each size now runs in its own
-    `resty` process, eliminating the systemic cross-scenario variance
-    observed in earlier benchmark runs.
- 9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
-    workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
+8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
+   interference between payload sizes. Each size now runs in its own
+   `resty` process, eliminating the systemic cross-scenario variance
+   observed in earlier benchmark runs.
+9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
+   workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
+   x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
+   because cjson runs faster on ARM64 hardware (its scalar C parser
+   benefits from wider out-of-order execution on M4). The absolute
+   `qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
+   ~84k on the x86_64 Zen 2.
- 8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
-    interference between payload sizes. Each size now runs in its own
-    `resty` process, eliminating the systemic cross-scenario variance
-    observed in earlier benchmark runs.
- 9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
-    workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
+8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
+   interference between payload sizes. Each size now runs in its own
+   `resty` process, eliminating the systemic cross-scenario variance
+   observed in earlier benchmark runs.
+9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
+   workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
+   x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
+   because cjson runs faster on ARM64 hardware (its scalar C parser
+   benefits from wider out-of-order execution on M4). The absolute
+   `qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
+   ~84k on the x86_64 Zen 2.
+    x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
+    because cjson runs faster on ARM64 hardware (its scalar C parser
+    benefits from wider out-of-order execution on M4). The absolute
+    `qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
+    ~84k on the x86_64 Zen 2.
 
 ## When to pick which