Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the

## Status

Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
Initial implementation complete: scalar, AVX2/PCLMUL, and ARM64 NEON/PMULL structural scanners (runtime-dispatched); root-path and cursor APIs; escape-decoded strings; integer/float/bool/typeof/len accessors; FFI panic barrier; and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson on x86_64; ARM64 NEON/PMULL is correctness-tested via the scanner cross-check suite, with a parse + access benchmark on Apple M4 reported in `docs/benchmarks.md` (cjson comparison only).

## Building

Expand Down
152 changes: 152 additions & 0 deletions benches/arm_bench.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
-- ARM64 NEON benchmark: qjson vs lua-cjson (parse + access only)
-- Run from worktree root:
-- LUA_PATH='./lua/?.lua;;' DYLD_LIBRARY_PATH=./target/release \
-- luajit benches/arm_bench.lua
--
-- qjson loads its native lib via `ffi.load` (honors DYLD_LIBRARY_PATH /
-- LD_LIBRARY_PATH), so only cjson needs a package.cpath entry below.

package.cpath = "./vendor/lua-cjson/?.so;" .. package.cpath

local qjson = require("qjson")
local cjson = require("cjson")
--- Build the 64 KiB pseudo-random base64 block used as image filler.
-- The characters come from a fixed-seed multiplicative congruential
-- generator (multiplier 48271, modulus 2^31 - 1), so every run produces the
-- exact same block.
-- @treturn string 65536 characters drawn from the base64 alphabet
local function make_b64_block()
  local alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  local block_size = 64 * 1024
  local state = 12345
  local pieces = {}
  local filled = 0
  while filled < block_size do
    state = (state * 48271) % 2147483647
    -- Map the generator state onto a 1-based position in the alphabet.
    local pos = (state % 64) + 1
    filled = filled + 1
    pieces[filled] = string.sub(alphabet, pos, pos)
  end
  return table.concat(pieces)
end

-- The filler block is generated once at load time; make_b64 slices or tiles it.
local B64_BLOCK = make_b64_block()
local B64_BLOCK_LEN = #B64_BLOCK

--- Return `size` characters of base64 filler.
-- Sizes up to one block are served as a prefix of the block; larger sizes
-- repeat the block enough times to cover `size` and then trim the excess.
-- @tparam number size requested length in bytes
-- @treturn string filler of the requested length
local function make_b64(size)
  local source
  if size <= B64_BLOCK_LEN then
    source = B64_BLOCK
  else
    local copies = math.ceil(size / B64_BLOCK_LEN)
    source = string.rep(B64_BLOCK, copies)
  end
  return string.sub(source, 1, size)
end

--- Build a synthetic multimodal chat request of roughly `target_bytes` bytes.
-- The document has the shape
--   {"model":...,"temperature":0.7,"messages":[{role, content:[text, image]}, ...]}
-- with one message per started MiB of `target_bytes` (at least one). Roles
-- alternate "user" / "assistant", starting with "user". Each message carries
-- a 256-byte text part and a base64 `image_url` part that absorbs the rest
-- of the size budget.
--
-- The size is approximate: the per-message overhead is estimated with the
-- "user" role and ignores the commas between messages, and the image part is
-- never smaller than 1024 bytes, so very small targets are exceeded.
-- @tparam number target_bytes desired payload size in bytes
-- @treturn string the JSON document
local function make_payload(target_bytes)
  local message_count = math.max(1, math.ceil(target_bytes / (1024 * 1024)))

  -- The envelope is split in two so the same literals serve both the size
  -- estimate and the final assembly.
  local envelope_head = '{"model":"gpt-4-vision","temperature":0.7,"messages":['
  local envelope_tail = ']}'

  local text_part = '{"type":"text","text":"' .. string.rep("Q", 256) .. '"}'
  local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
  local image_suffix = '"}}'

  local message_overhead = #('{"role":"user","content":[,]}') + #text_part
    + #image_prefix + #image_suffix
  local remaining = target_bytes - (#envelope_head + #envelope_tail)
    - (message_count * message_overhead)
  local image_size = math.max(1024, math.floor(remaining / message_count))

  -- Everything after the role is identical for every message, so build it
  -- once instead of re-concatenating the (potentially MiB-sized) image per
  -- message.
  local content_tail = '","content":[' .. text_part .. ","
    .. image_prefix .. make_b64(image_size) .. image_suffix .. ']}'

  -- Only two distinct message strings exist (one per role); cache them.
  local message_by_role = {}
  local messages = {}
  for i = 1, message_count do
    local role = i % 2 == 1 and "user" or "assistant"
    local message = message_by_role[role]
    if not message then
      message = '{"role":"' .. role .. content_tail
      message_by_role[role] = message
    end
    messages[i] = message
  end

  return envelope_head .. table.concat(messages, ",") .. envelope_tail
end

-- Number of timed rounds per measurement; the median round is reported.
local ROUNDS = 5

--- Measure the throughput of `fn` in operations per second.
-- Runs an untimed warm-up (a fifth of `iters`, but at least 50 calls), forces
-- a full garbage collection, then times ROUNDS rounds of `iters` calls each
-- using os.clock and returns the median rate.
-- @tparam string name label for the measurement (currently unused)
-- @tparam number iters calls per timed round
-- @tparam function fn zero-argument workload
-- @treturn number median ops/s across the rounds
local function bench(name, iters, fn)
  local warmup_calls = math.max(50, math.floor(iters / 5))
  for _ = 1, warmup_calls do fn() end

  -- Start the timed rounds from a clean heap.
  collectgarbage("collect")

  local rates = {}
  for round = 1, ROUNDS do
    local started = os.clock()
    for _ = 1, iters do fn() end
    local elapsed = os.clock() - started
    rates[round] = iters / elapsed
  end

  table.sort(rates)
  local median_index = math.ceil(ROUNDS / 2)
  return rates[median_index]
end

-- Memoized path lists keyed by message count, so the path strings are not
-- rebuilt on every call.
local content_paths_cache = {}

--- Return the paths "messages[0].content" .. "messages[n-1].content".
-- The returned table is cached and shared between calls with the same `n`.
-- @tparam number n number of messages
-- @treturn table array of `n` path strings (zero-based indices in the path)
local function content_paths(n)
  local cached = content_paths_cache[n]
  if cached then
    return cached
  end

  local list = {}
  for slot = 1, n do
    list[slot] = "messages[" .. (slot - 1) .. "].content"
  end
  content_paths_cache[n] = list
  return list
end

-- Benchmark matrix. `target` is the approximate payload size in bytes passed
-- to make_payload; `iters` is the number of calls per timed round.
local scenarios = {
  { name = "small",  target = 2 * 1024,         iters = 5000 },
  { name = "medium", target = 60 * 1024,        iters = 500 },
  { name = "100k",   target = 100 * 1024,       iters = 200 },
  { name = "1m",     target = 1024 * 1024,      iters = 50 },
  { name = "10m",    target = 10 * 1024 * 1024, iters = 5 },
}

-- Build every payload up front, keyed by scenario name, printing progress as
-- each one completes.
io.write("Generating payloads...")
io.flush()
local payloads = {}
for idx = 1, #scenarios do
  local scenario = scenarios[idx]
  payloads[scenario.name] = make_payload(scenario.target)
  io.write(" ", scenario.name)
  io.flush()
end
print(" done.")
print("")

-- Results table: one row per scenario with the median ops/s of each decoder
-- and the qjson/cjson ratio.
local header_fmt = "%-10s %-10s %-12s %-12s %-10s"
local row_fmt = "%-10s %-10s %-12.0f %-12.0f %-10.1fx"

print(header_fmt:format("Scenario", "Size", "cjson", "qjson.parse", "speedup"))
print(("-"):rep(58))

for idx = 1, #scenarios do
  local scenario = scenarios[idx]
  local payload = payloads[scenario.name]

  -- Human-readable size: KB below 1 MiB, MB from there on.
  local kib = #payload / 1024
  local size_label
  if kib < 1024 then
    size_label = string.format("%.0f KB", kib)
  else
    size_label = string.format("%.1f MB", kib / 1024)
  end

  -- cjson workload: decode the document, then read the model, the
  -- temperature and each message's content.
  local function cjson_workload()
    local decoded = cjson.decode(payload)
    local _ = decoded.model
    local _ = decoded.temperature
    if decoded.messages then
      for _, message in ipairs(decoded.messages) do
        local _ = message.content
      end
    end
  end

  -- qjson workload: parse, read the same two scalars by path, then query the
  -- type of each message's content through the precomputed path strings.
  local function qjson_workload()
    local document = qjson.parse(payload)
    local _ = document:get_str("model")
    local _ = document:get_f64("temperature")
    local count = document:len("messages") or 0
    local paths = content_paths(count)
    for i = 1, count do
      local _ = document:typeof(paths[i])
    end
  end

  local cjson_ops = bench("cjson " .. scenario.name, scenario.iters, cjson_workload)
  local qjson_ops = bench("qjson " .. scenario.name, scenario.iters, qjson_workload)

  print(row_fmt:format(scenario.name, size_label, cjson_ops, qjson_ops,
    qjson_ops / cjson_ops))
end
46 changes: 40 additions & 6 deletions docs/benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ Lua-table baselines.
| `lua-cjson` | vendored `openresty/lua-cjson` |
| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib |

> **Platform scope:** x86_64 benchmarks include simdjson and modify+encode
> scenarios. ARM64 NEON benchmarks cover parse + access (cjson comparison
> only).

## Methodology

The harness lives at `benches/lua_bench.lua`. For each scenario:
Expand Down Expand Up @@ -87,7 +91,7 @@ harness prints a skip message and omits the simdjson rows.

Numbers below come from one such run.

## Results — throughput (median ops/s)
## Results — throughput (x86_64, median ops/s)

Each row is "parse + access request fields" on the named payload.

Expand All @@ -105,6 +109,29 @@ Each row is "parse + access request fields" on the named payload.
| 10m | 10.00 MB | 51 | 363 | 1,830 | 1,783 | 1,749 |
| interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 |

## Results — throughput (ARM64 NEON, median ops/s)

Each row is "parse + access request fields" on the named payload. The workload
is the same as in the x86_64 table above, with `simdjson` omitted. Numbers below
come from a single run on Apple M4.

| Scenario | Size | cjson | `qjson.parse` | speedup vs. cjson |
|---|---:|---:|---:|---:|
| small | 2 KB | 493,827 | 906,618 | 1.8× |
| medium | 60 KB | 24,847 | 215,146 | 8.7× |
| 100k | 100 KB | 15,475 | 146,413 | 9.5× |
| 1m | 1.0 MB | 1,468 | 20,251 | 13.8× |
| 10m | 10.0 MB | 150 | 2,058 | 13.8× |

> **Environment:** Apple M4 (ARM64), 16 GB, macOS 15.x. LuaJIT 2.1.1774896198
> (Homebrew). `qjson` release build, NEON + PMULL scanner active.
> `lua-cjson` from vendored `openresty/lua-cjson`. Reproduce with:
> ```sh
> cargo build --release
> LUA_PATH='./lua/?.lua;;' DYLD_LIBRARY_PATH=./target/release \
> luajit benches/arm_bench.lua
> ```

### Modify + encode throughput (PR #54)

One-shot modify-then-encode benchmarks. Exercises the decode → mutate →
Expand All @@ -128,7 +155,7 @@ fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2).
For a before/after comparison against the pre-#54 baseline, see the
[PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361).

### Speed-up vs. baselines
### Speed-up vs. baselines (x86_64)

| Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
|---|---:|---:|---:|---:|
Expand Down Expand Up @@ -204,10 +231,17 @@ key into the Lua table heap.
redundant tree walks and array/object re-scans inside the encoder.
Large payloads (≥5 MB) are dominated by the root-container
materialization cost, which copies all fields into a plain table.
8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
interference between payload sizes. Each size now runs in its own
`resty` process, eliminating the systemic cross-scenario variance
observed in earlier benchmark runs.
8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
interference between payload sizes. Each size now runs in its own
`resty` process, eliminating the systemic cross-scenario variance
observed in earlier benchmark runs.
9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
Comment on lines +234 to +239
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Fix ordered-list indentation to satisfy markdownlint (MD005).

Line 234 and Line 238 are indented one space more than other top-level list items; this triggers the lint warning and can cause inconsistent rendering.

Suggested fix
- 8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
-    interference between payload sizes. Each size now runs in its own
-    `resty` process, eliminating the systemic cross-scenario variance
-    observed in earlier benchmark runs.
- 9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
-    workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
-    x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
-    because cjson runs faster on ARM64 hardware (JIT-compiled scalar code
-    benefits from wider out-of-order execution on M4). The absolute
-    `qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
-    ~84k on the x86_64 Zen 2.
+8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
+   interference between payload sizes. Each size now runs in its own
+   `resty` process, eliminating the systemic cross-scenario variance
+   observed in earlier benchmark runs.
+9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
+   workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
+   x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
+   because cjson runs faster on ARM64 hardware (JIT-compiled scalar code
+   benefits from wider out-of-order execution on M4). The absolute
+   `qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
+   ~84k on the x86_64 Zen 2.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
interference between payload sizes. Each size now runs in its own
`resty` process, eliminating the systemic cross-scenario variance
observed in earlier benchmark runs.
9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
interference between payload sizes. Each size now runs in its own
`resty` process, eliminating the systemic cross-scenario variance
observed in earlier benchmark runs.
9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
because cjson runs faster on ARM64 hardware (JIT-compiled scalar code
benefits from wider out-of-order execution on M4). The absolute
`qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
~84k on the x86_64 Zen 2.
🧰 Tools
🪛 markdownlint-cli2 (0.22.1)

[warning] 234-234: Inconsistent indentation for list items at the same level
Expected: 0; Actual: 1

(MD005, list-indent)


[warning] 238-238: Inconsistent indentation for list items at the same level
Expected: 0; Actual: 1

(MD005, list-indent)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/benchmarks.md` around lines 234 - 239, The ordered-list items "8.
**Fresh-process isolation**" and "9. **ARM64 NEON delivers 1.8–13.8× over
cjson**" are indented one space too many; remove the extra leading space so
these top-level list lines align with the other numbered items (no additional
indent) to satisfy markdownlint MD005 and ensure consistent rendering.

x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
because cjson runs faster on ARM64 hardware (its scalar C decoder
benefits from wider out-of-order execution on M4). The absolute
`qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
~84k on the x86_64 Zen 2.

## When to pick which

Expand Down
Loading