DTXT/ref-impl/python/benchmark.py at main · Open-Tech-Foundation/DTXT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import dtxt
try:
    import dtxt_rs
except ImportError:
    dtxt_rs = None
import json
import time
import random
import os

def generate_large_data(count):
    """Build a synthetic benchmark payload holding ``count`` entries.

    Every value is a JSON-native type (int, float, bool, str, None, list
    or dict). Each entry's ``score`` is drawn from ``random.random()``, so
    the payload differs between runs unless the caller seeds ``random``.

    Args:
        count: Number of entries to generate.

    Returns:
        A dict with ``title``, ``description`` and ``entries`` keys, where
        ``entries`` is a list of ``count`` per-record dicts.
    """

    def _make_entry(index):
        # Fresh list/dict objects are created per entry so that no two
        # entries share mutable state.
        return {
            "id": index,
            "uid": f"user-{index}",
            "isActive": index % 2 == 0,
            "score": random.random() * 1000,
            "tags": ["data", "benchmark", "storage", "json", "dtxt"],
            "meta": {
                "level": index % 10,
                "verified": index % 3 == 0,
                "note": None,
                "nested": {
                    "a": 1,
                    "b": False,
                    "c": "nested string",
                },
            },
        }

    return {
        "title": "DTXT vs JSON (JSON-native types only)",
        "description": "Benchmark for base format overhead (unquoted keys, short literals)",
        "entries": [_make_entry(index) for index in range(count)],
    }

# Number of entries generated for the benchmark dataset.
DATASET_SIZE = 30000

def _total_ms(func, payload, iterations):
    """Return the summed wall-clock time, in milliseconds, of ``iterations`` calls to ``func(payload)``."""
    total = 0.0
    for _ in range(iterations):
        start = time.perf_counter()
        func(payload)
        total += (time.perf_counter() - start) * 1000
    return total


def run_benchmark(iterations=5, dataset_size=None):
    """Compare JSON and DTXT on payload size, parsing and serialization time.

    The generated dataset is serialized with both ``json`` and ``dtxt``,
    written to ``../../benchmarks/python`` (relative to the current working
    directory), and the on-disk sizes are reported. Parsing and
    serialization are then timed and the per-call averages printed.

    Args:
        iterations: Number of timed repetitions per measurement (>= 1).
        dataset_size: Number of entries to generate; ``None`` means use the
            module-level ``DATASET_SIZE``.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    if dataset_size is None:
        dataset_size = DATASET_SIZE

    print(f"Generating dataset with {dataset_size} entries (JSON-native types only)...")
    raw_data = generate_large_data(dataset_size)

    # 1. Payload Size Comparison
    json_str = json.dumps(raw_data)
    dtxt_str = dtxt.dumps(raw_data)

    base_path = "../../benchmarks/python"
    # Create the output directory on first run instead of failing in open().
    os.makedirs(base_path, exist_ok=True)
    json_path = os.path.join(base_path, "bench_v2.json")
    dtxt_path = os.path.join(base_path, "bench_v2.dtxt")

    # Explicit encoding keeps the measured sizes independent of the
    # platform's default locale encoding.
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(json_str)
    with open(dtxt_path, "w", encoding="utf-8") as f:
        f.write(dtxt_str)

    json_size = os.path.getsize(json_path)
    dtxt_size = os.path.getsize(dtxt_path)

    print("\n--- Payload Size ---")
    print(f"JSON: {json_size / 1024 / 1024:.2f} MB")
    print(f"DTXT:  {dtxt_size / 1024 / 1024:.2f} MB")
    print(f"Reduction: {(1 - dtxt_size / json_size) * 100:.1f}%")

    # 2. Performance Comparison (Time)
    print(f"\n--- Parsing Performance (Average of {iterations} runs) ---")

    json_parse_total = _total_ms(json.loads, json_str, iterations)
    print(f"json.loads:     {json_parse_total / iterations:.2f} ms")

    # Force pure Python for comparison. The override is undone in a
    # ``finally`` so a parser failure cannot leave the module patched.
    original_rs = dtxt.dtxt_rs
    dtxt.dtxt_rs = None
    try:
        pure_python_parse_total = _total_ms(dtxt.loads, dtxt_str, iterations)
    finally:
        dtxt.dtxt_rs = original_rs
    print(f"dtxt.loads (Pure Python): {pure_python_parse_total / iterations:.2f} ms")

    # Only measured when the restored ``dtxt.dtxt_rs`` attribute is truthy.
    if dtxt.dtxt_rs:
        rust_ext_parse_total = _total_ms(dtxt.loads, dtxt_str, iterations)
        print(f"dtxt.loads (Rust Ext):    {rust_ext_parse_total / iterations:.2f} ms")
        print(f"Speedup: {pure_python_parse_total / rust_ext_parse_total:.1f}x")

    print(f"\n--- Serialization Performance (Average of {iterations} runs) ---")

    json_stringify_total = _total_ms(json.dumps, raw_data, iterations)
    print(f"json.dumps:  {json_stringify_total / iterations:.2f} ms")

    dtxt_stringify_total = _total_ms(dtxt.dumps, raw_data, iterations)
    print(f"dtxt.dumps:   {dtxt_stringify_total / iterations:.2f} ms")

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_benchmark()