Roberto Tacconelli commited on
Commit
a150be5
·
unverified ·
1 Parent(s): 54b699e

Add files via upload

Browse files
Files changed (11) hide show
  1. Makefile +24 -0
  2. ablation.c +377 -0
  3. arith.h +171 -0
  4. delta_vs_noise.c +226 -0
  5. fastmath.h +88 -0
  6. highctx.h +208 -0
  7. match.h +234 -0
  8. mdc.c +295 -0
  9. ppm.h +198 -0
  10. tweedie.h +280 -0
  11. word.h +443 -0
Makefile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build rules for the compressor (mdc) and its experiment drivers.
# All model code lives in headers, so every binary depends on $(HEADERS).
CC = gcc
CFLAGS = -O3 -march=native -Wall -Wextra
LDFLAGS = -lm

HEADERS = arith.h ppm.h tweedie.h match.h word.h highctx.h fastmath.h

all: mdc ablation

mdc: mdc.c $(HEADERS)
	$(CC) $(CFLAGS) -o mdc mdc.c $(LDFLAGS)

ablation: ablation.c $(HEADERS)
	$(CC) $(CFLAGS) -o ablation ablation.c $(LDFLAGS)

# Experiment driver shipped alongside ablation.c; built on request only,
# it is intentionally not part of `all`.
delta_vs_noise: delta_vs_noise.c $(HEADERS)
	$(CC) $(CFLAGS) -o delta_vs_noise delta_vs_noise.c $(LDFLAGS)

test_arith: test_arith.c arith.h
	$(CC) $(CFLAGS) -o test_arith test_arith.c $(LDFLAGS)

test_ppm: test_ppm.c arith.h ppm.h
	$(CC) $(CFLAGS) -o test_ppm test_ppm.c $(LDFLAGS)

clean:
	rm -f mdc ablation delta_vs_noise bench test_arith test_ppm

.PHONY: all clean
ablation.c ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Ablation study — C implementation
3
+ * Measures incremental contribution of each pipeline layer.
4
+ *
5
+ * Usage:
6
+ * ./ablation # alice29.txt only
7
+ * ./ablation file1 file2 ... # specific files
8
+ */
9
+
10
+ #include <stdio.h>
11
+ #include <stdlib.h>
12
+ #include <string.h>
13
+ #include <math.h>
14
+ #include <time.h>
15
+ #include <libgen.h>
16
+
17
+ #include "fastmath.h"
18
+ #include "arith.h"
19
+ #include "ppm.h"
20
+ #include "tweedie.h"
21
+ #include "match.h"
22
+ #include "word.h"
23
+ #include "highctx.h"
24
+
25
/* Fixed-point scale (2^14 = 16384) applied to probabilities when they are
 * turned into integer frequencies for the arithmetic coder. */
#define SCALE (1 << 14)

/* ── Flags ── */
/* One bit per optional pipeline layer; combine with bitwise OR and test
 * with bitwise AND. */
#define FLAG_TWEEDIE (1 << 0)
#define FLAG_MATCH   (1 << 1)
#define FLAG_WORD    (1 << 2)
#define FLAG_HIGHCTX (1 << 3)
32
+
33
+ /* ── Helpers ── */
34
+
35
+ static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs,
36
+ int64_t *out_total) {
37
+ cumfreqs[0] = 0;
38
+ for (int i = 0; i < 256; i++) {
39
+ int64_t f = (int64_t)(probs[i] * SCALE + 0.5);
40
+ if (f < 1) f = 1;
41
+ cumfreqs[i + 1] = cumfreqs[i] + f;
42
+ }
43
+ *out_total = cumfreqs[256];
44
+ }
45
+
46
/*
 * Floor every one of the 256 probabilities at 1e-10, then rescale the
 * vector in place by the reciprocal of the floored sum.
 */
static void clamp_normalize(double *probs) {
    const double floor_p = 1e-10;
    double *const end = probs + 256;
    double total = 0.0;

    for (double *p = probs; p != end; ++p) {
        if (*p < floor_p)
            *p = floor_p;
        total += *p;
    }

    const double scale = 1.0 / total;
    for (double *p = probs; p != end; ++p)
        *p *= scale;
}
56
+
57
+ static inline double now_sec(void) {
58
+ struct timespec ts;
59
+ clock_gettime(CLOCK_MONOTONIC, &ts);
60
+ return ts.tv_sec + ts.tv_nsec * 1e-9;
61
+ }
62
+
63
+ /* ── Configurable compress ── */
64
+
65
+ static uint8_t *do_compress(const uint8_t *data, size_t data_len,
66
+ int flags, size_t *out_len, double *out_time) {
67
+ PPMModel ppm; ppm_init(&ppm);
68
+ MatchModel match; if (flags & FLAG_MATCH) match_init(&match);
69
+ WordModel word; if (flags & FLAG_WORD) word_init(&word);
70
+ HighCtxModel hctx; if (flags & FLAG_HIGHCTX) highctx_init(&hctx);
71
+ ArithEncoder enc; ae_init(&enc);
72
+ TweedieDenoiser twd; if (flags & FLAG_TWEEDIE) tweedie_init(&twd);
73
+
74
+ double probs[256], word_probs[256], hctx_probs[256];
75
+ int64_t cumfreqs[257];
76
+ int64_t total;
77
+
78
+ double t0 = now_sec();
79
+
80
+ for (size_t i = 0; i < data_len; i++) {
81
+ uint8_t byte = data[i];
82
+ double confidence;
83
+ int order;
84
+
85
+ ppm_predict(&ppm, probs, &confidence, &order);
86
+
87
+ if (flags & FLAG_TWEEDIE) {
88
+ tweedie_denoise(&twd, probs, order, confidence);
89
+ }
90
+ clamp_normalize(probs);
91
+
92
+ if (flags & FLAG_MATCH) {
93
+ int match_byte;
94
+ double match_conf;
95
+ match_predict(&match, &match_byte, &match_conf);
96
+ blend_match(probs, match_byte, match_conf);
97
+ }
98
+
99
+ if (flags & FLAG_WORD) {
100
+ double w_conf;
101
+ if (word_predict_cached(&word, word_probs, &w_conf))
102
+ blend_word_model(probs, word_probs, w_conf);
103
+ }
104
+
105
+ if (flags & FLAG_HIGHCTX) {
106
+ double hctx_conf;
107
+ if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
108
+ blend_highctx(probs, hctx_probs, hctx_conf);
109
+ }
110
+
111
+ probs_to_cumfreqs(probs, cumfreqs, &total);
112
+ ae_encode(&enc, cumfreqs, byte, total);
113
+
114
+ /* Updates */
115
+ if (flags & FLAG_TWEEDIE)
116
+ tweedie_update(&twd, byte);
117
+ if (flags & FLAG_MATCH)
118
+ match_update(&match, byte);
119
+ if (flags & FLAG_WORD)
120
+ word_update(&word, byte);
121
+ if (flags & FLAG_HIGHCTX)
122
+ highctx_update(&hctx, byte);
123
+ ppm_update(&ppm, byte);
124
+
125
+ if ((i + 1) % 50000 == 0) {
126
+ double elapsed = now_sec() - t0;
127
+ double pct = (i + 1) * 100.0 / data_len;
128
+ double speed = (i + 1) / elapsed;
129
+ fprintf(stderr, "\r %5.1f%% (%zu/%zu) %.0f B/s",
130
+ pct, i + 1, data_len, speed);
131
+ }
132
+ }
133
+
134
+ ae_finish(&enc);
135
+ double elapsed = now_sec() - t0;
136
+ if (data_len >= 50000)
137
+ fprintf(stderr, "\r \r");
138
+
139
+ *out_time = elapsed;
140
+
141
+ /* Copy output */
142
+ *out_len = enc.buf_len;
143
+ uint8_t *result = (uint8_t *)malloc(enc.buf_len);
144
+ memcpy(result, enc.buf, enc.buf_len);
145
+
146
+ ae_free(&enc);
147
+ ppm_free(&ppm);
148
+ if (flags & FLAG_MATCH) match_free(&match);
149
+ if (flags & FLAG_WORD) word_free(&word);
150
+ if (flags & FLAG_HIGHCTX) highctx_free(&hctx);
151
+
152
+ return result;
153
+ }
154
+
155
+ /* ── Configurable decompress ── */
156
+
157
+ static uint8_t *do_decompress(const uint8_t *compressed, size_t comp_len,
158
+ size_t original_size, int flags,
159
+ double *out_time) {
160
+ PPMModel ppm; ppm_init(&ppm);
161
+ MatchModel match; if (flags & FLAG_MATCH) match_init(&match);
162
+ WordModel word; if (flags & FLAG_WORD) word_init(&word);
163
+ HighCtxModel hctx; if (flags & FLAG_HIGHCTX) highctx_init(&hctx);
164
+ ArithDecoder dec; ad_init(&dec, compressed, comp_len);
165
+ TweedieDenoiser twd; if (flags & FLAG_TWEEDIE) tweedie_init(&twd);
166
+
167
+ uint8_t *result = (uint8_t *)malloc(original_size);
168
+
169
+ double probs[256], word_probs[256], hctx_probs[256];
170
+ int64_t cumfreqs[257];
171
+ int64_t total;
172
+
173
+ double t0 = now_sec();
174
+
175
+ for (size_t i = 0; i < original_size; i++) {
176
+ double confidence;
177
+ int order;
178
+
179
+ ppm_predict(&ppm, probs, &confidence, &order);
180
+
181
+ if (flags & FLAG_TWEEDIE) {
182
+ tweedie_denoise(&twd, probs, order, confidence);
183
+ }
184
+ clamp_normalize(probs);
185
+
186
+ if (flags & FLAG_MATCH) {
187
+ int match_byte;
188
+ double match_conf;
189
+ match_predict(&match, &match_byte, &match_conf);
190
+ blend_match(probs, match_byte, match_conf);
191
+ }
192
+
193
+ if (flags & FLAG_WORD) {
194
+ double w_conf;
195
+ if (word_predict_cached(&word, word_probs, &w_conf))
196
+ blend_word_model(probs, word_probs, w_conf);
197
+ }
198
+
199
+ if (flags & FLAG_HIGHCTX) {
200
+ double hctx_conf;
201
+ if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
202
+ blend_highctx(probs, hctx_probs, hctx_conf);
203
+ }
204
+
205
+ probs_to_cumfreqs(probs, cumfreqs, &total);
206
+ int sym = ad_decode(&dec, cumfreqs, total);
207
+ result[i] = (uint8_t)sym;
208
+
209
+ if (flags & FLAG_TWEEDIE)
210
+ tweedie_update(&twd, (uint8_t)sym);
211
+ if (flags & FLAG_MATCH)
212
+ match_update(&match, (uint8_t)sym);
213
+ if (flags & FLAG_WORD)
214
+ word_update(&word, (uint8_t)sym);
215
+ if (flags & FLAG_HIGHCTX)
216
+ highctx_update(&hctx, (uint8_t)sym);
217
+ ppm_update(&ppm, (uint8_t)sym);
218
+ }
219
+
220
+ *out_time = now_sec() - t0;
221
+
222
+ ppm_free(&ppm);
223
+ if (flags & FLAG_MATCH) match_free(&match);
224
+ if (flags & FLAG_WORD) word_free(&word);
225
+ if (flags & FLAG_HIGHCTX) highctx_free(&hctx);
226
+
227
+ return result;
228
+ }
229
+
230
+ /* ── Ablation configs ── */
231
+
232
/* One ablation row: a display label and the layer flags it enables. */
typedef struct {
    const char *label;  /* text printed for this configuration */
    int flags;          /* bitwise OR of FLAG_* values handed to do_compress */
} AblationConfig;
236
+
237
+ static const AblationConfig CONFIGS[] = {
238
+ { "Base PPM", 0 },
239
+ { "+ Tweedie", FLAG_TWEEDIE },
240
+ { "+ Twd + Match", FLAG_TWEEDIE | FLAG_MATCH },
241
+ { "+ Twd + Match + Word", FLAG_TWEEDIE | FLAG_MATCH | FLAG_WORD },
242
+ { "+ Twd + M + W + H", FLAG_TWEEDIE | FLAG_MATCH | FLAG_WORD | FLAG_HIGHCTX },
243
+ };
244
+ #define N_CONFIGS 5
245
+
246
/* Measurements recorded for one configuration of the ablation. */
typedef struct {
    const char *label;  /* label copied from the configuration */
    size_t c_size;      /* compressed size in bytes */
    double ratio;       /* compressed size divided by original size */
    double c_time;      /* compression time in seconds */
} AblationResult;
252
+
253
+ static void run_ablation(const char *filepath) {
254
+ FILE *f = fopen(filepath, "rb");
255
+ if (!f) { fprintf(stderr, "File not found: %s\n", filepath); return; }
256
+ fseek(f, 0, SEEK_END);
257
+ long file_size = ftell(f);
258
+ fseek(f, 0, SEEK_SET);
259
+ uint8_t *data = (uint8_t *)malloc(file_size);
260
+ if (fread(data, 1, file_size, f) != (size_t)file_size) {
261
+ fprintf(stderr, "Read error: %s\n", filepath);
262
+ fclose(f); free(data); return;
263
+ }
264
+ fclose(f);
265
+
266
+ /* basename */
267
+ char *path_copy = strdup(filepath);
268
+ const char *filename = basename(path_copy);
269
+ size_t original_size = (size_t)file_size;
270
+
271
+ printf("\n======================================================================\n");
272
+ printf(" ABLATION: %s (%zu bytes)\n", filename, original_size);
273
+ printf("======================================================================\n");
274
+
275
+ AblationResult results[N_CONFIGS];
276
+
277
+ for (int c = 0; c < N_CONFIGS; c++) {
278
+ printf("\n [%s]\n", CONFIGS[c].label);
279
+ printf(" Compressing...");
280
+ fflush(stdout);
281
+
282
+ size_t comp_len;
283
+ double c_time;
284
+ uint8_t *compressed = do_compress(data, original_size, CONFIGS[c].flags,
285
+ &comp_len, &c_time);
286
+ double ratio = (double)comp_len / original_size;
287
+ printf(" %zu bytes (%.2f%%) in %.1fs\n", comp_len, ratio * 100.0, c_time);
288
+
289
+ /* Verify round-trip */
290
+ printf(" Verifying...");
291
+ fflush(stdout);
292
+
293
+ /*double d_time;
294
+ uint8_t *decompressed = do_decompress(compressed, comp_len, original_size,
295
+ CONFIGS[c].flags, &d_time);
296
+
297
+ if (memcmp(data, decompressed, original_size) == 0) {
298
+ printf(" OK (%.1fs)\n", d_time);
299
+ } else {
300
+ printf(" FAILED!\n");
301
+ // Find first mismatch
302
+ for (size_t i = 0; i < original_size; i++) {
303
+ if (data[i] != decompressed[i]) {
304
+ printf(" First mismatch at byte %zu: expected %d, got %d\n",
305
+ i, data[i], decompressed[i]);
306
+ break;
307
+ }
308
+ }
309
+ free(compressed);
310
+ free(decompressed);
311
+ free(data);
312
+ free(path_copy);
313
+ exit(1);
314
+ } */
315
+
316
+ results[c].label = CONFIGS[c].label;
317
+ results[c].c_size = comp_len;
318
+ results[c].ratio = ratio;
319
+ results[c].c_time = c_time;
320
+
321
+ free(compressed);
322
+ //free(decompressed);
323
+ }
324
+
325
+ /* ── Summary table ── */
326
+ printf("\n======================================================================\n");
327
+ printf(" RESULTS: %s (%zu bytes)\n", filename, original_size);
328
+ printf("======================================================================\n");
329
+ printf(" %-32s %8s %8s %9s %9s %7s\n",
330
+ "Layer", "Size", "Ratio", "Layer +%", "Total +%", "Time");
331
+ printf(" -------------------------------- -------- -------- --------- --------- -------\n");
332
+
333
+ size_t base_size = results[0].c_size;
334
+ size_t prev_size = results[0].c_size;
335
+
336
+ for (int i = 0; i < N_CONFIGS; i++) {
337
+ size_t c_size = results[i].c_size;
338
+ double ratio = results[i].ratio;
339
+ double c_time = results[i].c_time;
340
+
341
+ if (i == 0) {
342
+ printf(" %-32s %8zu %6.2f%% %9s %9s %6.1fs\n",
343
+ results[i].label, c_size, ratio * 100.0, "", "", c_time);
344
+ } else {
345
+ double layer_imp = (double)(prev_size - c_size) / prev_size * 100.0;
346
+ double total_imp = (double)(base_size - c_size) / base_size * 100.0;
347
+ printf(" %-32s %8zu %6.2f%% %+8.2f%% %+8.2f%% %6.1fs\n",
348
+ results[i].label, c_size, ratio * 100.0,
349
+ layer_imp, total_imp, c_time);
350
+ }
351
+ prev_size = c_size;
352
+ }
353
+
354
+ printf(" -------------------------------- -------- -------- --------- --------- -------\n");
355
+ size_t final_size = results[N_CONFIGS - 1].c_size;
356
+ double total_imp = (double)(base_size - final_size) / base_size * 100.0;
357
+ printf(" %-32s %8s %8s %9s %+8.2f%%\n",
358
+ "TOTAL IMPROVEMENT", "", "", "", total_imp);
359
+ printf("\n");
360
+
361
+ free(data);
362
+ free(path_copy);
363
+ }
364
+
365
+ /* ── Main ── */
366
+
367
/*
 * Entry point: run the ablation on every path given on the command line,
 * or on "../alice29.txt" when no argument is supplied. Always returns 0.
 */
int main(int argc, char **argv) {
    if (argc <= 1) {
        run_ablation("../alice29.txt");
        return 0;
    }

    for (int arg = 1; arg < argc; arg++)
        run_ablation(argv[arg]);

    /* Cross-file comparison would go here for multiple files */
    return 0;
}
arith.h ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef ARITH_H
2
+ #define ARITH_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ /* Arithmetic encoder */
9
/* State of the 32-bit arithmetic encoder and its growable output buffer. */
typedef struct {
    uint32_t low;       /* lower bound of the current coding interval */
    uint32_t high;      /* upper bound (inclusive) of the current interval */
    int pending;        /* deferred bits, emitted inverted after the next output bit */
    uint8_t *buf;       /* output byte buffer (heap allocated) */
    size_t buf_len;     /* bytes written to buf so far */
    size_t buf_cap;     /* allocated size of buf in bytes */
    int bit_buf;        /* accumulates 8 bits before flushing a byte */
    int bit_count;      /* bits in bit_buf (0..7) */
} ArithEncoder;
19
+
20
+ /* Arithmetic decoder */
21
/* State of the 32-bit arithmetic decoder reading from a borrowed buffer. */
typedef struct {
    const uint8_t *data;  /* encoded stream (not owned) */
    size_t data_len;      /* length of data in bytes */
    size_t bit_pos;       /* index of the next bit to read, counted from the stream start */
    uint32_t low;         /* lower bound of the current coding interval */
    uint32_t high;        /* upper bound (inclusive) of the current interval */
    uint32_t value;       /* 32-bit window of stream bits being decoded */
} ArithDecoder;
29
+
30
+ /* ── Encoder ── */
31
+
32
+ static inline void ae_init(ArithEncoder *e) {
33
+ e->low = 0;
34
+ e->high = 0xFFFFFFFF;
35
+ e->pending = 0;
36
+ e->buf_cap = 4096;
37
+ e->buf_len = 0;
38
+ e->buf = (uint8_t *)malloc(e->buf_cap);
39
+ e->bit_buf = 0;
40
+ e->bit_count = 0;
41
+ }
42
+
43
+ static inline void ae_flush_byte(ArithEncoder *e) {
44
+ if (e->buf_len >= e->buf_cap) {
45
+ e->buf_cap *= 2;
46
+ e->buf = (uint8_t *)realloc(e->buf, e->buf_cap);
47
+ }
48
+ e->buf[e->buf_len++] = (uint8_t)e->bit_buf;
49
+ e->bit_buf = 0;
50
+ e->bit_count = 0;
51
+ }
52
+
53
+ static inline void ae_output_bit(ArithEncoder *e, int bit) {
54
+ e->bit_buf = (e->bit_buf << 1) | bit;
55
+ e->bit_count++;
56
+ if (e->bit_count == 8) ae_flush_byte(e);
57
+
58
+ int inv = 1 - bit;
59
+ while (e->pending > 0) {
60
+ e->bit_buf = (e->bit_buf << 1) | inv;
61
+ e->bit_count++;
62
+ if (e->bit_count == 8) ae_flush_byte(e);
63
+ e->pending--;
64
+ }
65
+ }
66
+
67
+ static inline void ae_encode(ArithEncoder *e, const int64_t *cumfreqs,
68
+ int symbol, int64_t total) {
69
+ uint64_t rng = (uint64_t)e->high - e->low + 1;
70
+ e->high = e->low + (uint32_t)((rng * cumfreqs[symbol + 1]) / total) - 1;
71
+ e->low = e->low + (uint32_t)((rng * cumfreqs[symbol]) / total);
72
+
73
+ for (;;) {
74
+ if (e->high < 0x80000000u) {
75
+ ae_output_bit(e, 0);
76
+ } else if (e->low >= 0x80000000u) {
77
+ ae_output_bit(e, 1);
78
+ e->low -= 0x80000000u;
79
+ e->high -= 0x80000000u;
80
+ } else if (e->low >= 0x40000000u && e->high < 0xC0000000u) {
81
+ e->pending++;
82
+ e->low -= 0x40000000u;
83
+ e->high -= 0x40000000u;
84
+ } else {
85
+ break;
86
+ }
87
+ e->low = (e->low << 1) & 0xFFFFFFFF;
88
+ e->high = ((e->high << 1) | 1) & 0xFFFFFFFF;
89
+ }
90
+ }
91
+
92
+ static inline void ae_finish(ArithEncoder *e) {
93
+ e->pending++;
94
+ if (e->low < 0x40000000u)
95
+ ae_output_bit(e, 0);
96
+ else
97
+ ae_output_bit(e, 1);
98
+
99
+ /* pad remaining bits in the last byte */
100
+ if (e->bit_count > 0) {
101
+ e->bit_buf <<= (8 - e->bit_count);
102
+ ae_flush_byte(e);
103
+ }
104
+ }
105
+
106
+ static inline void ae_free(ArithEncoder *e) {
107
+ free(e->buf);
108
+ e->buf = NULL;
109
+ }
110
+
111
+ /* ── Decoder ── */
112
+
113
+ static inline int ad_read_bit(ArithDecoder *d) {
114
+ size_t byte_idx = d->bit_pos / 8;
115
+ if (byte_idx >= d->data_len) {
116
+ d->bit_pos++;
117
+ return 0;
118
+ }
119
+ int bit = (d->data[byte_idx] >> (7 - (d->bit_pos % 8))) & 1;
120
+ d->bit_pos++;
121
+ return bit;
122
+ }
123
+
124
+ static inline void ad_init(ArithDecoder *d, const uint8_t *data, size_t len) {
125
+ d->data = data;
126
+ d->data_len = len;
127
+ d->bit_pos = 0;
128
+ d->low = 0;
129
+ d->high = 0xFFFFFFFF;
130
+ d->value = 0;
131
+ for (int i = 0; i < 32; i++)
132
+ d->value = (d->value << 1) | ad_read_bit(d);
133
+ }
134
+
135
+ static inline int ad_decode(ArithDecoder *d, const int64_t *cumfreqs,
136
+ int64_t total) {
137
+ uint64_t rng = (uint64_t)d->high - d->low + 1;
138
+ int64_t scaled = (int64_t)(((uint64_t)(d->value - d->low + 1) * total - 1) / rng);
139
+
140
+ /* linear search (matches Python behavior) */
141
+ int sym = 0;
142
+ for (sym = 0; sym < 256; sym++) {
143
+ if (cumfreqs[sym + 1] > scaled)
144
+ break;
145
+ }
146
+
147
+ d->high = d->low + (uint32_t)((rng * cumfreqs[sym + 1]) / total) - 1;
148
+ d->low = d->low + (uint32_t)((rng * cumfreqs[sym]) / total);
149
+
150
+ for (;;) {
151
+ if (d->high < 0x80000000u) {
152
+ /* nothing */
153
+ } else if (d->low >= 0x80000000u) {
154
+ d->low -= 0x80000000u;
155
+ d->high -= 0x80000000u;
156
+ d->value -= 0x80000000u;
157
+ } else if (d->low >= 0x40000000u && d->high < 0xC0000000u) {
158
+ d->low -= 0x40000000u;
159
+ d->high -= 0x40000000u;
160
+ d->value -= 0x40000000u;
161
+ } else {
162
+ break;
163
+ }
164
+ d->low = (d->low << 1) & 0xFFFFFFFF;
165
+ d->high = ((d->high << 1) | 1) & 0xFFFFFFFF;
166
+ d->value = ((d->value << 1) | ad_read_bit(d)) & 0xFFFFFFFF;
167
+ }
168
+ return sym;
169
+ }
170
+
171
+ #endif /* ARITH_H */
delta_vs_noise.c ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * delta_vs_noise.c — Experiment: |δ| vs noise level γ
3
+ *
4
+ * Compresses a file using the full pipeline, then dumps the
5
+ * calibration table statistics showing mean |δ| per confidence bin.
6
+ *
7
+ * Usage: ./delta_vs_noise <input_file>
8
+ *
9
+ * Output: TSV table of (conf_bin, γ_approx, mean_|δ|, weighted_mean_|δ|, total_obs)
10
+ */
11
+
12
+ #include <stdio.h>
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+ #include <math.h>
16
+ #include <time.h>
17
+
18
+ #include "fastmath.h"
19
+ #include "arith.h"
20
+ #include "ppm.h"
21
+ #include "tweedie.h"
22
+ #include "match.h"
23
+ #include "word.h"
24
+ #include "highctx.h"
25
+
26
/* Fixed-point scale (2^14) used to turn probabilities into frequencies. */
#define SCALE (1 << 14)

/*
 * Fill cumfreqs[0..256] with the running sum of integer frequencies derived
 * from probs[0..255], and store the grand total in *out_total.
 *
 * A symbol's frequency is probs[s] * SCALE plus 0.5 with the fraction
 * truncated, and never less than 1.
 */
static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs,
                              int64_t *out_total) {
    int64_t acc = 0;
    int64_t *slot = cumfreqs;

    *slot++ = 0;
    for (const double *p = probs; p != probs + 256; ++p) {
        int64_t freq = (int64_t)(*p * SCALE + 0.5);
        if (freq < 1)
            freq = 1;
        acc += freq;
        *slot++ = acc;
    }
    *out_total = acc;
}
38
+
39
/*
 * In-place clean-up of a 256-entry probability vector: entries below 1e-10
 * are raised to 1e-10, then every entry is multiplied by the reciprocal of
 * the resulting sum.
 */
static void clamp_normalize(double *probs) {
    double total = 0.0;
    int k = 0;

    while (k < 256) {
        double v = probs[k];
        if (v < 1e-10) {
            v = 1e-10;
            probs[k] = v;
        }
        total += v;
        k++;
    }

    const double scale = 1.0 / total;
    for (k = 0; k < 256; k++)
        probs[k] *= scale;
}
49
+
50
+ /* Representative C value for each confidence bin.
51
+ * twd_conf_bin uses: bin = (int)(ln(C) / 1.3863)
52
+ * with C < 4 → bin 0.
53
+ * Bin boundaries: 0:[0,4), 1:[4,e^1.39)≈[4,16), 2:[16,59), ... */
54
+ static double conf_bin_representative_C(int bin) {
55
+ if (bin == 0) return 2.0; /* midpoint of [0, 4) */
56
+ /* bin = floor(ln(C) / 1.3863), so midpoint is exp((bin + 0.5) * 1.3863) */
57
+ return exp((bin + 0.5) * 1.3862943611198906);
58
+ }
59
+
60
+ int main(int argc, char **argv) {
61
+ if (argc < 2) {
62
+ fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
63
+ return 1;
64
+ }
65
+
66
+ FILE *fin = fopen(argv[1], "rb");
67
+ if (!fin) { perror(argv[1]); return 1; }
68
+ fseek(fin, 0, SEEK_END);
69
+ long file_size = ftell(fin);
70
+ fseek(fin, 0, SEEK_SET);
71
+ uint8_t *data = (uint8_t *)malloc(file_size);
72
+ if (fread(data, 1, file_size, fin) != (size_t)file_size) {
73
+ fprintf(stderr, "Read error\n"); fclose(fin); return 1;
74
+ }
75
+ fclose(fin);
76
+
77
+ fprintf(stderr, "Processing %s (%ld bytes)...\n", argv[1], file_size);
78
+
79
+ /* Run the full pipeline to populate calibration tables */
80
+ PPMModel ppm; ppm_init(&ppm);
81
+ MatchModel match; match_init(&match);
82
+ WordModel word; word_init(&word);
83
+ HighCtxModel hctx; highctx_init(&hctx);
84
+ ArithEncoder enc; ae_init(&enc);
85
+ TweedieDenoiser twd; tweedie_init(&twd);
86
+
87
+ double probs[256], word_probs[256], hctx_probs[256];
88
+ int64_t cumfreqs[257];
89
+ int64_t total;
90
+
91
+ struct timespec t0, t1;
92
+ clock_gettime(CLOCK_MONOTONIC, &t0);
93
+
94
+ for (long i = 0; i < file_size; i++) {
95
+ uint8_t byte = data[i];
96
+
97
+ double confidence;
98
+ int order;
99
+ ppm_predict(&ppm, probs, &confidence, &order);
100
+
101
+ tweedie_denoise(&twd, probs, order, confidence);
102
+ clamp_normalize(probs);
103
+
104
+ int match_byte;
105
+ double match_conf;
106
+ match_predict(&match, &match_byte, &match_conf);
107
+ blend_match(probs, match_byte, match_conf);
108
+
109
+ double w_conf;
110
+ if (word_predict_cached(&word, word_probs, &w_conf))
111
+ blend_word_model(probs, word_probs, w_conf);
112
+
113
+ double hctx_conf;
114
+ if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
115
+ blend_highctx(probs, hctx_probs, hctx_conf);
116
+
117
+ probs_to_cumfreqs(probs, cumfreqs, &total);
118
+ ae_encode(&enc, cumfreqs, byte, total);
119
+
120
+ tweedie_update(&twd, byte);
121
+ match_update(&match, byte);
122
+ word_update(&word, byte);
123
+ highctx_update(&hctx, byte);
124
+ ppm_update(&ppm, byte);
125
+
126
+ if ((i + 1) % 50000 == 0) {
127
+ clock_gettime(CLOCK_MONOTONIC, &t1);
128
+ double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
129
+ fprintf(stderr, "\r %5.1f%% (%.0f B/s)",
130
+ (i + 1) * 100.0 / file_size, (i + 1) / elapsed);
131
+ }
132
+ }
133
+
134
+ clock_gettime(CLOCK_MONOTONIC, &t1);
135
+ double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
136
+ fprintf(stderr, "\r Done in %.1fs \n", elapsed);
137
+
138
+ /* ── Analyze calibration tables: mean |δ| per confidence bin ── */
139
+
140
+ /* Aggregate across all steps, bit contexts, order groups, shapes, prob bins */
141
+ double sum_abs_delta[TWD_N_CONF];
142
+ double sum_weight[TWD_N_CONF];
143
+ double sum_weighted_abs_delta[TWD_N_CONF];
144
+ int count[TWD_N_CONF];
145
+ memset(sum_abs_delta, 0, sizeof(sum_abs_delta));
146
+ memset(sum_weight, 0, sizeof(sum_weight));
147
+ memset(sum_weighted_abs_delta, 0, sizeof(sum_weighted_abs_delta));
148
+ memset(count, 0, sizeof(count));
149
+
150
+ for (int t = 0; t < TWD_STEPS; t++)
151
+ for (int b = 0; b < TWD_N_BCTX; b++)
152
+ for (int o = 0; o < TWD_N_ORD; o++)
153
+ for (int s = 0; s < TWD_N_SHAPE; s++)
154
+ for (int c = 0; c < TWD_N_CONF; c++)
155
+ for (int p = 0; p < TWD_N_PROB; p++) {
156
+ TwdCalibEntry *e = &twd.table[t][b][o][s][c][p];
157
+ double real_obs = e->total - TWD_PRIOR_WEIGHT;
158
+ if (real_obs < 1.0) continue; /* skip bins with only prior */
159
+
160
+ double avg_pred = e->sum_pred / e->total;
161
+ double emp_rate = e->hits / e->total;
162
+ double delta = emp_rate - avg_pred;
163
+
164
+ sum_abs_delta[c] += fabs(delta);
165
+ sum_weighted_abs_delta[c] += fabs(delta) * real_obs;
166
+ sum_weight[c] += real_obs;
167
+ count[c]++;
168
+ }
169
+
170
+ /* ── Output ── */
171
+ printf("# Delta vs Noise Level — %s (%ld bytes)\n", argv[1], file_size);
172
+ printf("# conf_bin\tC_repr\tgamma\tmean_abs_delta\tweighted_abs_delta\tactive_bins\ttotal_obs\n");
173
+
174
+ for (int c = 0; c < TWD_N_CONF; c++) {
175
+ double C_repr = conf_bin_representative_C(c);
176
+ double gamma = 128.0 / (C_repr + 128.0);
177
+ double mean_d = (count[c] > 0) ? sum_abs_delta[c] / count[c] : 0.0;
178
+ double wmean_d = (sum_weight[c] > 0) ? sum_weighted_abs_delta[c] / sum_weight[c] : 0.0;
179
+
180
+ printf("%d\t%.1f\t%.4f\t%.6f\t%.6f\t%d\t%.0f\n",
181
+ c, C_repr, gamma, mean_d, wmean_d, count[c], sum_weight[c]);
182
+ }
183
+
184
+ /* ── Also output per-step breakdown ── */
185
+ printf("\n# Per-step breakdown:\n");
186
+ printf("# step\tconf_bin\tgamma\tweighted_abs_delta\ttotal_obs\n");
187
+
188
+ for (int t = 0; t < TWD_STEPS; t++) {
189
+ double step_sum_wd[TWD_N_CONF] = {0};
190
+ double step_sum_w[TWD_N_CONF] = {0};
191
+
192
+ for (int b = 0; b < TWD_N_BCTX; b++)
193
+ for (int o = 0; o < TWD_N_ORD; o++)
194
+ for (int s = 0; s < TWD_N_SHAPE; s++)
195
+ for (int c = 0; c < TWD_N_CONF; c++)
196
+ for (int p = 0; p < TWD_N_PROB; p++) {
197
+ TwdCalibEntry *e = &twd.table[t][b][o][s][c][p];
198
+ double real_obs = e->total - TWD_PRIOR_WEIGHT;
199
+ if (real_obs < 1.0) continue;
200
+
201
+ double avg_pred = e->sum_pred / e->total;
202
+ double emp_rate = e->hits / e->total;
203
+ double delta = emp_rate - avg_pred;
204
+
205
+ step_sum_wd[c] += fabs(delta) * real_obs;
206
+ step_sum_w[c] += real_obs;
207
+ }
208
+
209
+ for (int c = 0; c < TWD_N_CONF; c++) {
210
+ double C_repr = conf_bin_representative_C(c);
211
+ double gamma = 128.0 / (C_repr + 128.0);
212
+ double wmean_d = (step_sum_w[c] > 0) ? step_sum_wd[c] / step_sum_w[c] : 0.0;
213
+ printf("%d\t%d\t%.4f\t%.6f\t%.0f\n",
214
+ t, c, gamma, wmean_d, step_sum_w[c]);
215
+ }
216
+ }
217
+
218
+ free(data);
219
+ ppm_free(&ppm);
220
+ match_free(&match);
221
+ word_free(&word);
222
+ highctx_free(&hctx);
223
+ ae_free(&enc);
224
+
225
+ return 0;
226
+ }
fastmath.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef FASTMATH_H
2
+ #define FASTMATH_H
3
+
4
+ #include <stdint.h>
5
+ #include <math.h>
6
+
7
+ /*
8
+ * Fast log/exp approximations using IEEE 754 bit tricks + polynomial correction.
9
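+ * Accuracy differs per function: fast_exp stays within roughly 1e-4 relative error, while fast_log's truncated series is off by up to ~0.09 (absolute) near the top of each power-of-two interval.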
10
+ */
11
+
12
/*
 * Approximate natural logarithm using the IEEE 754 layout of a double.
 *
 * x is split into its unbiased exponent e and a mantissa m in [1, 2)
 * (obtained by overwriting the exponent field with the bias). The result
 * is P(m - 1) + e * ln 2, where P is the series of log(1 + t) truncated
 * after the t^5 term.
 *
 * Accuracy: P is exact at t = 0 but degrades as t approaches 1, where it
 * evaluates to 1 - 1/2 + 0.333333333 - 1/4 + 1/5 = 0.7833 against
 * ln 2 = 0.6931, i.e. an absolute error approaching 0.09. Exact powers of
 * two are returned as e * ln 2 with no polynomial error.
 *
 * Zero, negative, subnormal, infinite and NaN inputs get no special
 * treatment; their bit fields are used as they are.
 */
static inline double fast_log(double x) {
    union { double d; uint64_t u; } bits = { .d = x };

    /* Unbiased exponent from the 11-bit exponent field. */
    const int64_t exponent = (int64_t)((bits.u >> 52) & 0x7FF) - 1023;

    /* Keep the 52 fraction bits, force the exponent field to the bias. */
    bits.u = (bits.u & 0x000FFFFFFFFFFFFFULL) | 0x3FF0000000000000ULL;

    const double t = bits.d - 1.0;
    const double series = t * (1.0 + t * (-0.5 + t * (0.333333333 + t * (-0.25 + t * 0.2))));
    return series + exponent * 0.6931471805599453; /* exponent * ln(2) */
}
26
+
27
/*
 * Approximate exp(x) as 2^k * 2^f with k = floor(x / ln 2) and f in [0, 1).
 *
 * 2^f is evaluated with a degree-5 polynomial and 2^k is built directly by
 * writing k + 1023 into the exponent field of a double.
 *
 * Inputs below -700 return 0.0 and inputs above 709 return 1e308. NaN is
 * not handled specially.
 */
static inline double fast_exp(double x) {
    if (x < -700.0)
        return 0.0;
    if (x > 709.0)
        return 1e308;

    /* exp(x) = 2^(x / ln 2) */
    const double scaled = x * 1.4426950408889634;

    /* The cast truncates toward zero; step down once for negative
     * non-integers to obtain the floor. */
    int64_t whole = (int64_t)scaled;
    if (scaled < whole)
        whole -= 1;
    const double frac = scaled - whole;

    /* 2^frac for frac in [0, 1) */
    const double mant = 1.0 + frac * (0.6931471805599453 + frac * (0.24022650695910071
                      + frac * (0.05550410866482158 + frac * (0.009618129107628477
                      + frac * 0.0013333558146428443))));

    /* 2^whole via the exponent field */
    union { double d; uint64_t u; } pow2;
    pow2.u = (uint64_t)(whole + 1023) << 52;
    return mant * pow2.d;
}
45
+
46
/*
 * Approximate logit: log(p / (1 - p)).
 *
 * p is first clamped to [1e-8, 1 - 1e-8] so the ratio is finite and
 * positive, then the ratio is passed to fast_log(). The accuracy is
 * therefore that of fast_log(); there is no separate code path for p near
 * 0.5.
 */
static inline double fast_logit(double p) {
    const double lo = 1e-8;
    const double hi = 1.0 - 1e-8;

    if (p < lo)
        p = lo;
    if (p > hi)
        p = hi;

    const double odds = p / (1.0 - p);
    return fast_log(odds);
}
56
+
57
+ /*
58
+ * Precomputed logit lookup table for probabilities.
59
+ * Maps probability [0..65536]/65536 → logit value.
60
+ * Avoids per-symbol log computation entirely.
61
+ */
62
+ #define LOGIT_TABLE_SIZE 65537
63
+ typedef struct {
64
+ double table[LOGIT_TABLE_SIZE];
65
+ int initialized;
66
+ } LogitTable;
67
+
68
+ static inline void logit_table_init(LogitTable *lt) {
69
+ for (int i = 0; i < LOGIT_TABLE_SIZE; i++) {
70
+ double p = (double)i / 65536.0;
71
+ if (p < 1e-8) p = 1e-8;
72
+ if (p > 1.0 - 1e-8) p = 1.0 - 1e-8;
73
+ lt->table[i] = log(p / (1.0 - p));
74
+ }
75
+ lt->initialized = 1;
76
+ }
77
+
78
+ static inline double logit_table_lookup(const LogitTable *lt, double p) {
79
+ int idx = (int)(p * 65536.0 + 0.5);
80
+ if (idx < 0) idx = 0;
81
+ if (idx >= LOGIT_TABLE_SIZE) idx = LOGIT_TABLE_SIZE - 1;
82
+ return lt->table[idx];
83
+ }
84
+
85
+ /* Fast sqrt (just use hardware — it's already fast) */
86
+ /* static inline double fast_sqrt(double x) { return sqrt(x); } */
87
+
88
+ #endif /* FASTMATH_H */
highctx.h ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef HIGHCTX_H
2
+ #define HIGHCTX_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ /*
9
+ * High-Order Context Model (orders 5-8)
10
+ *
11
+ * Extends effective context beyond PPM's order-4 limit without modifying PPM.
12
+ * Uses hash tables mapping context_hash → count[256] for orders 5, 6, 7, 8.
13
+ * Unlike the match model (which finds one position, predicts one byte),
14
+ * this aggregates ALL matching positions into a full probability distribution.
15
+ *
16
+ * Blended after SSE in the pipeline, preserving diffusion's contribution.
17
+ */
18
+
19
+ #define HCTX_NSYM 256
20
+ #define HCTX_N_ORDERS 4 /* orders 5, 6, 7, 8 */
21
+ #define HCTX_MIN_ORDER 5
22
+ #define HCTX_MAX_ENTRIES (1 << 20) /* 1M entries per table; at ~528 bytes per entry that is ~0.5 GB per table, ~2 GB across all 4 tables */
23
+
24
+ /* Hash table entry: context hash → byte counts */
25
+ typedef struct {
26
+ uint64_t key;
27
+ uint16_t counts[HCTX_NSYM];
28
+ uint32_t total;
29
+ } HCtxEntry;
30
+
31
+ typedef struct {
32
+ HCtxEntry *entries;
33
+ uint32_t capacity;
34
+ uint32_t mask;
35
+ uint32_t used;
36
+ } HCtxTable;
37
+
38
+ static inline void hctx_table_init(HCtxTable *t, uint32_t cap) {
39
+ t->capacity = cap;
40
+ t->mask = cap - 1;
41
+ t->used = 0;
42
+ t->entries = (HCtxEntry *)calloc(cap, sizeof(HCtxEntry));
43
+ }
44
+
45
+ static inline void hctx_table_free(HCtxTable *t) {
46
+ free(t->entries);
47
+ t->entries = NULL;
48
+ }
49
+
50
+ static inline void hctx_table_grow(HCtxTable *t) {
51
+ uint32_t old_cap = t->capacity;
52
+ HCtxEntry *old = t->entries;
53
+ uint32_t new_cap = old_cap * 2;
54
+ t->entries = (HCtxEntry *)calloc(new_cap, sizeof(HCtxEntry));
55
+ t->capacity = new_cap;
56
+ t->mask = new_cap - 1;
57
+ t->used = 0;
58
+ for (uint32_t i = 0; i < old_cap; i++) {
59
+ if (old[i].key != 0) {
60
+ uint32_t idx = (uint32_t)(old[i].key & t->mask);
61
+ while (t->entries[idx].key != 0)
62
+ idx = (idx + 1) & t->mask;
63
+ t->entries[idx] = old[i];
64
+ t->used++;
65
+ }
66
+ }
67
+ free(old);
68
+ }
69
+
70
+ /* Find or create entry. Returns pointer to entry (NULL if full and create). */
71
+ static inline HCtxEntry *hctx_table_get(HCtxTable *t, uint64_t key, int create) {
72
+ if (create && t->used * 5 > t->capacity * 3) {
73
+ if (t->capacity < HCTX_MAX_ENTRIES)
74
+ hctx_table_grow(t);
75
+ else
76
+ create = 0; /* at max capacity, only look up existing */
77
+ }
78
+ uint32_t idx = (uint32_t)(key & t->mask);
79
+ for (;;) {
80
+ if (t->entries[idx].key == key)
81
+ return &t->entries[idx];
82
+ if (t->entries[idx].key == 0) {
83
+ if (!create) return NULL;
84
+ t->entries[idx].key = key;
85
+ memset(t->entries[idx].counts, 0, sizeof(t->entries[idx].counts));
86
+ t->entries[idx].total = 0;
87
+ t->used++;
88
+ return &t->entries[idx];
89
+ }
90
+ idx = (idx + 1) & t->mask;
91
+ }
92
+ }
93
+
94
/*
 * 64-bit FNV-1a hash of `len` context bytes.
 * A result of 0 is remapped to 1 because 0 is reserved for empty table slots.
 */
static inline uint64_t hctx_hash(const uint8_t *data, int len) {
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t hash = 14695981039346656037ULL; /* FNV offset basis */
    const uint8_t *p = data;
    int remaining = len;
    while (remaining-- > 0)
        hash = (hash ^ *p++) * fnv_prime;
    return hash != 0 ? hash : 1;
}
104
+
105
+ typedef struct {
106
+ HCtxTable tables[HCTX_N_ORDERS]; /* orders 5, 6, 7, 8 */
107
+ uint8_t *history;
108
+ int hist_len;
109
+ int hist_cap;
110
+ } HighCtxModel;
111
+
112
+ static inline void highctx_init(HighCtxModel *m) {
113
+ for (int i = 0; i < HCTX_N_ORDERS; i++)
114
+ hctx_table_init(&m->tables[i], 8192);
115
+ m->hist_cap = 4096;
116
+ m->hist_len = 0;
117
+ m->history = (uint8_t *)malloc(m->hist_cap);
118
+ }
119
+
120
+ static inline void highctx_free(HighCtxModel *m) {
121
+ for (int i = 0; i < HCTX_N_ORDERS; i++)
122
+ hctx_table_free(&m->tables[i]);
123
+ free(m->history);
124
+ m->history = NULL;
125
+ }
126
+
127
+ /*
128
+ * Predict: try highest order first (8, 7, 6, 5).
129
+ * Use the highest order that has a context with total >= min_count.
130
+ * Returns 1 if prediction available, fills probs[256] and *out_conf.
131
+ */
132
+ static inline int highctx_predict(HighCtxModel *m, double *probs, double *out_conf) {
133
+ int n = m->hist_len;
134
+
135
+ for (int oidx = HCTX_N_ORDERS - 1; oidx >= 0; oidx--) {
136
+ int order = HCTX_MIN_ORDER + oidx; /* 8, 7, 6, 5 */
137
+ if (n < order) continue;
138
+
139
+ uint64_t key = hctx_hash(m->history + n - order, order);
140
+ HCtxEntry *e = hctx_table_get(&m->tables[oidx], key, 0);
141
+ if (!e || e->total < 4) continue;
142
+
143
+ /* Build distribution: sparse smoothing to avoid zero probs */
144
+ double smooth = 1e-4;
145
+ double total_smooth = e->total + smooth * HCTX_NSYM;
146
+ double inv = 1.0 / total_smooth;
147
+ for (int s = 0; s < HCTX_NSYM; s++)
148
+ probs[s] = (e->counts[s] + smooth) * inv;
149
+
150
+ /* Confidence: ramps slowly, requires real data */
151
+ double count_conf = (e->total - 4.0) / (e->total + 8.0); /* 0 at total=4, ~0.7 at 20 */
152
+ if (count_conf < 0) count_conf = 0;
153
+ double order_factor = 0.4 + (order - HCTX_MIN_ORDER) * 0.1; /* 0.4 for o5, 0.7 for o8 */
154
+ *out_conf = count_conf * order_factor;
155
+
156
+ return 1;
157
+ }
158
+
159
+ *out_conf = 0.0;
160
+ return 0;
161
+ }
162
+
163
+ /*
164
+ * Update: increment counts for all available orders.
165
+ */
166
+ static inline void highctx_update(HighCtxModel *m, uint8_t byte) {
167
+ int n = m->hist_len;
168
+
169
+ for (int oidx = 0; oidx < HCTX_N_ORDERS; oidx++) {
170
+ int order = HCTX_MIN_ORDER + oidx;
171
+ if (n >= order) {
172
+ uint64_t key = hctx_hash(m->history + n - order, order);
173
+ HCtxEntry *e = hctx_table_get(&m->tables[oidx], key, 1);
174
+ if (e) {
175
+ e->counts[byte]++;
176
+ e->total++;
177
+ }
178
+ }
179
+ }
180
+
181
+ /* Append to history */
182
+ if (m->hist_len >= m->hist_cap) {
183
+ m->hist_cap *= 2;
184
+ m->history = (uint8_t *)realloc(m->history, m->hist_cap);
185
+ }
186
+ m->history[m->hist_len++] = byte;
187
+ }
188
+
189
+ /*
190
+ * Blend high-context prediction into existing probability distribution.
191
+ */
192
+ static inline void blend_highctx(double *probs, const double *hctx_probs,
193
+ double hctx_conf) {
194
+ if (hctx_conf < 0.01) return;
195
+ double weight = hctx_conf * 2.0;
196
+ if (weight > 0.60) weight = 0.60;
197
+ double sum = 0.0;
198
+ for (int i = 0; i < HCTX_NSYM; i++) {
199
+ probs[i] = probs[i] * (1.0 - weight) + hctx_probs[i] * weight;
200
+ if (probs[i] < 1e-10) probs[i] = 1e-10;
201
+ sum += probs[i];
202
+ }
203
+ double inv = 1.0 / sum;
204
+ for (int i = 0; i < HCTX_NSYM; i++)
205
+ probs[i] *= inv;
206
+ }
207
+
208
+ #endif /* HIGHCTX_H */
match.h ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef MATCH_H
2
+ #define MATCH_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ #define MATCH_NSYM 256
9
+ #define MATCH_N_CTX 5 /* context lengths: 4, 6, 8, 12, 16 */
10
+
11
/*
 * One slot of a match table: context hash -> position in the history.
 * A key of 0 marks an empty slot (match_ctx_hash() never returns 0).
 */
typedef struct {
    uint64_t key; /* context hash, 0 = empty slot */
    uint32_t pos; /* history index recorded for this context */
} MatchHTEntry;
16
+
17
+ typedef struct {
18
+ MatchHTEntry *entries;
19
+ uint32_t capacity;
20
+ uint32_t mask;
21
+ } MatchHT;
22
+
23
+ static inline void mht_init(MatchHT *t, uint32_t cap) {
24
+ t->capacity = cap;
25
+ t->mask = cap - 1;
26
+ t->entries = (MatchHTEntry *)calloc(cap, sizeof(MatchHTEntry));
27
+ }
28
+
29
+ static inline void mht_free(MatchHT *t) {
30
+ free(t->entries);
31
+ }
32
+
33
+ static inline void mht_grow(MatchHT *t) {
34
+ uint32_t old_cap = t->capacity;
35
+ MatchHTEntry *old = t->entries;
36
+ uint32_t new_cap = old_cap * 2;
37
+ t->entries = (MatchHTEntry *)calloc(new_cap, sizeof(MatchHTEntry));
38
+ t->capacity = new_cap;
39
+ t->mask = new_cap - 1;
40
+ for (uint32_t i = 0; i < old_cap; i++) {
41
+ if (old[i].key != 0) {
42
+ uint32_t idx = (uint32_t)(old[i].key & t->mask);
43
+ while (t->entries[idx].key != 0)
44
+ idx = (idx + 1) & t->mask;
45
+ t->entries[idx] = old[i];
46
+ }
47
+ }
48
+ free(old);
49
+ }
50
+
51
+ static inline void mht_set(MatchHT *t, uint64_t key, uint32_t pos,
52
+ uint32_t *used) {
53
+ if (*used * 5 > t->capacity * 3) mht_grow(t);
54
+ uint32_t idx = (uint32_t)(key & t->mask);
55
+ for (;;) {
56
+ if (t->entries[idx].key == key || t->entries[idx].key == 0) {
57
+ if (t->entries[idx].key == 0) (*used)++;
58
+ t->entries[idx].key = key;
59
+ t->entries[idx].pos = pos;
60
+ return;
61
+ }
62
+ idx = (idx + 1) & t->mask;
63
+ }
64
+ }
65
+
66
+ static inline int mht_get(MatchHT *t, uint64_t key, uint32_t *out_pos) {
67
+ uint32_t idx = (uint32_t)(key & t->mask);
68
+ for (;;) {
69
+ if (t->entries[idx].key == key) {
70
+ *out_pos = t->entries[idx].pos;
71
+ return 1;
72
+ }
73
+ if (t->entries[idx].key == 0) return 0;
74
+ idx = (idx + 1) & t->mask;
75
+ }
76
+ }
77
+
78
+ typedef struct {
79
+ int ctx_lens[MATCH_N_CTX];
80
+ MatchHT tables[MATCH_N_CTX];
81
+ uint32_t table_used[MATCH_N_CTX];
82
+ uint8_t *history;
83
+ int hist_len;
84
+ int hist_cap;
85
+
86
+ /* active match state */
87
+ int match_read_pos;
88
+ int match_active;
89
+ int match_streak;
90
+
91
+ /* adaptive accuracy */
92
+ double hits;
93
+ double total;
94
+ } MatchModel;
95
+
96
/*
 * 64-bit FNV-1a hash of `len` context bytes.
 * A result of 0 is remapped to 1 because 0 is reserved for empty table slots.
 */
static inline uint64_t match_ctx_hash(const uint8_t *data, int len) {
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t hash = 14695981039346656037ULL; /* FNV offset basis */
    const uint8_t *p = data;
    int remaining = len;
    while (remaining-- > 0)
        hash = (hash ^ *p++) * fnv_prime;
    return hash != 0 ? hash : 1;
}
105
+
106
+ static inline void match_init(MatchModel *m) {
107
+ m->ctx_lens[0] = 4;
108
+ m->ctx_lens[1] = 6;
109
+ m->ctx_lens[2] = 8;
110
+ m->ctx_lens[3] = 12;
111
+ m->ctx_lens[4] = 16;
112
+ for (int i = 0; i < MATCH_N_CTX; i++) {
113
+ mht_init(&m->tables[i], 4096);
114
+ m->table_used[i] = 0;
115
+ }
116
+ m->hist_cap = 4096;
117
+ m->hist_len = 0;
118
+ m->history = (uint8_t *)malloc(m->hist_cap);
119
+ m->match_read_pos = -1;
120
+ m->match_active = 0;
121
+ m->match_streak = 0;
122
+ m->hits = 1.0;
123
+ m->total = 2.0;
124
+ }
125
+
126
+ static inline void match_free(MatchModel *m) {
127
+ for (int i = 0; i < MATCH_N_CTX; i++)
128
+ mht_free(&m->tables[i]);
129
+ free(m->history);
130
+ m->history = NULL;
131
+ }
132
+
133
+ /*
134
+ * predict: returns predicted byte via *out_byte, confidence via *out_conf.
135
+ * Returns 1 if prediction available, 0 otherwise.
136
+ */
137
+ static inline int match_predict(MatchModel *m, int *out_byte, double *out_conf) {
138
+ /* 1. Continue active match */
139
+ if (m->match_active && m->match_read_pos >= 0
140
+ && m->match_read_pos < m->hist_len) {
141
+ *out_byte = m->history[m->match_read_pos];
142
+ double base = m->hits / m->total;
143
+ double conf = base * (0.65 + m->match_streak * 0.04);
144
+ if (conf > 0.96) conf = 0.96;
145
+ *out_conf = conf;
146
+ return 1;
147
+ }
148
+
149
+ m->match_active = 0;
150
+
151
+ /* 2. Try new match (longest context first) */
152
+ for (int idx = MATCH_N_CTX - 1; idx >= 0; idx--) {
153
+ int ctx_len = m->ctx_lens[idx];
154
+ int n = m->hist_len;
155
+ if (n < ctx_len) continue;
156
+
157
+ uint64_t key = match_ctx_hash(m->history + n - ctx_len, ctx_len);
158
+ uint32_t pos;
159
+ if (mht_get(&m->tables[idx], key, &pos) && pos < (uint32_t)n) {
160
+ *out_byte = m->history[pos];
161
+ m->match_active = 1;
162
+ m->match_read_pos = (int)pos;
163
+ m->match_streak = 0;
164
+ double base = m->hits / m->total;
165
+ double conf = base * (ctx_len / 6.0);
166
+ if (conf > base * 0.9) conf = base * 0.9;
167
+ *out_conf = conf;
168
+ return 1;
169
+ }
170
+ }
171
+
172
+ *out_byte = -1;
173
+ *out_conf = 0.0;
174
+ return 0;
175
+ }
176
+
177
+ static inline void match_update(MatchModel *m, uint8_t actual_byte) {
178
+ /* track accuracy of active match */
179
+ if (m->match_active && m->match_read_pos >= 0
180
+ && m->match_read_pos < m->hist_len) {
181
+ int predicted = m->history[m->match_read_pos];
182
+ m->total += 1.0;
183
+ if (predicted == actual_byte) {
184
+ m->hits += 1.0;
185
+ m->match_streak++;
186
+ m->match_read_pos++;
187
+ } else {
188
+ m->match_active = 0;
189
+ m->match_streak = 0;
190
+ }
191
+ if (m->total > 500.0) {
192
+ m->hits *= 0.99;
193
+ m->total *= 0.99;
194
+ }
195
+ }
196
+
197
+ /* store context → position */
198
+ int n = m->hist_len;
199
+ for (int tidx = 0; tidx < MATCH_N_CTX; tidx++) {
200
+ int ctx_len = m->ctx_lens[tidx];
201
+ if (n >= ctx_len) {
202
+ uint64_t key = match_ctx_hash(m->history + n - ctx_len, ctx_len);
203
+ mht_set(&m->tables[tidx], key, (uint32_t)n,
204
+ &m->table_used[tidx]);
205
+ }
206
+ }
207
+
208
+ /* append to history */
209
+ if (m->hist_len >= m->hist_cap) {
210
+ m->hist_cap *= 2;
211
+ m->history = (uint8_t *)realloc(m->history, m->hist_cap);
212
+ }
213
+ m->history[m->hist_len++] = actual_byte;
214
+ }
215
+
216
+ static inline void blend_match(double *probs, int match_byte,
217
+ double match_confidence) {
218
+ if (match_byte < 0) return;
219
+ double weight = match_confidence * 0.85;
220
+ if (weight > 0.95) weight = 0.95;
221
+ for (int i = 0; i < MATCH_NSYM; i++)
222
+ probs[i] *= (1.0 - weight);
223
+ probs[match_byte] += weight;
224
+ double sum = 0.0;
225
+ for (int i = 0; i < MATCH_NSYM; i++) {
226
+ if (probs[i] < 1e-10) probs[i] = 1e-10;
227
+ sum += probs[i];
228
+ }
229
+ double inv = 1.0 / sum;
230
+ for (int i = 0; i < MATCH_NSYM; i++)
231
+ probs[i] *= inv;
232
+ }
233
+
234
+ #endif /* MATCH_H */
mdc.c ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Midicoth Compressor — C implementation
3
+ * Pipeline: PPM + Tweedie Denoising + Match + Word + HighCtx
4
+ *
5
+ * Usage:
6
+ * ./mdc compress <input> <output>
7
+ * ./mdc decompress <input> <output>
8
+ */
9
+
10
+ #include <stdio.h>
11
+ #include <stdlib.h>
12
+ #include <string.h>
13
+ #include <math.h>
14
+ #include <time.h>
15
+
16
+ #include "fastmath.h"
17
+ #include "arith.h"
18
+ #include "ppm.h"
19
+ #include "tweedie.h"
20
+ #include "match.h"
21
+ #include "word.h"
22
+ #include "highctx.h"
23
+
24
+ #define MAGIC "MDC7"
25
+ #define SCALE (1 << 14)
26
+
27
+ /* ── Helpers ── */
28
+
29
+ static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs,
30
+ int64_t *out_total) {
31
+ cumfreqs[0] = 0;
32
+ for (int i = 0; i < 256; i++) {
33
+ int64_t f = (int64_t)(probs[i] * SCALE + 0.5);
34
+ if (f < 1) f = 1;
35
+ cumfreqs[i + 1] = cumfreqs[i] + f;
36
+ }
37
+ *out_total = cumfreqs[256];
38
+ }
39
+
40
/*
 * Floor each of the 256 probabilities at 1e-10, then rescale the array in
 * place so it sums to 1.
 */
static void clamp_normalize(double *probs) {
    const double floor_p = 1e-10;
    double *const end = probs + 256;
    double sum = 0.0;

    for (double *p = probs; p != end; p++) {
        if (*p < floor_p)
            *p = floor_p;
        sum += *p;
    }

    const double scale = 1.0 / sum;
    for (double *p = probs; p != end; p++)
        *p *= scale;
}
50
+
51
+ /* ── Compress ── */
52
+
53
+ static int do_compress(const char *input_path, const char *output_path) {
54
+ FILE *fin = fopen(input_path, "rb");
55
+ if (!fin) { perror(input_path); return 1; }
56
+ fseek(fin, 0, SEEK_END);
57
+ long file_size = ftell(fin);
58
+ fseek(fin, 0, SEEK_SET);
59
+ uint8_t *data = (uint8_t *)malloc(file_size);
60
+ if (fread(data, 1, file_size, fin) != (size_t)file_size) {
61
+ fprintf(stderr, "Read error\n"); fclose(fin); return 1;
62
+ }
63
+ fclose(fin);
64
+
65
+ uint64_t original_size = (uint64_t)file_size;
66
+ printf(" Input: %s (%lu bytes)\n", input_path, (unsigned long)original_size);
67
+
68
+ if (original_size == 0) {
69
+ FILE *fout = fopen(output_path, "wb");
70
+ fwrite(MAGIC, 1, 4, fout);
71
+ uint64_t zero = 0;
72
+ fwrite(&zero, 8, 1, fout);
73
+ fclose(fout);
74
+ printf(" Empty file -> 12 bytes\n");
75
+ free(data);
76
+ return 0;
77
+ }
78
+
79
+ PPMModel ppm; ppm_init(&ppm);
80
+ MatchModel match; match_init(&match);
81
+ WordModel word; word_init(&word);
82
+ HighCtxModel hctx; highctx_init(&hctx);
83
+ ArithEncoder enc; ae_init(&enc);
84
+ TweedieDenoiser twd; tweedie_init(&twd);
85
+
86
+ double probs[256], word_probs[256], hctx_probs[256];
87
+ int64_t cumfreqs[257];
88
+ int64_t total;
89
+
90
+ struct timespec t0, t1;
91
+ clock_gettime(CLOCK_MONOTONIC, &t0);
92
+
93
+ for (uint64_t i = 0; i < original_size; i++) {
94
+ uint8_t byte = data[i];
95
+
96
+ double confidence;
97
+ int order;
98
+ ppm_predict(&ppm, probs, &confidence, &order);
99
+
100
+ tweedie_denoise(&twd, probs, order, confidence);
101
+ clamp_normalize(probs);
102
+
103
+ int match_byte;
104
+ double match_conf;
105
+ match_predict(&match, &match_byte, &match_conf);
106
+ blend_match(probs, match_byte, match_conf);
107
+
108
+ double w_conf;
109
+ if (word_predict_cached(&word, word_probs, &w_conf))
110
+ blend_word_model(probs, word_probs, w_conf);
111
+
112
+ double hctx_conf;
113
+ if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
114
+ blend_highctx(probs, hctx_probs, hctx_conf);
115
+
116
+ probs_to_cumfreqs(probs, cumfreqs, &total);
117
+ ae_encode(&enc, cumfreqs, byte, total);
118
+
119
+ tweedie_update(&twd, byte);
120
+ match_update(&match, byte);
121
+ word_update(&word, byte);
122
+ highctx_update(&hctx, byte);
123
+ ppm_update(&ppm, byte);
124
+
125
+ if ((i + 1) % 50000 == 0) {
126
+ clock_gettime(CLOCK_MONOTONIC, &t1);
127
+ double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
128
+ double pct = (i + 1) * 100.0 / original_size;
129
+ double speed = (i + 1) / elapsed;
130
+ fprintf(stderr, "\r %5.1f%% (%lu/%lu) %.0f B/s",
131
+ pct, (unsigned long)(i + 1), (unsigned long)original_size, speed);
132
+ }
133
+ }
134
+
135
+ ae_finish(&enc);
136
+
137
+ clock_gettime(CLOCK_MONOTONIC, &t1);
138
+ double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
139
+ fprintf(stderr, "\r \r");
140
+
141
+ FILE *fout = fopen(output_path, "wb");
142
+ if (!fout) { perror(output_path); return 1; }
143
+ fwrite(MAGIC, 1, 4, fout);
144
+ fwrite(&original_size, 8, 1, fout);
145
+ fwrite(enc.buf, 1, enc.buf_len, fout);
146
+ fclose(fout);
147
+
148
+ uint64_t total_size = 4 + 8 + enc.buf_len;
149
+ double ratio = (double)total_size / original_size;
150
+ printf(" Output: %s (%lu bytes)\n", output_path, (unsigned long)total_size);
151
+ printf(" Ratio: %.4f (%.2f%%)\n", ratio, ratio * 100.0);
152
+ printf(" Time: %.1fs (%.0f B/s)\n", elapsed, original_size / elapsed);
153
+
154
+ ae_free(&enc);
155
+ ppm_free(&ppm);
156
+ match_free(&match);
157
+ word_free(&word);
158
+ highctx_free(&hctx);
159
+ free(data);
160
+ return 0;
161
+ }
162
+
163
+ /* ── Decompress ── */
164
+
165
+ static int do_decompress(const char *input_path, const char *output_path) {
166
+ FILE *fin = fopen(input_path, "rb");
167
+ if (!fin) { perror(input_path); return 1; }
168
+
169
+ char magic[4];
170
+ if (fread(magic, 1, 4, fin) != 4 || memcmp(magic, MAGIC, 4) != 0) {
171
+ fprintf(stderr, "Error: not a MDC7 file\n");
172
+ fclose(fin);
173
+ return 1;
174
+ }
175
+
176
+ uint64_t original_size;
177
+ if (fread(&original_size, 8, 1, fin) != 1) {
178
+ fprintf(stderr, "Read error\n"); fclose(fin); return 1;
179
+ }
180
+
181
+ fseek(fin, 0, SEEK_END);
182
+ long fsize = ftell(fin);
183
+ fseek(fin, 12, SEEK_SET);
184
+ size_t comp_len = (size_t)(fsize - 12);
185
+ uint8_t *compressed = (uint8_t *)malloc(comp_len);
186
+ if (fread(compressed, 1, comp_len, fin) != comp_len) {
187
+ fprintf(stderr, "Read error\n"); fclose(fin); return 1;
188
+ }
189
+ fclose(fin);
190
+
191
+ printf(" Input: %s (%ld bytes)\n", input_path, fsize);
192
+ printf(" Original size: %lu bytes\n", (unsigned long)original_size);
193
+
194
+ if (original_size == 0) {
195
+ FILE *fout = fopen(output_path, "wb");
196
+ fclose(fout);
197
+ printf(" Empty file\n");
198
+ free(compressed);
199
+ return 0;
200
+ }
201
+
202
+ PPMModel ppm; ppm_init(&ppm);
203
+ MatchModel match; match_init(&match);
204
+ WordModel word; word_init(&word);
205
+ HighCtxModel hctx; highctx_init(&hctx);
206
+ ArithDecoder dec; ad_init(&dec, compressed, comp_len);
207
+ TweedieDenoiser twd; tweedie_init(&twd);
208
+
209
+ uint8_t *result = (uint8_t *)malloc(original_size);
210
+
211
+ double probs[256], word_probs[256], hctx_probs[256];
212
+ int64_t cumfreqs[257];
213
+ int64_t total;
214
+
215
+ struct timespec t0, t1;
216
+ clock_gettime(CLOCK_MONOTONIC, &t0);
217
+
218
+ for (uint64_t i = 0; i < original_size; i++) {
219
+ double confidence;
220
+ int order;
221
+ ppm_predict(&ppm, probs, &confidence, &order);
222
+
223
+ tweedie_denoise(&twd, probs, order, confidence);
224
+ clamp_normalize(probs);
225
+
226
+ int match_byte;
227
+ double match_conf;
228
+ match_predict(&match, &match_byte, &match_conf);
229
+ blend_match(probs, match_byte, match_conf);
230
+
231
+ double w_conf;
232
+ if (word_predict_cached(&word, word_probs, &w_conf))
233
+ blend_word_model(probs, word_probs, w_conf);
234
+
235
+ double hctx_conf;
236
+ if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
237
+ blend_highctx(probs, hctx_probs, hctx_conf);
238
+
239
+ probs_to_cumfreqs(probs, cumfreqs, &total);
240
+ int sym = ad_decode(&dec, cumfreqs, total);
241
+ result[i] = (uint8_t)sym;
242
+
243
+ tweedie_update(&twd, (uint8_t)sym);
244
+ match_update(&match, (uint8_t)sym);
245
+ word_update(&word, (uint8_t)sym);
246
+ highctx_update(&hctx, (uint8_t)sym);
247
+ ppm_update(&ppm, (uint8_t)sym);
248
+
249
+ if ((i + 1) % 50000 == 0) {
250
+ clock_gettime(CLOCK_MONOTONIC, &t1);
251
+ double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
252
+ double pct = (i + 1) * 100.0 / original_size;
253
+ double speed = (i + 1) / elapsed;
254
+ fprintf(stderr, "\r %5.1f%% (%lu/%lu) %.0f B/s",
255
+ pct, (unsigned long)(i + 1), (unsigned long)original_size, speed);
256
+ }
257
+ }
258
+
259
+ clock_gettime(CLOCK_MONOTONIC, &t1);
260
+ double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
261
+ fprintf(stderr, "\r \r");
262
+
263
+ FILE *fout = fopen(output_path, "wb");
264
+ fwrite(result, 1, original_size, fout);
265
+ fclose(fout);
266
+
267
+ printf(" Output: %s (%lu bytes)\n", output_path, (unsigned long)original_size);
268
+ printf(" Time: %.1fs (%.0f B/s)\n", elapsed, original_size / elapsed);
269
+
270
+ ppm_free(&ppm);
271
+ match_free(&match);
272
+ word_free(&word);
273
+ highctx_free(&hctx);
274
+ free(compressed);
275
+ free(result);
276
+ return 0;
277
+ }
278
+
279
+ /* ── Main ── */
280
+
281
/*
 * Entry point: `<prog> compress|decompress <input> <output>`.
 * Returns the status of the selected operation, or 1 on a usage error.
 */
int main(int argc, char **argv) {
    if (argc != 4) {
        fprintf(stderr, "Usage: %s compress|decompress <input> <output>\n", argv[0]);
        return 1;
    }

    const char *command = argv[1];
    const char *input = argv[2];
    const char *output = argv[3];

    if (strcmp(command, "compress") == 0)
        return do_compress(input, output);
    if (strcmp(command, "decompress") == 0)
        return do_decompress(input, output);

    fprintf(stderr, "Unknown command: %s\n", command);
    return 1;
}
ppm.h ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef PPM_H
2
+ #define PPM_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+ #include <math.h>
8
+
9
+ #define PPM_MAX_ORDER 4
10
+ #define PPM_NSYM 256
11
+ #define PPM_PRIOR 0.5
12
+
13
+ /*
14
+ * Hash table entry: maps a 64-bit context hash to a count array.
15
+ * counts[i] stores the (float) count for symbol i.
16
+ * total caches sum(counts).
17
+ * key == 0 means empty slot.
18
+ */
19
+ typedef struct {
20
+ uint64_t key; /* context hash (0 = empty) */
21
+ double counts[PPM_NSYM];
22
+ double total;
23
+ } PPMEntry;
24
+
25
+ typedef struct {
26
+ PPMEntry *entries;
27
+ uint32_t capacity; /* power of 2 */
28
+ uint32_t used;
29
+ } PPMTable;
30
+
31
+ typedef struct {
32
+ PPMTable tables[PPM_MAX_ORDER + 1]; /* order 0..4 */
33
+ uint8_t *history;
34
+ int hist_len;
35
+ int hist_cap;
36
+ } PPMModel;
37
+
38
+ /* ── Hash helper ── */
39
+
40
/*
 * Hash a context of `len` bytes to a non-zero 64-bit key.
 *
 * The empty (order-0) context always maps to 1. Longer contexts use
 * FNV-1a; a result of 0 is remapped to 1 because 0 marks an empty slot.
 */
static inline uint64_t ppm_hash_context(const uint8_t *ctx, int len) {
    if (len == 0)
        return 1; /* fixed key for the order-0 context */

    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t hash = 14695981039346656037ULL; /* FNV offset basis */
    const uint8_t *p = ctx;
    int remaining = len;
    while (remaining-- > 0)
        hash = (hash ^ *p++) * fnv_prime;

    return hash != 0 ? hash : 1;
}
52
+
53
+ /* ── Table operations ── */
54
+
55
+ static inline void ppm_table_init(PPMTable *t, uint32_t capacity) {
56
+ t->capacity = capacity;
57
+ t->used = 0;
58
+ t->entries = (PPMEntry *)calloc(capacity, sizeof(PPMEntry));
59
+ }
60
+
61
+ static inline void ppm_table_free(PPMTable *t) {
62
+ free(t->entries);
63
+ t->entries = NULL;
64
+ }
65
+
66
+ static inline void ppm_table_grow(PPMTable *t);
67
+
68
+ static inline PPMEntry *ppm_table_find(PPMTable *t, uint64_t key) {
69
+ uint32_t mask = t->capacity - 1;
70
+ uint32_t idx = (uint32_t)(key & mask);
71
+ for (;;) {
72
+ PPMEntry *e = &t->entries[idx];
73
+ if (e->key == key) return e;
74
+ if (e->key == 0) return NULL;
75
+ idx = (idx + 1) & mask;
76
+ }
77
+ }
78
+
79
+ static inline PPMEntry *ppm_table_insert(PPMTable *t, uint64_t key) {
80
+ /* Grow if > 60% full */
81
+ if (t->used * 5 > t->capacity * 3) {
82
+ ppm_table_grow(t);
83
+ }
84
+ uint32_t mask = t->capacity - 1;
85
+ uint32_t idx = (uint32_t)(key & mask);
86
+ for (;;) {
87
+ PPMEntry *e = &t->entries[idx];
88
+ if (e->key == key) return e; /* already exists */
89
+ if (e->key == 0) {
90
+ /* init new entry with prior */
91
+ e->key = key;
92
+ for (int i = 0; i < PPM_NSYM; i++)
93
+ e->counts[i] = PPM_PRIOR;
94
+ e->total = PPM_NSYM * PPM_PRIOR;
95
+ t->used++;
96
+ return e;
97
+ }
98
+ idx = (idx + 1) & mask;
99
+ }
100
+ }
101
+
102
+ static inline void ppm_table_grow(PPMTable *t) {
103
+ uint32_t old_cap = t->capacity;
104
+ PPMEntry *old = t->entries;
105
+ uint32_t new_cap = old_cap * 2;
106
+ t->entries = (PPMEntry *)calloc(new_cap, sizeof(PPMEntry));
107
+ t->capacity = new_cap;
108
+ t->used = 0;
109
+ for (uint32_t i = 0; i < old_cap; i++) {
110
+ if (old[i].key != 0) {
111
+ /* re-insert */
112
+ PPMEntry *ne = ppm_table_insert(t, old[i].key);
113
+ memcpy(ne->counts, old[i].counts, sizeof(old[i].counts));
114
+ ne->total = old[i].total;
115
+ }
116
+ }
117
+ free(old);
118
+ }
119
+
120
+ /* ── PPM Model ── */
121
+
122
+ static inline void ppm_init(PPMModel *m) {
123
+ for (int o = 0; o <= PPM_MAX_ORDER; o++)
124
+ ppm_table_init(&m->tables[o], 1024);
125
+ m->hist_cap = 4096;
126
+ m->hist_len = 0;
127
+ m->history = (uint8_t *)malloc(m->hist_cap);
128
+ }
129
+
130
+ static inline void ppm_free(PPMModel *m) {
131
+ for (int o = 0; o <= PPM_MAX_ORDER; o++)
132
+ ppm_table_free(&m->tables[o]);
133
+ free(m->history);
134
+ m->history = NULL;
135
+ }
136
+
137
+ /*
138
+ * predict_with_confidence: fills probs[256] and returns confidence + order.
139
+ * Matches Python: fallback from max_order down to 0, first context with total > 1.
140
+ * If nothing found, returns uniform.
141
+ */
142
+ static inline void ppm_predict(PPMModel *m, double *probs,
143
+ double *out_confidence, int *out_order) {
144
+ for (int order = PPM_MAX_ORDER; order >= 0; order--) {
145
+ const uint8_t *ctx_start;
146
+ int ctx_len = order;
147
+
148
+ if (ctx_len > m->hist_len) continue;
149
+ ctx_start = m->history + m->hist_len - ctx_len;
150
+
151
+ uint64_t key = ppm_hash_context(ctx_start, ctx_len);
152
+ PPMEntry *e = ppm_table_find(&m->tables[order], key);
153
+ if (e == NULL) continue;
154
+ if (e->total <= 1.0) continue;
155
+
156
+ double inv_total = 1.0 / e->total;
157
+ for (int i = 0; i < PPM_NSYM; i++)
158
+ probs[i] = e->counts[i] * inv_total;
159
+
160
+ *out_confidence = e->total;
161
+ *out_order = order;
162
+ return;
163
+ }
164
+
165
+ /* uniform fallback */
166
+ double u = 1.0 / 256.0;
167
+ for (int i = 0; i < PPM_NSYM; i++)
168
+ probs[i] = u;
169
+ *out_confidence = 0.0;
170
+ *out_order = -1;
171
+ }
172
+
173
+ /*
174
+ * update: add symbol count to all orders (0..4) where context is available.
175
+ * Then append symbol to history.
176
+ */
177
+ static inline void ppm_update(PPMModel *m, uint8_t symbol) {
178
+ for (int order = 0; order <= PPM_MAX_ORDER; order++) {
179
+ int ctx_len = order;
180
+ if (ctx_len > m->hist_len) continue;
181
+
182
+ const uint8_t *ctx_start = m->history + m->hist_len - ctx_len;
183
+ uint64_t key = ppm_hash_context(ctx_start, ctx_len);
184
+
185
+ PPMEntry *e = ppm_table_insert(&m->tables[order], key);
186
+ e->counts[symbol] += 1.0;
187
+ e->total += 1.0;
188
+ }
189
+
190
+ /* append to history */
191
+ if (m->hist_len >= m->hist_cap) {
192
+ m->hist_cap *= 2;
193
+ m->history = (uint8_t *)realloc(m->history, m->hist_cap);
194
+ }
195
+ m->history[m->hist_len++] = symbol;
196
+ }
197
+
198
+ #endif /* PPM_H */
tweedie.h ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef TWEEDIE_H
2
+ #define TWEEDIE_H
3
+
4
+ /*
5
+ * Binary Tree Tweedie Denoiser — score-based reverse diffusion.
6
+ *
7
+ * Forward process (PPM Jeffreys prior):
8
+ * p̂(s) = (n·q(s) + 0.5) / (n + 128) = (1-γ)q(s) + γ·u(s)
9
+ * where γ = 128/(n+128) is the noise level, u = 1/256 uniform.
10
+ *
11
+ * Tweedie's formula gives the optimal denoiser:
12
+ * θ̂ = p̂ + σ² · s(p̂)
13
+ * where s(p̂) = ∇ log m(p̂) is the score of the marginal density.
14
+ *
15
+ * The score is estimated empirically via calibration tables that track
16
+ * the additive correction δ = E[θ|p̂] - E[p̂] = hit_rate - avg_pred.
17
+ * This δ equals σ²·s(p̂) — the full Tweedie correction term.
18
+ *
19
+ * Binary tree decomposition: 256-way → 8 binary decisions (MSB to LSB).
20
+ * Multi-step: K=3 denoising steps with independent score tables.
21
+ * Calibration context: (step, bit_context, order, shape, confidence, prob_bin)
22
+ */
23
+
24
+ #include <stdint.h>
25
+ #include <string.h>
26
+ #include <math.h>
27
+ #include "fastmath.h"
28
+
29
+ #define TWD_NSYM 256
30
+
31
+ /* Number of reverse diffusion steps */
32
+ #define TWD_STEPS 3
33
+
34
+ /* Binary tree: 8 levels for 256 symbols */
35
+ #define TWD_N_LEVELS 8
36
+
37
+ /* 255 internal nodes: 1 + 2 + 4 + ... + 128 */
38
+ #define TWD_N_NODES 255
39
+
40
+ /* Bit context: encodes level + parent bit values. 27 total. */
41
+ #define TWD_N_BCTX 27
42
+
43
+ /* Calibration dimensions */
44
+ #define TWD_N_ORD 3 /* order groups: {-1,0,1}, {2,3}, {4+} */
45
+ #define TWD_N_SHAPE 4 /* distribution shape bins by max_p */
46
+ #define TWD_N_CONF 8 /* confidence bins (log-spaced) */
47
+ #define TWD_N_PROB 20 /* binary probability bins (logit-spaced) */
48
+
49
+ /* Smoothing pseudo-observations per bucket */
50
+ #define TWD_PRIOR_WEIGHT 32.0
51
+
52
+ /* Logit range for binary probability mapping */
53
+ #define TWD_LOGIT_RANGE 8.0
54
+
55
/* One calibration bucket: running sums for a group of binary decisions. */
typedef struct {
    double sum_pred;   /* accumulated predicted P(right) */
    double hits;       /* number of times the true symbol went right */
    double total;      /* number of observations (incl. prior weight) */
} TwdCalibEntry;
60
+
61
+ typedef struct {
62
+ /* Calibration table: [step][bctx][order][shape][conf][prob_bin]
63
+ * Total entries: 3 × 27 × 3 × 4 × 8 × 20 = 155,520
64
+ * Memory: 155,520 × 24 bytes = 3.6 MB */
65
+ TwdCalibEntry table[TWD_STEPS][TWD_N_BCTX][TWD_N_ORD][TWD_N_SHAPE][TWD_N_CONF][TWD_N_PROB];
66
+
67
+ /* Cached from denoise, reused by update */
68
+ double cached_p_right[TWD_STEPS][TWD_N_NODES];
69
+ int cached_prob_bin[TWD_STEPS][TWD_N_NODES];
70
+ int cached_bctx[TWD_STEPS][TWD_N_NODES];
71
+ int cached_ord;
72
+ int cached_shape;
73
+ int cached_conf;
74
+ } TweedieDenoiser;
75
+
76
+ /* ── Bucket mapping functions ── */
77
+
78
/* Collapse a PPM order into a group: <=1 -> 0, 2..3 -> 1, anything higher -> 2. */
static inline int twd_order_group(int ppm_order) {
    return (ppm_order > 3) ? 2
         : (ppm_order > 1) ? 1
         : 0;
}
83
+
84
/*
 * Classify a distribution by its largest probability:
 * 0 = very flat (< 0.05), 1 = moderately flat (< 0.15),
 * 2 = moderate peak (< 0.40), 3 = peaked (everything else).
 */
static inline int twd_shape_bin(double max_p) {
    static const double upper_edge[3] = {0.05, 0.15, 0.40};

    for (int bin = 0; bin < 3; bin++) {
        if (max_p < upper_edge[bin])
            return bin;
    }
    return 3;
}
90
+
91
+ static inline int twd_conf_bin(double confidence) {
92
+ if (confidence < 4.0) return 0;
93
+ int bin = (int)(fast_log(confidence) * (1.0 / 1.3862943611198906));
94
+ if (bin < 0) bin = 0;
95
+ if (bin > TWD_N_CONF - 1) bin = TWD_N_CONF - 1;
96
+ return bin;
97
+ }
98
+
99
+ /* Binary probability bin: logit-spaced in [-8, 8]. */
100
+ static inline int twd_prob_bin(double p) {
101
+ if (p < 1e-8) p = 1e-8;
102
+ if (p > 1.0 - 1e-8) p = 1.0 - 1e-8;
103
+ double logit = fast_log(p / (1.0 - p));
104
+ int bin = (int)((logit + TWD_LOGIT_RANGE) / (2.0 * TWD_LOGIT_RANGE) * TWD_N_PROB);
105
+ if (bin < 0) bin = 0;
106
+ if (bin > TWD_N_PROB - 1) bin = TWD_N_PROB - 1;
107
+ return bin;
108
+ }
109
+
110
+ /* Bin center for prior initialization */
111
+ static inline double twd_bin_center(int bin) {
112
+ double logit = ((bin + 0.5) / TWD_N_PROB) * 2.0 * TWD_LOGIT_RANGE - TWD_LOGIT_RANGE;
113
+ return 1.0 / (1.0 + fast_exp(-logit));
114
+ }
115
+
116
/*
 * Bit context ID (0..26) for a tree node given its level and its index
 * within that level.
 *   level 0    -> 0
 *   level 1    -> 1..2   (one per node)
 *   level 2    -> 3..6   (one per node)
 *   levels 3-7 -> 4 IDs per level; nodes are spread over them with a
 *                 multiplicative hash whose top two bits pick the group.
 */
static inline int twd_bit_context(int level, int node_at_level) {
    switch (level) {
    case 0:
        return 0;
    case 1:
        return 1 + node_at_level;
    case 2:
        return 3 + node_at_level;
    default:
        break;
    }

    /* unsigned multiply wraps; the shift keeps the top two bits (0..3) */
    int group = (node_at_level * 2654435761U) >> 30;
    return 7 + (level - 3) * 4 + group;
}
125
+
126
+ /* ── Initialization ── */
127
+
128
+ static inline void tweedie_init(TweedieDenoiser *td) {
129
+ memset(td, 0, sizeof(*td));
130
+
131
+ for (int t = 0; t < TWD_STEPS; t++)
132
+ for (int b = 0; b < TWD_N_BCTX; b++)
133
+ for (int o = 0; o < TWD_N_ORD; o++)
134
+ for (int s = 0; s < TWD_N_SHAPE; s++)
135
+ for (int c = 0; c < TWD_N_CONF; c++)
136
+ for (int p = 0; p < TWD_N_PROB; p++) {
137
+ double center = twd_bin_center(p);
138
+ td->table[t][b][o][s][c][p].sum_pred = center * TWD_PRIOR_WEIGHT;
139
+ td->table[t][b][o][s][c][p].hits = center * TWD_PRIOR_WEIGHT;
140
+ td->table[t][b][o][s][c][p].total = TWD_PRIOR_WEIGHT;
141
+ }
142
+ }
143
+
144
+ /* ── Denoise: multi-step Tweedie reverse diffusion ──
145
+ *
146
+ * Additive Tweedie correction: p' = p + δ
147
+ * where δ = hits/total - sum_pred/total estimates the Tweedie term σ²·s(p̂).
148
+ *
149
+ * This is the nonparametric Tweedie estimator: within each calibration bin,
150
+ * the empirical hit rate is the posterior mean E[θ|p̂], and the additive
151
+ * correction δ = E[θ|p̂] - E[p̂] equals σ²·∇log m(p̂). */
152
+
153
+ static inline void tweedie_denoise(TweedieDenoiser *td, double *probs,
154
+ int ppm_order, double confidence) {
155
+ int og = twd_order_group(ppm_order);
156
+ int cb = twd_conf_bin(confidence);
157
+
158
+ /* Shape from the 256-way distribution (before any correction) */
159
+ double max_p = 0.0;
160
+ for (int i = 0; i < TWD_NSYM; i++)
161
+ if (probs[i] > max_p) max_p = probs[i];
162
+ int sb = twd_shape_bin(max_p);
163
+
164
+ td->cached_ord = og;
165
+ td->cached_shape = sb;
166
+ td->cached_conf = cb;
167
+
168
+ double stree[512];
169
+ double scale[512];
170
+
171
+ for (int step = 0; step < TWD_STEPS; step++) {
172
+
173
+ /* 1. Build sum tree bottom-up */
174
+ for (int i = 0; i < TWD_NSYM; i++)
175
+ stree[TWD_NSYM + i] = probs[i];
176
+ for (int i = TWD_NSYM - 1; i >= 1; i--)
177
+ stree[i] = stree[2 * i] + stree[2 * i + 1];
178
+
179
+ /* 2. Process all nodes: compute P(right), apply Tweedie correction */
180
+ scale[1] = 1.0;
181
+
182
+ for (int level = 0; level < TWD_N_LEVELS; level++) {
183
+ int level_start = 1 << level;
184
+ int level_end = 1 << (level + 1);
185
+
186
+ for (int ni = level_start; ni < level_end; ni++) {
187
+ double node_total = stree[ni];
188
+ int node_id = ni - 1;
189
+ int node_at_level = ni - level_start;
190
+
191
+ if (node_total < 1e-15) {
192
+ scale[2 * ni] = scale[ni];
193
+ scale[2 * ni + 1] = scale[ni];
194
+ td->cached_p_right[step][node_id] = 0.5;
195
+ td->cached_prob_bin[step][node_id] = twd_prob_bin(0.5);
196
+ td->cached_bctx[step][node_id] = twd_bit_context(level, node_at_level);
197
+ continue;
198
+ }
199
+
200
+ double sum_right = stree[2 * ni + 1];
201
+ double p_right = sum_right / node_total;
202
+ if (p_right < 1e-8) p_right = 1e-8;
203
+ if (p_right > 1.0 - 1e-8) p_right = 1.0 - 1e-8;
204
+
205
+ int bctx = twd_bit_context(level, node_at_level);
206
+ int pbin = twd_prob_bin(p_right);
207
+ td->cached_p_right[step][node_id] = p_right;
208
+ td->cached_prob_bin[step][node_id] = pbin;
209
+ td->cached_bctx[step][node_id] = bctx;
210
+
211
+ /* Tweedie additive correction: δ = E[θ|p̂] - E[p̂] */
212
+ TwdCalibEntry *e = &td->table[step][bctx][og][sb][cb][pbin];
213
+ double avg_pred = e->sum_pred / e->total;
214
+ double emp_rate = e->hits / e->total;
215
+ double delta = emp_rate - avg_pred;
216
+
217
+ double p_right_corr = p_right + delta;
218
+ if (p_right_corr < 1e-8) p_right_corr = 1e-8;
219
+ if (p_right_corr > 1.0 - 1e-8) p_right_corr = 1.0 - 1e-8;
220
+
221
+ double sl = (1.0 - p_right_corr) / (1.0 - p_right);
222
+ double sr = p_right_corr / p_right;
223
+ scale[2 * ni] = scale[ni] * sl;
224
+ scale[2 * ni + 1] = scale[ni] * sr;
225
+ }
226
+ }
227
+
228
+ /* 3. Apply accumulated leaf scales */
229
+ for (int i = 0; i < TWD_NSYM; i++)
230
+ probs[i] *= scale[TWD_NSYM + i];
231
+
232
+ /* 4. Renormalize */
233
+ double sum = 0.0;
234
+ for (int i = 0; i < TWD_NSYM; i++) {
235
+ if (probs[i] < 1e-10) probs[i] = 1e-10;
236
+ sum += probs[i];
237
+ }
238
+ double inv = 1.0 / sum;
239
+ for (int i = 0; i < TWD_NSYM; i++)
240
+ probs[i] *= inv;
241
+
242
+ /* Recompute shape after correction for next step */
243
+ max_p = 0.0;
244
+ for (int i = 0; i < TWD_NSYM; i++)
245
+ if (probs[i] > max_p) max_p = probs[i];
246
+ sb = twd_shape_bin(max_p);
247
+ }
248
+ }
249
+
250
+ /* ── Update ── */
251
+
252
+ static inline void tweedie_update(TweedieDenoiser *td, uint8_t true_symbol) {
253
+ int og = td->cached_ord;
254
+ int sb = td->cached_shape;
255
+ int cb = td->cached_conf;
256
+
257
+ for (int step = 0; step < TWD_STEPS; step++) {
258
+ for (int level = 0; level < TWD_N_LEVELS; level++) {
259
+ int block_size = TWD_NSYM >> level;
260
+ int half = block_size >> 1;
261
+
262
+ int node_at_level = true_symbol / block_size;
263
+ int start = node_at_level * block_size;
264
+ int mid = start + half;
265
+ int went_right = (true_symbol >= mid) ? 1 : 0;
266
+
267
+ int node_id = (1 << level) - 1 + node_at_level;
268
+ int pbin = td->cached_prob_bin[step][node_id];
269
+ int bctx = td->cached_bctx[step][node_id];
270
+
271
+ TwdCalibEntry *e = &td->table[step][bctx][og][sb][cb][pbin];
272
+ e->sum_pred += td->cached_p_right[step][node_id];
273
+ e->total += 1.0;
274
+ if (went_right)
275
+ e->hits += 1.0;
276
+ }
277
+ }
278
+ }
279
+
280
+ #endif /* TWEEDIE_H */
word.h ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef WORD_H
2
+ #define WORD_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+
8
+ #define WORD_NSYM 256
9
+
10
/* ── Word character set ──
 * Returns 1 for ASCII letters, digits, apostrophe and hyphen; 0 otherwise. */
static inline int is_word_char(int c) {
    int lower  = (c >= 'a' && c <= 'z');
    int upper  = (c >= 'A' && c <= 'Z');
    int digit  = (c >= '0' && c <= '9');
    int joiner = (c == '\'' || c == '-');
    return (lower || upper || digit || joiner) ? 1 : 0;
}
18
+
19
/* ── Trie node ──
 * One node per byte of a stored word prefix. */
typedef struct TrieNode {
    /* Observed next bytes after this prefix, as parallel arrays:
     * cont_keys[i] is the byte, cont_vals[i] how often it followed.
     * Only the first cont_count slots are in use (at most 64). */
    int cont_keys[64];
    int cont_vals[64];
    int cont_count;

    /* Child node for each possible next byte; NULL when absent. */
    struct TrieNode *children[256];
} TrieNode;
29
+
30
+ static inline TrieNode *trie_new(void) {
31
+ TrieNode *n = (TrieNode *)calloc(1, sizeof(TrieNode));
32
+ return n;
33
+ }
34
+
35
+ static inline void trie_free(TrieNode *n) {
36
+ if (!n) return;
37
+ for (int i = 0; i < 256; i++)
38
+ trie_free(n->children[i]);
39
+ free(n);
40
+ }
41
+
42
+ static inline void trie_add_cont(TrieNode *n, int byte_val) {
43
+ for (int i = 0; i < n->cont_count; i++) {
44
+ if (n->cont_keys[i] == byte_val) {
45
+ n->cont_vals[i]++;
46
+ return;
47
+ }
48
+ }
49
+ if (n->cont_count < 64) {
50
+ n->cont_keys[n->cont_count] = byte_val;
51
+ n->cont_vals[n->cont_count] = 1;
52
+ n->cont_count++;
53
+ }
54
+ }
55
+
56
/* ── Word counts hash table ──
 * One slot of the open-addressing table; key 0 marks an empty slot. */
typedef struct {
    uint64_t key;    /* word hash (never 0 for a stored word) */
    int      count;  /* accumulated count for that hash */
} WordCountEntry;
61
+
62
+ typedef struct {
63
+ WordCountEntry *entries;
64
+ uint32_t capacity;
65
+ uint32_t mask;
66
+ uint32_t used;
67
+ } WordCountHT;
68
+
69
+ static inline void wcht_init(WordCountHT *t, uint32_t cap) {
70
+ t->capacity = cap;
71
+ t->mask = cap - 1;
72
+ t->used = 0;
73
+ t->entries = (WordCountEntry *)calloc(cap, sizeof(WordCountEntry));
74
+ }
75
+
76
+ static inline void wcht_free(WordCountHT *t) {
77
+ free(t->entries);
78
+ }
79
+
80
/*
 * 64-bit FNV-1a hash of the first `len` bytes of w.
 * A result of 0 is remapped to 1 because the hash tables use key 0 to
 * mark an empty slot.
 */
static inline uint64_t word_hash(const uint8_t *w, int len) {
    const uint64_t fnv_offset = 14695981039346656037ULL;
    const uint64_t fnv_prime  = 1099511628211ULL;

    uint64_t h = fnv_offset;
    const uint8_t *cursor = w;
    while (len-- > 0) {
        h ^= *cursor++;
        h *= fnv_prime;
    }
    return (h != 0) ? h : 1;
}
89
+
90
+ static inline void wcht_grow(WordCountHT *t) {
91
+ uint32_t old_cap = t->capacity;
92
+ WordCountEntry *old = t->entries;
93
+ t->capacity *= 2;
94
+ t->mask = t->capacity - 1;
95
+ t->entries = (WordCountEntry *)calloc(t->capacity, sizeof(WordCountEntry));
96
+ t->used = 0;
97
+ for (uint32_t i = 0; i < old_cap; i++) {
98
+ if (old[i].key != 0) {
99
+ uint32_t idx = (uint32_t)(old[i].key & t->mask);
100
+ while (t->entries[idx].key != 0)
101
+ idx = (idx + 1) & t->mask;
102
+ t->entries[idx] = old[i];
103
+ t->used++;
104
+ }
105
+ }
106
+ free(old);
107
+ }
108
+
109
+ static inline int wcht_get(WordCountHT *t, uint64_t key) {
110
+ uint32_t idx = (uint32_t)(key & t->mask);
111
+ for (;;) {
112
+ if (t->entries[idx].key == key) return t->entries[idx].count;
113
+ if (t->entries[idx].key == 0) return 0;
114
+ idx = (idx + 1) & t->mask;
115
+ }
116
+ }
117
+
118
+ static inline void wcht_add(WordCountHT *t, uint64_t key, int delta) {
119
+ if (t->used * 5 > t->capacity * 3) wcht_grow(t);
120
+ uint32_t idx = (uint32_t)(key & t->mask);
121
+ for (;;) {
122
+ if (t->entries[idx].key == key) {
123
+ t->entries[idx].count += delta;
124
+ return;
125
+ }
126
+ if (t->entries[idx].key == 0) {
127
+ t->entries[idx].key = key;
128
+ t->entries[idx].count = delta;
129
+ t->used++;
130
+ return;
131
+ }
132
+ idx = (idx + 1) & t->mask;
133
+ }
134
+ }
135
+
136
/* ── Bigram table: word_hash → { byte → count } ──
 * One slot of the table; key 0 marks an empty slot. */
typedef struct {
    uint64_t key;          /* hash of the preceding word */
    int      counts[256];  /* per-byte count of what started the next word */
    int      total;        /* sum of counts[] */
} BigramEntry;
142
+
143
+ typedef struct {
144
+ BigramEntry *entries;
145
+ uint32_t capacity;
146
+ uint32_t mask;
147
+ uint32_t used;
148
+ } BigramHT;
149
+
150
+ static inline void bht_init(BigramHT *t, uint32_t cap) {
151
+ t->capacity = cap;
152
+ t->mask = cap - 1;
153
+ t->used = 0;
154
+ t->entries = (BigramEntry *)calloc(cap, sizeof(BigramEntry));
155
+ }
156
+
157
+ static inline void bht_free(BigramHT *t) {
158
+ free(t->entries);
159
+ }
160
+
161
+ static inline void bht_grow(BigramHT *t) {
162
+ uint32_t old_cap = t->capacity;
163
+ BigramEntry *old = t->entries;
164
+ t->capacity *= 2;
165
+ t->mask = t->capacity - 1;
166
+ t->entries = (BigramEntry *)calloc(t->capacity, sizeof(BigramEntry));
167
+ t->used = 0;
168
+ for (uint32_t i = 0; i < old_cap; i++) {
169
+ if (old[i].key != 0) {
170
+ uint32_t idx = (uint32_t)(old[i].key & t->mask);
171
+ while (t->entries[idx].key != 0)
172
+ idx = (idx + 1) & t->mask;
173
+ t->entries[idx] = old[i];
174
+ t->used++;
175
+ }
176
+ }
177
+ free(old);
178
+ }
179
+
180
+ static inline BigramEntry *bht_get_or_create(BigramHT *t, uint64_t key) {
181
+ if (t->used * 5 > t->capacity * 3) bht_grow(t);
182
+ uint32_t idx = (uint32_t)(key & t->mask);
183
+ for (;;) {
184
+ if (t->entries[idx].key == key) return &t->entries[idx];
185
+ if (t->entries[idx].key == 0) {
186
+ t->entries[idx].key = key;
187
+ memset(t->entries[idx].counts, 0, sizeof(t->entries[idx].counts));
188
+ t->entries[idx].total = 0;
189
+ t->used++;
190
+ return &t->entries[idx];
191
+ }
192
+ idx = (idx + 1) & t->mask;
193
+ }
194
+ }
195
+
196
+ /* ── Word Model ── */
197
+
198
+ typedef struct {
199
+ TrieNode *trie;
200
+ WordCountHT word_counts;
201
+ BigramHT bigrams;
202
+
203
+ uint8_t current_word[256];
204
+ int current_word_len;
205
+
206
+ uint8_t last_word[256];
207
+ int last_word_len;
208
+ int has_last_word;
209
+
210
+ int in_word;
211
+ double hits;
212
+ double attempts;
213
+
214
+ /* prediction cache to avoid double trie traversal */
215
+ double cached_probs[WORD_NSYM];
216
+ double cached_conf;
217
+ int cache_valid;
218
+ } WordModel;
219
+
220
+ static inline void word_init(WordModel *w) {
221
+ w->trie = trie_new();
222
+ wcht_init(&w->word_counts, 4096);
223
+ bht_init(&w->bigrams, 2048);
224
+ w->current_word_len = 0;
225
+ w->last_word_len = 0;
226
+ w->has_last_word = 0;
227
+ w->in_word = 0;
228
+ w->hits = 1.0;
229
+ w->attempts = 2.0;
230
+ w->cache_valid = 0;
231
+ }
232
+
233
+ static inline void word_free(WordModel *w) {
234
+ trie_free(w->trie);
235
+ wcht_free(&w->word_counts);
236
+ bht_free(&w->bigrams);
237
+ }
238
+
239
+ static inline void word_add_to_trie(WordModel *w, const uint8_t *word, int len) {
240
+ uint64_t wh = word_hash(word, len);
241
+ wcht_add(&w->word_counts, wh, 1);
242
+
243
+ TrieNode *node = w->trie;
244
+ for (int i = 0; i < len; i++) {
245
+ int b = word[i];
246
+ if (!node->children[b])
247
+ node->children[b] = trie_new();
248
+ TrieNode *entry = node->children[b];
249
+ if (i + 1 < len)
250
+ trie_add_cont(entry, word[i + 1]);
251
+ node = entry;
252
+ }
253
+ }
254
+
255
+ /* Get continuations for a prefix. Returns count of distinct continuations.
256
+ Fills keys[] and vals[] arrays. */
257
+ static inline int word_get_continuations(WordModel *w, const uint8_t *prefix,
258
+ int prefix_len, int *keys, int *vals) {
259
+ if (prefix_len == 0) return 0;
260
+ TrieNode *node = w->trie;
261
+ for (int i = 0; i < prefix_len; i++) {
262
+ int b = prefix[i];
263
+ if (!node->children[b]) return 0;
264
+ TrieNode *entry = node->children[b];
265
+ if (i == prefix_len - 1) {
266
+ int n = entry->cont_count;
267
+ for (int j = 0; j < n; j++) {
268
+ keys[j] = entry->cont_keys[j];
269
+ vals[j] = entry->cont_vals[j];
270
+ }
271
+ return n;
272
+ }
273
+ node = entry;
274
+ }
275
+ return 0;
276
+ }
277
+
278
+ /*
279
+ * predict: fills probs[256] if prediction available.
280
+ * Returns 1 with confidence in *out_conf, or 0 if no prediction.
281
+ */
282
+ static inline int word_predict(WordModel *w, double *probs, double *out_conf) {
283
+ static const int boundary_chars[] = {32, 10, 13, 44, 46, 59, 58, 33, 63, 41, 93};
284
+ static const int n_boundary = 11;
285
+
286
+ if (w->in_word && w->current_word_len >= 1) {
287
+ int keys[64], vals[64];
288
+ int nc = word_get_continuations(w, w->current_word,
289
+ w->current_word_len, keys, vals);
290
+ if (nc > 0) {
291
+ memset(probs, 0, WORD_NSYM * sizeof(double));
292
+ int total = 0;
293
+ for (int i = 0; i < nc; i++) total += vals[i];
294
+ double inv_total = 1.0 / total;
295
+ for (int i = 0; i < nc; i++)
296
+ probs[keys[i]] += vals[i] * inv_total;
297
+
298
+ /* word boundary probability */
299
+ uint64_t wh = word_hash(w->current_word, w->current_word_len);
300
+ int wcount = wcht_get(&w->word_counts, wh);
301
+ if (wcount > 0) {
302
+ double bw = (double)wcount / (wcount + total);
303
+ for (int i = 0; i < WORD_NSYM; i++)
304
+ probs[i] *= (1.0 - bw);
305
+ for (int i = 0; i < n_boundary; i++)
306
+ probs[boundary_chars[i]] += bw / n_boundary;
307
+ }
308
+
309
+ int plen = w->current_word_len;
310
+ double confidence = (plen / 3.0 < 1.0 ? plen / 3.0 : 1.0);
311
+ double cont_factor = nc * 0.5;
312
+ if (cont_factor > 1.0) cont_factor = 1.0;
313
+ confidence *= cont_factor;
314
+ confidence *= (w->hits / w->attempts);
315
+
316
+ double sum = 0.0;
317
+ for (int i = 0; i < WORD_NSYM; i++) sum += probs[i];
318
+ if (sum > 0.0) {
319
+ double inv = 1.0 / sum;
320
+ for (int i = 0; i < WORD_NSYM; i++) probs[i] *= inv;
321
+ *out_conf = confidence;
322
+ return 1;
323
+ }
324
+ }
325
+ } else if (!w->in_word && w->has_last_word) {
326
+ uint64_t wh = word_hash(w->last_word, w->last_word_len);
327
+ BigramEntry *be = NULL;
328
+ /* look up without creating */
329
+ uint32_t idx = (uint32_t)(wh & w->bigrams.mask);
330
+ for (;;) {
331
+ if (w->bigrams.entries[idx].key == wh) {
332
+ be = &w->bigrams.entries[idx];
333
+ break;
334
+ }
335
+ if (w->bigrams.entries[idx].key == 0) break;
336
+ idx = (idx + 1) & w->bigrams.mask;
337
+ }
338
+ if (be && be->total > 0) {
339
+ memset(probs, 0, WORD_NSYM * sizeof(double));
340
+ double inv = 1.0 / be->total;
341
+ for (int i = 0; i < WORD_NSYM; i++)
342
+ if (be->counts[i] > 0)
343
+ probs[i] = be->counts[i] * inv;
344
+
345
+ double confidence = (be->total / 5.0 < 1.0 ? be->total / 5.0 : 1.0);
346
+ confidence *= 0.3 * (w->hits / w->attempts);
347
+
348
+ double sum = 0.0;
349
+ for (int i = 0; i < WORD_NSYM; i++) sum += probs[i];
350
+ if (sum > 0.0) {
351
+ double inv2 = 1.0 / sum;
352
+ for (int i = 0; i < WORD_NSYM; i++) probs[i] *= inv2;
353
+ *out_conf = confidence;
354
+ return 1;
355
+ }
356
+ }
357
+ }
358
+
359
+ *out_conf = 0.0;
360
+ return 0;
361
+ }
362
+
363
+ /* Predict with caching: compute once, reuse in update */
364
+ static inline int word_predict_cached(WordModel *w, double *probs, double *out_conf) {
365
+ if (w->cache_valid) {
366
+ memcpy(probs, w->cached_probs, sizeof(w->cached_probs));
367
+ *out_conf = w->cached_conf;
368
+ return (*out_conf > 0.0) ? 1 : 0;
369
+ }
370
+ int ret = word_predict(w, probs, out_conf);
371
+ if (ret) {
372
+ memcpy(w->cached_probs, probs, sizeof(w->cached_probs));
373
+ w->cached_conf = *out_conf;
374
+ } else {
375
+ w->cached_conf = 0.0;
376
+ }
377
+ w->cache_valid = 1;
378
+ return ret;
379
+ }
380
+
381
+ static inline void word_update(WordModel *w, uint8_t byte_val) {
382
+ /* track accuracy using cached prediction */
383
+ double pred_conf = w->cached_conf;
384
+ int has_pred = w->cache_valid && pred_conf > 0.01;
385
+ if (has_pred) {
386
+ w->attempts += 1.0;
387
+ if (w->cached_probs[byte_val] > 0.05)
388
+ w->hits += 1.0;
389
+ if (w->attempts > 500.0) {
390
+ w->hits *= 0.99;
391
+ w->attempts *= 0.99;
392
+ }
393
+ }
394
+
395
+ int is_wc = is_word_char(byte_val);
396
+ if (is_wc) {
397
+ if (!w->in_word) {
398
+ w->current_word_len = 0;
399
+ w->in_word = 1;
400
+ /* bigram: last_word → first byte of new word */
401
+ if (w->has_last_word) {
402
+ uint64_t wh = word_hash(w->last_word, w->last_word_len);
403
+ BigramEntry *be = bht_get_or_create(&w->bigrams, wh);
404
+ be->counts[byte_val]++;
405
+ be->total++;
406
+ }
407
+ }
408
+ if (w->current_word_len < 255)
409
+ w->current_word[w->current_word_len++] = byte_val;
410
+ } else {
411
+ if (w->in_word && w->current_word_len >= 2) {
412
+ word_add_to_trie(w, w->current_word, w->current_word_len);
413
+ w->last_word_len = w->current_word_len;
414
+ memcpy(w->last_word, w->current_word, w->current_word_len);
415
+ w->has_last_word = 1;
416
+ } else if (w->in_word) {
417
+ w->last_word_len = w->current_word_len;
418
+ memcpy(w->last_word, w->current_word, w->current_word_len);
419
+ w->has_last_word = 1;
420
+ }
421
+ w->in_word = 0;
422
+ w->current_word_len = 0;
423
+ }
424
+ w->cache_valid = 0; /* invalidate cache after state change */
425
+ }
426
+
427
+ static inline void blend_word_model(double *probs, const double *word_probs,
428
+ double word_confidence) {
429
+ if (word_confidence < 0.01) return;
430
+ double weight = word_confidence * 0.35;
431
+ if (weight > 0.45) weight = 0.45;
432
+ double sum = 0.0;
433
+ for (int i = 0; i < WORD_NSYM; i++) {
434
+ probs[i] = probs[i] * (1.0 - weight) + word_probs[i] * weight;
435
+ if (probs[i] < 1e-10) probs[i] = 1e-10;
436
+ sum += probs[i];
437
+ }
438
+ double inv = 1.0 / sum;
439
+ for (int i = 0; i < WORD_NSYM; i++)
440
+ probs[i] *= inv;
441
+ }
442
+
443
+ #endif /* WORD_H */