Spaces:
Sleeping
Sleeping
Roberto Tacconelli committed on
Add files via upload
Browse files- Makefile +24 -0
- ablation.c +377 -0
- arith.h +171 -0
- delta_vs_noise.c +226 -0
- fastmath.h +88 -0
- highctx.h +208 -0
- match.h +234 -0
- mdc.c +295 -0
- ppm.h +198 -0
- tweedie.h +280 -0
- word.h +443 -0
Makefile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build rules for the compressor (mdc) and its experiment drivers.
# Libraries are listed after the source file on every link line so that
# linkers which resolve symbols left-to-right still pick up libm.
CC = gcc
CFLAGS = -O3 -march=native -Wall -Wextra
LDFLAGS = -lm

HEADERS = arith.h ppm.h tweedie.h match.h word.h highctx.h fastmath.h

all: mdc ablation

mdc: mdc.c $(HEADERS)
	$(CC) $(CFLAGS) -o mdc mdc.c $(LDFLAGS)

ablation: ablation.c $(HEADERS)
	$(CC) $(CFLAGS) -o ablation ablation.c $(LDFLAGS)

# Experiment driver; built on request only (not part of "all").
delta_vs_noise: delta_vs_noise.c $(HEADERS)
	$(CC) $(CFLAGS) -o delta_vs_noise delta_vs_noise.c $(LDFLAGS)

test_arith: test_arith.c arith.h
	$(CC) $(CFLAGS) -o test_arith test_arith.c $(LDFLAGS)

test_ppm: test_ppm.c arith.h ppm.h
	$(CC) $(CFLAGS) -o test_ppm test_ppm.c $(LDFLAGS)

clean:
	rm -f mdc ablation delta_vs_noise bench test_arith test_ppm

.PHONY: all clean
|
ablation.c
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Ablation study — C implementation
|
| 3 |
+
* Measures incremental contribution of each pipeline layer.
|
| 4 |
+
*
|
| 5 |
+
* Usage:
|
| 6 |
+
* ./ablation # alice29.txt only
|
| 7 |
+
* ./ablation file1 file2 ... # specific files
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <stdio.h>
|
| 11 |
+
#include <stdlib.h>
|
| 12 |
+
#include <string.h>
|
| 13 |
+
#include <math.h>
|
| 14 |
+
#include <time.h>
|
| 15 |
+
#include <libgen.h>
|
| 16 |
+
|
| 17 |
+
#include "fastmath.h"
|
| 18 |
+
#include "arith.h"
|
| 19 |
+
#include "ppm.h"
|
| 20 |
+
#include "tweedie.h"
|
| 21 |
+
#include "match.h"
|
| 22 |
+
#include "word.h"
|
| 23 |
+
#include "highctx.h"
|
| 24 |
+
|
| 25 |
+
#define SCALE (1 << 14)
|
| 26 |
+
|
| 27 |
+
/* ── Flags ── */
|
| 28 |
+
#define FLAG_TWEEDIE 1
|
| 29 |
+
#define FLAG_MATCH 2
|
| 30 |
+
#define FLAG_WORD 4
|
| 31 |
+
#define FLAG_HIGHCTX 8
|
| 32 |
+
|
| 33 |
+
/* ── Helpers ── */
|
| 34 |
+
|
| 35 |
+
static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs,
|
| 36 |
+
int64_t *out_total) {
|
| 37 |
+
cumfreqs[0] = 0;
|
| 38 |
+
for (int i = 0; i < 256; i++) {
|
| 39 |
+
int64_t f = (int64_t)(probs[i] * SCALE + 0.5);
|
| 40 |
+
if (f < 1) f = 1;
|
| 41 |
+
cumfreqs[i + 1] = cumfreqs[i] + f;
|
| 42 |
+
}
|
| 43 |
+
*out_total = cumfreqs[256];
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
/*
 * Floor every one of the 256 probabilities at 1e-10, then rescale the
 * array in place so that it sums to 1.
 */
static void clamp_normalize(double *probs) {
    const double floor_p = 1e-10;
    double *const end = probs + 256;
    double total = 0.0;

    for (double *p = probs; p != end; ++p) {
        if (*p < floor_p)
            *p = floor_p;
        total += *p;
    }

    /* Multiply by the reciprocal (one division instead of 256). */
    const double scale = 1.0 / total;
    for (double *p = probs; p != end; ++p)
        *p *= scale;
}
|
| 56 |
+
|
| 57 |
+
static inline double now_sec(void) {
|
| 58 |
+
struct timespec ts;
|
| 59 |
+
clock_gettime(CLOCK_MONOTONIC, &ts);
|
| 60 |
+
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
/* ── Configurable compress ── */
|
| 64 |
+
|
| 65 |
+
static uint8_t *do_compress(const uint8_t *data, size_t data_len,
|
| 66 |
+
int flags, size_t *out_len, double *out_time) {
|
| 67 |
+
PPMModel ppm; ppm_init(&ppm);
|
| 68 |
+
MatchModel match; if (flags & FLAG_MATCH) match_init(&match);
|
| 69 |
+
WordModel word; if (flags & FLAG_WORD) word_init(&word);
|
| 70 |
+
HighCtxModel hctx; if (flags & FLAG_HIGHCTX) highctx_init(&hctx);
|
| 71 |
+
ArithEncoder enc; ae_init(&enc);
|
| 72 |
+
TweedieDenoiser twd; if (flags & FLAG_TWEEDIE) tweedie_init(&twd);
|
| 73 |
+
|
| 74 |
+
double probs[256], word_probs[256], hctx_probs[256];
|
| 75 |
+
int64_t cumfreqs[257];
|
| 76 |
+
int64_t total;
|
| 77 |
+
|
| 78 |
+
double t0 = now_sec();
|
| 79 |
+
|
| 80 |
+
for (size_t i = 0; i < data_len; i++) {
|
| 81 |
+
uint8_t byte = data[i];
|
| 82 |
+
double confidence;
|
| 83 |
+
int order;
|
| 84 |
+
|
| 85 |
+
ppm_predict(&ppm, probs, &confidence, &order);
|
| 86 |
+
|
| 87 |
+
if (flags & FLAG_TWEEDIE) {
|
| 88 |
+
tweedie_denoise(&twd, probs, order, confidence);
|
| 89 |
+
}
|
| 90 |
+
clamp_normalize(probs);
|
| 91 |
+
|
| 92 |
+
if (flags & FLAG_MATCH) {
|
| 93 |
+
int match_byte;
|
| 94 |
+
double match_conf;
|
| 95 |
+
match_predict(&match, &match_byte, &match_conf);
|
| 96 |
+
blend_match(probs, match_byte, match_conf);
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
if (flags & FLAG_WORD) {
|
| 100 |
+
double w_conf;
|
| 101 |
+
if (word_predict_cached(&word, word_probs, &w_conf))
|
| 102 |
+
blend_word_model(probs, word_probs, w_conf);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
if (flags & FLAG_HIGHCTX) {
|
| 106 |
+
double hctx_conf;
|
| 107 |
+
if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
|
| 108 |
+
blend_highctx(probs, hctx_probs, hctx_conf);
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
probs_to_cumfreqs(probs, cumfreqs, &total);
|
| 112 |
+
ae_encode(&enc, cumfreqs, byte, total);
|
| 113 |
+
|
| 114 |
+
/* Updates */
|
| 115 |
+
if (flags & FLAG_TWEEDIE)
|
| 116 |
+
tweedie_update(&twd, byte);
|
| 117 |
+
if (flags & FLAG_MATCH)
|
| 118 |
+
match_update(&match, byte);
|
| 119 |
+
if (flags & FLAG_WORD)
|
| 120 |
+
word_update(&word, byte);
|
| 121 |
+
if (flags & FLAG_HIGHCTX)
|
| 122 |
+
highctx_update(&hctx, byte);
|
| 123 |
+
ppm_update(&ppm, byte);
|
| 124 |
+
|
| 125 |
+
if ((i + 1) % 50000 == 0) {
|
| 126 |
+
double elapsed = now_sec() - t0;
|
| 127 |
+
double pct = (i + 1) * 100.0 / data_len;
|
| 128 |
+
double speed = (i + 1) / elapsed;
|
| 129 |
+
fprintf(stderr, "\r %5.1f%% (%zu/%zu) %.0f B/s",
|
| 130 |
+
pct, i + 1, data_len, speed);
|
| 131 |
+
}
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
ae_finish(&enc);
|
| 135 |
+
double elapsed = now_sec() - t0;
|
| 136 |
+
if (data_len >= 50000)
|
| 137 |
+
fprintf(stderr, "\r \r");
|
| 138 |
+
|
| 139 |
+
*out_time = elapsed;
|
| 140 |
+
|
| 141 |
+
/* Copy output */
|
| 142 |
+
*out_len = enc.buf_len;
|
| 143 |
+
uint8_t *result = (uint8_t *)malloc(enc.buf_len);
|
| 144 |
+
memcpy(result, enc.buf, enc.buf_len);
|
| 145 |
+
|
| 146 |
+
ae_free(&enc);
|
| 147 |
+
ppm_free(&ppm);
|
| 148 |
+
if (flags & FLAG_MATCH) match_free(&match);
|
| 149 |
+
if (flags & FLAG_WORD) word_free(&word);
|
| 150 |
+
if (flags & FLAG_HIGHCTX) highctx_free(&hctx);
|
| 151 |
+
|
| 152 |
+
return result;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
/* ── Configurable decompress ── */
|
| 156 |
+
|
| 157 |
+
static uint8_t *do_decompress(const uint8_t *compressed, size_t comp_len,
|
| 158 |
+
size_t original_size, int flags,
|
| 159 |
+
double *out_time) {
|
| 160 |
+
PPMModel ppm; ppm_init(&ppm);
|
| 161 |
+
MatchModel match; if (flags & FLAG_MATCH) match_init(&match);
|
| 162 |
+
WordModel word; if (flags & FLAG_WORD) word_init(&word);
|
| 163 |
+
HighCtxModel hctx; if (flags & FLAG_HIGHCTX) highctx_init(&hctx);
|
| 164 |
+
ArithDecoder dec; ad_init(&dec, compressed, comp_len);
|
| 165 |
+
TweedieDenoiser twd; if (flags & FLAG_TWEEDIE) tweedie_init(&twd);
|
| 166 |
+
|
| 167 |
+
uint8_t *result = (uint8_t *)malloc(original_size);
|
| 168 |
+
|
| 169 |
+
double probs[256], word_probs[256], hctx_probs[256];
|
| 170 |
+
int64_t cumfreqs[257];
|
| 171 |
+
int64_t total;
|
| 172 |
+
|
| 173 |
+
double t0 = now_sec();
|
| 174 |
+
|
| 175 |
+
for (size_t i = 0; i < original_size; i++) {
|
| 176 |
+
double confidence;
|
| 177 |
+
int order;
|
| 178 |
+
|
| 179 |
+
ppm_predict(&ppm, probs, &confidence, &order);
|
| 180 |
+
|
| 181 |
+
if (flags & FLAG_TWEEDIE) {
|
| 182 |
+
tweedie_denoise(&twd, probs, order, confidence);
|
| 183 |
+
}
|
| 184 |
+
clamp_normalize(probs);
|
| 185 |
+
|
| 186 |
+
if (flags & FLAG_MATCH) {
|
| 187 |
+
int match_byte;
|
| 188 |
+
double match_conf;
|
| 189 |
+
match_predict(&match, &match_byte, &match_conf);
|
| 190 |
+
blend_match(probs, match_byte, match_conf);
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
if (flags & FLAG_WORD) {
|
| 194 |
+
double w_conf;
|
| 195 |
+
if (word_predict_cached(&word, word_probs, &w_conf))
|
| 196 |
+
blend_word_model(probs, word_probs, w_conf);
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
if (flags & FLAG_HIGHCTX) {
|
| 200 |
+
double hctx_conf;
|
| 201 |
+
if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
|
| 202 |
+
blend_highctx(probs, hctx_probs, hctx_conf);
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
probs_to_cumfreqs(probs, cumfreqs, &total);
|
| 206 |
+
int sym = ad_decode(&dec, cumfreqs, total);
|
| 207 |
+
result[i] = (uint8_t)sym;
|
| 208 |
+
|
| 209 |
+
if (flags & FLAG_TWEEDIE)
|
| 210 |
+
tweedie_update(&twd, (uint8_t)sym);
|
| 211 |
+
if (flags & FLAG_MATCH)
|
| 212 |
+
match_update(&match, (uint8_t)sym);
|
| 213 |
+
if (flags & FLAG_WORD)
|
| 214 |
+
word_update(&word, (uint8_t)sym);
|
| 215 |
+
if (flags & FLAG_HIGHCTX)
|
| 216 |
+
highctx_update(&hctx, (uint8_t)sym);
|
| 217 |
+
ppm_update(&ppm, (uint8_t)sym);
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
*out_time = now_sec() - t0;
|
| 221 |
+
|
| 222 |
+
ppm_free(&ppm);
|
| 223 |
+
if (flags & FLAG_MATCH) match_free(&match);
|
| 224 |
+
if (flags & FLAG_WORD) word_free(&word);
|
| 225 |
+
if (flags & FLAG_HIGHCTX) highctx_free(&hctx);
|
| 226 |
+
|
| 227 |
+
return result;
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
/* ── Ablation configs ── */
|
| 231 |
+
|
| 232 |
+
typedef struct {
|
| 233 |
+
const char *label;
|
| 234 |
+
int flags;
|
| 235 |
+
} AblationConfig;
|
| 236 |
+
|
| 237 |
+
static const AblationConfig CONFIGS[] = {
|
| 238 |
+
{ "Base PPM", 0 },
|
| 239 |
+
{ "+ Tweedie", FLAG_TWEEDIE },
|
| 240 |
+
{ "+ Twd + Match", FLAG_TWEEDIE | FLAG_MATCH },
|
| 241 |
+
{ "+ Twd + Match + Word", FLAG_TWEEDIE | FLAG_MATCH | FLAG_WORD },
|
| 242 |
+
{ "+ Twd + M + W + H", FLAG_TWEEDIE | FLAG_MATCH | FLAG_WORD | FLAG_HIGHCTX },
|
| 243 |
+
};
|
| 244 |
+
#define N_CONFIGS 5
|
| 245 |
+
|
| 246 |
+
/* Measurements recorded for one ablation configuration. */
typedef struct {
    const char *label;   /* configuration label (borrowed, not owned) */
    size_t      c_size;  /* compressed size in bytes */
    double      ratio;   /* compressed size / original size */
    double      c_time;  /* compression time in seconds */
} AblationResult;
|
| 252 |
+
|
| 253 |
+
static void run_ablation(const char *filepath) {
|
| 254 |
+
FILE *f = fopen(filepath, "rb");
|
| 255 |
+
if (!f) { fprintf(stderr, "File not found: %s\n", filepath); return; }
|
| 256 |
+
fseek(f, 0, SEEK_END);
|
| 257 |
+
long file_size = ftell(f);
|
| 258 |
+
fseek(f, 0, SEEK_SET);
|
| 259 |
+
uint8_t *data = (uint8_t *)malloc(file_size);
|
| 260 |
+
if (fread(data, 1, file_size, f) != (size_t)file_size) {
|
| 261 |
+
fprintf(stderr, "Read error: %s\n", filepath);
|
| 262 |
+
fclose(f); free(data); return;
|
| 263 |
+
}
|
| 264 |
+
fclose(f);
|
| 265 |
+
|
| 266 |
+
/* basename */
|
| 267 |
+
char *path_copy = strdup(filepath);
|
| 268 |
+
const char *filename = basename(path_copy);
|
| 269 |
+
size_t original_size = (size_t)file_size;
|
| 270 |
+
|
| 271 |
+
printf("\n======================================================================\n");
|
| 272 |
+
printf(" ABLATION: %s (%zu bytes)\n", filename, original_size);
|
| 273 |
+
printf("======================================================================\n");
|
| 274 |
+
|
| 275 |
+
AblationResult results[N_CONFIGS];
|
| 276 |
+
|
| 277 |
+
for (int c = 0; c < N_CONFIGS; c++) {
|
| 278 |
+
printf("\n [%s]\n", CONFIGS[c].label);
|
| 279 |
+
printf(" Compressing...");
|
| 280 |
+
fflush(stdout);
|
| 281 |
+
|
| 282 |
+
size_t comp_len;
|
| 283 |
+
double c_time;
|
| 284 |
+
uint8_t *compressed = do_compress(data, original_size, CONFIGS[c].flags,
|
| 285 |
+
&comp_len, &c_time);
|
| 286 |
+
double ratio = (double)comp_len / original_size;
|
| 287 |
+
printf(" %zu bytes (%.2f%%) in %.1fs\n", comp_len, ratio * 100.0, c_time);
|
| 288 |
+
|
| 289 |
+
/* Verify round-trip */
|
| 290 |
+
printf(" Verifying...");
|
| 291 |
+
fflush(stdout);
|
| 292 |
+
|
| 293 |
+
/*double d_time;
|
| 294 |
+
uint8_t *decompressed = do_decompress(compressed, comp_len, original_size,
|
| 295 |
+
CONFIGS[c].flags, &d_time);
|
| 296 |
+
|
| 297 |
+
if (memcmp(data, decompressed, original_size) == 0) {
|
| 298 |
+
printf(" OK (%.1fs)\n", d_time);
|
| 299 |
+
} else {
|
| 300 |
+
printf(" FAILED!\n");
|
| 301 |
+
// Find first mismatch
|
| 302 |
+
for (size_t i = 0; i < original_size; i++) {
|
| 303 |
+
if (data[i] != decompressed[i]) {
|
| 304 |
+
printf(" First mismatch at byte %zu: expected %d, got %d\n",
|
| 305 |
+
i, data[i], decompressed[i]);
|
| 306 |
+
break;
|
| 307 |
+
}
|
| 308 |
+
}
|
| 309 |
+
free(compressed);
|
| 310 |
+
free(decompressed);
|
| 311 |
+
free(data);
|
| 312 |
+
free(path_copy);
|
| 313 |
+
exit(1);
|
| 314 |
+
} */
|
| 315 |
+
|
| 316 |
+
results[c].label = CONFIGS[c].label;
|
| 317 |
+
results[c].c_size = comp_len;
|
| 318 |
+
results[c].ratio = ratio;
|
| 319 |
+
results[c].c_time = c_time;
|
| 320 |
+
|
| 321 |
+
free(compressed);
|
| 322 |
+
//free(decompressed);
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
/* ── Summary table ── */
|
| 326 |
+
printf("\n======================================================================\n");
|
| 327 |
+
printf(" RESULTS: %s (%zu bytes)\n", filename, original_size);
|
| 328 |
+
printf("======================================================================\n");
|
| 329 |
+
printf(" %-32s %8s %8s %9s %9s %7s\n",
|
| 330 |
+
"Layer", "Size", "Ratio", "Layer +%", "Total +%", "Time");
|
| 331 |
+
printf(" -------------------------------- -------- -------- --------- --------- -------\n");
|
| 332 |
+
|
| 333 |
+
size_t base_size = results[0].c_size;
|
| 334 |
+
size_t prev_size = results[0].c_size;
|
| 335 |
+
|
| 336 |
+
for (int i = 0; i < N_CONFIGS; i++) {
|
| 337 |
+
size_t c_size = results[i].c_size;
|
| 338 |
+
double ratio = results[i].ratio;
|
| 339 |
+
double c_time = results[i].c_time;
|
| 340 |
+
|
| 341 |
+
if (i == 0) {
|
| 342 |
+
printf(" %-32s %8zu %6.2f%% %9s %9s %6.1fs\n",
|
| 343 |
+
results[i].label, c_size, ratio * 100.0, "", "", c_time);
|
| 344 |
+
} else {
|
| 345 |
+
double layer_imp = (double)(prev_size - c_size) / prev_size * 100.0;
|
| 346 |
+
double total_imp = (double)(base_size - c_size) / base_size * 100.0;
|
| 347 |
+
printf(" %-32s %8zu %6.2f%% %+8.2f%% %+8.2f%% %6.1fs\n",
|
| 348 |
+
results[i].label, c_size, ratio * 100.0,
|
| 349 |
+
layer_imp, total_imp, c_time);
|
| 350 |
+
}
|
| 351 |
+
prev_size = c_size;
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
printf(" -------------------------------- -------- -------- --------- --------- -------\n");
|
| 355 |
+
size_t final_size = results[N_CONFIGS - 1].c_size;
|
| 356 |
+
double total_imp = (double)(base_size - final_size) / base_size * 100.0;
|
| 357 |
+
printf(" %-32s %8s %8s %9s %+8.2f%%\n",
|
| 358 |
+
"TOTAL IMPROVEMENT", "", "", "", total_imp);
|
| 359 |
+
printf("\n");
|
| 360 |
+
|
| 361 |
+
free(data);
|
| 362 |
+
free(path_copy);
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
/* ── Main ── */
|
| 366 |
+
|
| 367 |
+
/*
 * Entry point: run the ablation on every path given on the command line,
 * or on ../alice29.txt when no argument is supplied.  Always returns 0.
 */
int main(int argc, char **argv) {
    if (argc <= 1) {
        run_ablation("../alice29.txt");
        return 0;
    }

    int arg = 1;
    while (arg < argc) {
        run_ablation(argv[arg]);
        arg++;
    }

    /* Cross-file comparison would go here for multiple files */
    return 0;
}
|
arith.h
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef ARITH_H
|
| 2 |
+
#define ARITH_H
|
| 3 |
+
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <stdlib.h>
|
| 6 |
+
#include <string.h>
|
| 7 |
+
|
| 8 |
+
/*
 * Arithmetic encoder state: a 32-bit coding interval [low, high] plus a
 * growable output buffer that bits are packed into MSB-first.
 */
typedef struct {
    uint32_t low;        /* lower bound of the current interval */
    uint32_t high;       /* upper bound of the current interval (inclusive) */
    int      pending;    /* deferred bits, emitted inverted after the next bit */
    uint8_t *buf;        /* output byte buffer */
    size_t   buf_len;    /* bytes written to buf */
    size_t   buf_cap;    /* bytes allocated for buf */
    int      bit_buf;    /* accumulates 8 bits before flushing a byte */
    int      bit_count;  /* bits in bit_buf (0..7) */
} ArithEncoder;

/*
 * Arithmetic decoder state over a borrowed input buffer.  Reads past the
 * end of the input yield zero bits.
 */
typedef struct {
    const uint8_t *data;     /* compressed input (not owned) */
    size_t         data_len; /* input length in bytes */
    size_t         bit_pos;  /* index of the next bit to read */
    uint32_t       low;      /* lower bound of the current interval */
    uint32_t       high;     /* upper bound of the current interval (inclusive) */
    uint32_t       value;    /* 32-bit window of the code being decoded */
} ArithDecoder;

/* ── Encoder ── */

/*
 * Initialise an encoder with the full 32-bit interval and a 4096-byte
 * output buffer.  Aborts the process if the buffer cannot be allocated,
 * since the void interface has no way to report the failure.
 */
static inline void ae_init(ArithEncoder *e) {
    e->low = 0;
    e->high = 0xFFFFFFFF;
    e->pending = 0;
    e->buf_cap = 4096;
    e->buf_len = 0;
    e->buf = (uint8_t *)malloc(e->buf_cap);
    if (!e->buf)
        abort();
    e->bit_buf = 0;
    e->bit_count = 0;
}

/*
 * Append the low 8 bits of bit_buf to the output, doubling the buffer
 * when it is full, then reset the bit accumulator.  Aborts on allocation
 * failure; the old buffer pointer is never overwritten with NULL.
 */
static inline void ae_flush_byte(ArithEncoder *e) {
    if (e->buf_len >= e->buf_cap) {
        size_t new_cap = e->buf_cap ? e->buf_cap * 2 : 4096;
        uint8_t *grown = (uint8_t *)realloc(e->buf, new_cap);
        if (!grown)
            abort();
        e->buf = grown;
        e->buf_cap = new_cap;
    }
    e->buf[e->buf_len++] = (uint8_t)e->bit_buf;
    e->bit_buf = 0;
    e->bit_count = 0;
}

/*
 * Emit `bit` (0 or 1), followed by every pending bit as its complement.
 * Clears the pending counter.
 */
static inline void ae_output_bit(ArithEncoder *e, int bit) {
    e->bit_buf = (e->bit_buf << 1) | bit;
    e->bit_count++;
    if (e->bit_count == 8) ae_flush_byte(e);

    int inv = 1 - bit;
    while (e->pending > 0) {
        e->bit_buf = (e->bit_buf << 1) | inv;
        e->bit_count++;
        if (e->bit_count == 8) ae_flush_byte(e);
        e->pending--;
    }
}

/*
 * Encode one symbol.
 *
 * cumfreqs - cumulative frequency table; the symbol's slice is
 *            [cumfreqs[symbol], cumfreqs[symbol + 1]).
 * symbol   - index into cumfreqs.
 * total    - sum of all frequencies (the last cumfreqs entry).
 *
 * Narrows the interval to the symbol's slice, then renormalises: shared
 * leading bits are emitted and near-midpoint straddles are recorded as
 * pending bits.
 */
static inline void ae_encode(ArithEncoder *e, const int64_t *cumfreqs,
                             int symbol, int64_t total) {
    uint64_t rng = (uint64_t)e->high - e->low + 1;
    /* 32-bit wrap-around is intended: when the scaled term is 2^32 the
     * cast gives 0 and the "- 1" lands on low - 1 (mod 2^32). */
    e->high = e->low + (uint32_t)((rng * (uint64_t)cumfreqs[symbol + 1]) / (uint64_t)total) - 1;
    e->low  = e->low + (uint32_t)((rng * (uint64_t)cumfreqs[symbol]) / (uint64_t)total);

    for (;;) {
        if (e->high < 0x80000000u) {
            ae_output_bit(e, 0);
        } else if (e->low >= 0x80000000u) {
            ae_output_bit(e, 1);
            e->low  -= 0x80000000u;
            e->high -= 0x80000000u;
        } else if (e->low >= 0x40000000u && e->high < 0xC0000000u) {
            e->pending++;
            e->low  -= 0x40000000u;
            e->high -= 0x40000000u;
        } else {
            break;
        }
        e->low  = e->low << 1;
        e->high = (e->high << 1) | 1;
    }
}

/*
 * Terminate the stream: emit the bits that pin the final interval and
 * zero-pad the last partial byte.
 */
static inline void ae_finish(ArithEncoder *e) {
    e->pending++;
    if (e->low < 0x40000000u)
        ae_output_bit(e, 0);
    else
        ae_output_bit(e, 1);

    /* pad remaining bits in the last byte */
    if (e->bit_count > 0) {
        e->bit_buf <<= (8 - e->bit_count);
        ae_flush_byte(e);
    }
}

/* Release the output buffer and null the pointer. */
static inline void ae_free(ArithEncoder *e) {
    free(e->buf);
    e->buf = NULL;
}

/* ── Decoder ── */

/* Read the next input bit (MSB-first); returns 0 once past the end. */
static inline int ad_read_bit(ArithDecoder *d) {
    size_t byte_idx = d->bit_pos / 8;
    if (byte_idx >= d->data_len) {
        d->bit_pos++;
        return 0;
    }
    int bit = (d->data[byte_idx] >> (7 - (d->bit_pos % 8))) & 1;
    d->bit_pos++;
    return bit;
}

/*
 * Attach the decoder to data[0..len) and preload the first 32 bits into
 * the code window.  The buffer is borrowed and must outlive the decoder.
 */
static inline void ad_init(ArithDecoder *d, const uint8_t *data, size_t len) {
    d->data = data;
    d->data_len = len;
    d->bit_pos = 0;
    d->low = 0;
    d->high = 0xFFFFFFFF;
    d->value = 0;
    for (int i = 0; i < 32; i++)
        d->value = (d->value << 1) | ad_read_bit(d);
}

/*
 * Decode one symbol using the same 257-entry cumulative table and total
 * that the encoder used at this position.  Returns a symbol in 0..255.
 */
static inline int ad_decode(ArithDecoder *d, const int64_t *cumfreqs,
                            int64_t total) {
    uint64_t rng = (uint64_t)d->high - d->low + 1;
    /* Widen before adding 1: value - low can be 0xFFFFFFFF, and a 32-bit
     * "+ 1" would wrap to 0 and produce a garbage target. */
    uint64_t offset = (uint64_t)(d->value - d->low) + 1;
    int64_t scaled = (int64_t)((offset * (uint64_t)total - 1) / rng);

    /* Linear search for the slice containing `scaled`.  The loop stops
     * at 255 so that a corrupt stream (scaled >= total) can never index
     * past cumfreqs[256]. */
    int sym = 0;
    while (sym < 255 && cumfreqs[sym + 1] <= scaled)
        sym++;

    d->high = d->low + (uint32_t)((rng * (uint64_t)cumfreqs[sym + 1]) / (uint64_t)total) - 1;
    d->low  = d->low + (uint32_t)((rng * (uint64_t)cumfreqs[sym]) / (uint64_t)total);

    /* Mirror the encoder's renormalisation, shifting in fresh bits. */
    for (;;) {
        if (d->high < 0x80000000u) {
            /* nothing */
        } else if (d->low >= 0x80000000u) {
            d->low   -= 0x80000000u;
            d->high  -= 0x80000000u;
            d->value -= 0x80000000u;
        } else if (d->low >= 0x40000000u && d->high < 0xC0000000u) {
            d->low   -= 0x40000000u;
            d->high  -= 0x40000000u;
            d->value -= 0x40000000u;
        } else {
            break;
        }
        d->low   = d->low << 1;
        d->high  = (d->high << 1) | 1;
        d->value = (d->value << 1) | (uint32_t)ad_read_bit(d);
    }
    return sym;
}
|
| 170 |
+
|
| 171 |
+
#endif /* ARITH_H */
|
delta_vs_noise.c
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* delta_vs_noise.c — Experiment: |δ| vs noise level γ
|
| 3 |
+
*
|
| 4 |
+
* Compresses a file using the full pipeline, then dumps the
|
| 5 |
+
* calibration table statistics showing mean |δ| per confidence bin.
|
| 6 |
+
*
|
| 7 |
+
* Usage: ./delta_vs_noise <input_file>
|
| 8 |
+
*
|
| 9 |
+
* Output: TSV table of (conf_bin, γ_approx, mean_|δ|, weighted_mean_|δ|, total_obs)
|
| 10 |
+
*/
|
| 11 |
+
|
| 12 |
+
#include <stdio.h>
|
| 13 |
+
#include <stdlib.h>
|
| 14 |
+
#include <string.h>
|
| 15 |
+
#include <math.h>
|
| 16 |
+
#include <time.h>
|
| 17 |
+
|
| 18 |
+
#include "fastmath.h"
|
| 19 |
+
#include "arith.h"
|
| 20 |
+
#include "ppm.h"
|
| 21 |
+
#include "tweedie.h"
|
| 22 |
+
#include "match.h"
|
| 23 |
+
#include "word.h"
|
| 24 |
+
#include "highctx.h"
|
| 25 |
+
|
| 26 |
+
#define SCALE (1 << 14)
|
| 27 |
+
|
| 28 |
+
static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs,
|
| 29 |
+
int64_t *out_total) {
|
| 30 |
+
cumfreqs[0] = 0;
|
| 31 |
+
for (int i = 0; i < 256; i++) {
|
| 32 |
+
int64_t f = (int64_t)(probs[i] * SCALE + 0.5);
|
| 33 |
+
if (f < 1) f = 1;
|
| 34 |
+
cumfreqs[i + 1] = cumfreqs[i] + f;
|
| 35 |
+
}
|
| 36 |
+
*out_total = cumfreqs[256];
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
/*
 * Raise any of the 256 probabilities below 1e-10 up to 1e-10, then scale
 * the whole array in place so its entries sum to 1.
 */
static void clamp_normalize(double *probs) {
    double mass = 0.0;
    int k;

    for (k = 0; k < 256; k++) {
        double v = probs[k];
        if (v < 1e-10) {
            v = 1e-10;
            probs[k] = v;
        }
        mass += v;
    }

    double reciprocal = 1.0 / mass;
    for (k = 0; k < 256; k++)
        probs[k] = probs[k] * reciprocal;
}
|
| 49 |
+
|
| 50 |
+
/* Representative C value for each confidence bin.
|
| 51 |
+
* twd_conf_bin uses: bin = (int)(ln(C) / 1.3863)
|
| 52 |
+
* with C < 4 → bin 0.
|
| 53 |
+
* Bin boundaries (1.3863 = ln 4): 0:[0,4), 1:[4,16), 2:[16,64), ...; bin k >= 1 spans [4^k, 4^(k+1)). */
|
| 54 |
+
static double conf_bin_representative_C(int bin) {
|
| 55 |
+
if (bin == 0) return 2.0; /* midpoint of [0, 4) */
|
| 56 |
+
/* bin = floor(ln(C) / 1.3863), so midpoint is exp((bin + 0.5) * 1.3863) */
|
| 57 |
+
return exp((bin + 0.5) * 1.3862943611198906);
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
int main(int argc, char **argv) {
|
| 61 |
+
if (argc < 2) {
|
| 62 |
+
fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
|
| 63 |
+
return 1;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
FILE *fin = fopen(argv[1], "rb");
|
| 67 |
+
if (!fin) { perror(argv[1]); return 1; }
|
| 68 |
+
fseek(fin, 0, SEEK_END);
|
| 69 |
+
long file_size = ftell(fin);
|
| 70 |
+
fseek(fin, 0, SEEK_SET);
|
| 71 |
+
uint8_t *data = (uint8_t *)malloc(file_size);
|
| 72 |
+
if (fread(data, 1, file_size, fin) != (size_t)file_size) {
|
| 73 |
+
fprintf(stderr, "Read error\n"); fclose(fin); return 1;
|
| 74 |
+
}
|
| 75 |
+
fclose(fin);
|
| 76 |
+
|
| 77 |
+
fprintf(stderr, "Processing %s (%ld bytes)...\n", argv[1], file_size);
|
| 78 |
+
|
| 79 |
+
/* Run the full pipeline to populate calibration tables */
|
| 80 |
+
PPMModel ppm; ppm_init(&ppm);
|
| 81 |
+
MatchModel match; match_init(&match);
|
| 82 |
+
WordModel word; word_init(&word);
|
| 83 |
+
HighCtxModel hctx; highctx_init(&hctx);
|
| 84 |
+
ArithEncoder enc; ae_init(&enc);
|
| 85 |
+
TweedieDenoiser twd; tweedie_init(&twd);
|
| 86 |
+
|
| 87 |
+
double probs[256], word_probs[256], hctx_probs[256];
|
| 88 |
+
int64_t cumfreqs[257];
|
| 89 |
+
int64_t total;
|
| 90 |
+
|
| 91 |
+
struct timespec t0, t1;
|
| 92 |
+
clock_gettime(CLOCK_MONOTONIC, &t0);
|
| 93 |
+
|
| 94 |
+
for (long i = 0; i < file_size; i++) {
|
| 95 |
+
uint8_t byte = data[i];
|
| 96 |
+
|
| 97 |
+
double confidence;
|
| 98 |
+
int order;
|
| 99 |
+
ppm_predict(&ppm, probs, &confidence, &order);
|
| 100 |
+
|
| 101 |
+
tweedie_denoise(&twd, probs, order, confidence);
|
| 102 |
+
clamp_normalize(probs);
|
| 103 |
+
|
| 104 |
+
int match_byte;
|
| 105 |
+
double match_conf;
|
| 106 |
+
match_predict(&match, &match_byte, &match_conf);
|
| 107 |
+
blend_match(probs, match_byte, match_conf);
|
| 108 |
+
|
| 109 |
+
double w_conf;
|
| 110 |
+
if (word_predict_cached(&word, word_probs, &w_conf))
|
| 111 |
+
blend_word_model(probs, word_probs, w_conf);
|
| 112 |
+
|
| 113 |
+
double hctx_conf;
|
| 114 |
+
if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
|
| 115 |
+
blend_highctx(probs, hctx_probs, hctx_conf);
|
| 116 |
+
|
| 117 |
+
probs_to_cumfreqs(probs, cumfreqs, &total);
|
| 118 |
+
ae_encode(&enc, cumfreqs, byte, total);
|
| 119 |
+
|
| 120 |
+
tweedie_update(&twd, byte);
|
| 121 |
+
match_update(&match, byte);
|
| 122 |
+
word_update(&word, byte);
|
| 123 |
+
highctx_update(&hctx, byte);
|
| 124 |
+
ppm_update(&ppm, byte);
|
| 125 |
+
|
| 126 |
+
if ((i + 1) % 50000 == 0) {
|
| 127 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 128 |
+
double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
|
| 129 |
+
fprintf(stderr, "\r %5.1f%% (%.0f B/s)",
|
| 130 |
+
(i + 1) * 100.0 / file_size, (i + 1) / elapsed);
|
| 131 |
+
}
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 135 |
+
double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
|
| 136 |
+
fprintf(stderr, "\r Done in %.1fs \n", elapsed);
|
| 137 |
+
|
| 138 |
+
/* ── Analyze calibration tables: mean |δ| per confidence bin ── */
|
| 139 |
+
|
| 140 |
+
/* Aggregate across all steps, bit contexts, order groups, shapes, prob bins */
|
| 141 |
+
double sum_abs_delta[TWD_N_CONF];
|
| 142 |
+
double sum_weight[TWD_N_CONF];
|
| 143 |
+
double sum_weighted_abs_delta[TWD_N_CONF];
|
| 144 |
+
int count[TWD_N_CONF];
|
| 145 |
+
memset(sum_abs_delta, 0, sizeof(sum_abs_delta));
|
| 146 |
+
memset(sum_weight, 0, sizeof(sum_weight));
|
| 147 |
+
memset(sum_weighted_abs_delta, 0, sizeof(sum_weighted_abs_delta));
|
| 148 |
+
memset(count, 0, sizeof(count));
|
| 149 |
+
|
| 150 |
+
for (int t = 0; t < TWD_STEPS; t++)
|
| 151 |
+
for (int b = 0; b < TWD_N_BCTX; b++)
|
| 152 |
+
for (int o = 0; o < TWD_N_ORD; o++)
|
| 153 |
+
for (int s = 0; s < TWD_N_SHAPE; s++)
|
| 154 |
+
for (int c = 0; c < TWD_N_CONF; c++)
|
| 155 |
+
for (int p = 0; p < TWD_N_PROB; p++) {
|
| 156 |
+
TwdCalibEntry *e = &twd.table[t][b][o][s][c][p];
|
| 157 |
+
double real_obs = e->total - TWD_PRIOR_WEIGHT;
|
| 158 |
+
if (real_obs < 1.0) continue; /* skip bins with only prior */
|
| 159 |
+
|
| 160 |
+
double avg_pred = e->sum_pred / e->total;
|
| 161 |
+
double emp_rate = e->hits / e->total;
|
| 162 |
+
double delta = emp_rate - avg_pred;
|
| 163 |
+
|
| 164 |
+
sum_abs_delta[c] += fabs(delta);
|
| 165 |
+
sum_weighted_abs_delta[c] += fabs(delta) * real_obs;
|
| 166 |
+
sum_weight[c] += real_obs;
|
| 167 |
+
count[c]++;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
/* ── Output ── */
|
| 171 |
+
printf("# Delta vs Noise Level — %s (%ld bytes)\n", argv[1], file_size);
|
| 172 |
+
printf("# conf_bin\tC_repr\tgamma\tmean_abs_delta\tweighted_abs_delta\tactive_bins\ttotal_obs\n");
|
| 173 |
+
|
| 174 |
+
for (int c = 0; c < TWD_N_CONF; c++) {
|
| 175 |
+
double C_repr = conf_bin_representative_C(c);
|
| 176 |
+
double gamma = 128.0 / (C_repr + 128.0);
|
| 177 |
+
double mean_d = (count[c] > 0) ? sum_abs_delta[c] / count[c] : 0.0;
|
| 178 |
+
double wmean_d = (sum_weight[c] > 0) ? sum_weighted_abs_delta[c] / sum_weight[c] : 0.0;
|
| 179 |
+
|
| 180 |
+
printf("%d\t%.1f\t%.4f\t%.6f\t%.6f\t%d\t%.0f\n",
|
| 181 |
+
c, C_repr, gamma, mean_d, wmean_d, count[c], sum_weight[c]);
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
/* ── Also output per-step breakdown ── */
|
| 185 |
+
printf("\n# Per-step breakdown:\n");
|
| 186 |
+
printf("# step\tconf_bin\tgamma\tweighted_abs_delta\ttotal_obs\n");
|
| 187 |
+
|
| 188 |
+
for (int t = 0; t < TWD_STEPS; t++) {
|
| 189 |
+
double step_sum_wd[TWD_N_CONF] = {0};
|
| 190 |
+
double step_sum_w[TWD_N_CONF] = {0};
|
| 191 |
+
|
| 192 |
+
for (int b = 0; b < TWD_N_BCTX; b++)
|
| 193 |
+
for (int o = 0; o < TWD_N_ORD; o++)
|
| 194 |
+
for (int s = 0; s < TWD_N_SHAPE; s++)
|
| 195 |
+
for (int c = 0; c < TWD_N_CONF; c++)
|
| 196 |
+
for (int p = 0; p < TWD_N_PROB; p++) {
|
| 197 |
+
TwdCalibEntry *e = &twd.table[t][b][o][s][c][p];
|
| 198 |
+
double real_obs = e->total - TWD_PRIOR_WEIGHT;
|
| 199 |
+
if (real_obs < 1.0) continue;
|
| 200 |
+
|
| 201 |
+
double avg_pred = e->sum_pred / e->total;
|
| 202 |
+
double emp_rate = e->hits / e->total;
|
| 203 |
+
double delta = emp_rate - avg_pred;
|
| 204 |
+
|
| 205 |
+
step_sum_wd[c] += fabs(delta) * real_obs;
|
| 206 |
+
step_sum_w[c] += real_obs;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
for (int c = 0; c < TWD_N_CONF; c++) {
|
| 210 |
+
double C_repr = conf_bin_representative_C(c);
|
| 211 |
+
double gamma = 128.0 / (C_repr + 128.0);
|
| 212 |
+
double wmean_d = (step_sum_w[c] > 0) ? step_sum_wd[c] / step_sum_w[c] : 0.0;
|
| 213 |
+
printf("%d\t%d\t%.4f\t%.6f\t%.0f\n",
|
| 214 |
+
t, c, gamma, wmean_d, step_sum_w[c]);
|
| 215 |
+
}
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
free(data);
|
| 219 |
+
ppm_free(&ppm);
|
| 220 |
+
match_free(&match);
|
| 221 |
+
word_free(&word);
|
| 222 |
+
highctx_free(&hctx);
|
| 223 |
+
ae_free(&enc);
|
| 224 |
+
|
| 225 |
+
return 0;
|
| 226 |
+
}
|
fastmath.h
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef FASTMATH_H
|
| 2 |
+
#define FASTMATH_H
|
| 3 |
+
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <math.h>
|
| 6 |
+
|
| 7 |
+
/*
|
| 8 |
+
* Fast log/exp approximations using IEEE 754 bit tricks + polynomial correction.
|
| 9 |
+
 * fast_exp is accurate to ~1e-4 relative error; fast_log uses a truncated
 * Taylor series and is much coarser when the mantissa is close to 2.
|
| 10 |
+
*/
|
| 11 |
+
|
| 12 |
+
/*
 * Fast natural-log approximation for positive, normal doubles.
 *
 * x is split into 2^e * m with m in [1, 2) by rewriting the IEEE 754
 * exponent field; log(m) is approximated by a 5-term series in t = m - 1
 * and e * ln(2) is added back.
 *
 * NOTE(review): the coefficients are the plain Taylor coefficients of
 * log(1 + t), not minimax ones, so the error grows with t and reaches about
 * 0.09 (absolute) as m approaches 2. A "< 2e-4" bound does not hold over the
 * whole mantissa range -- confirm callers tolerate this before relying on it.
 * Zero, negative, subnormal, Inf and NaN inputs are not special-cased.
 */
static inline double fast_log(double x) {
    union { double d; uint64_t u; } bits = { .d = x };

    /* Unbiased binary exponent, taken from bits 52..62. */
    const int64_t exponent = (int64_t)((bits.u >> 52) & 0x7FF) - 1023;

    /* Keep the 52 mantissa bits and force the exponent field to the bias,
     * which yields the mantissa as a double in [1, 2). */
    bits.u = (bits.u & 0x000FFFFFFFFFFFFFULL) | 0x3FF0000000000000ULL;
    const double mantissa = bits.d;
    const double t = mantissa - 1.0;

    /* Horner form of t - t^2/2 + t^3/3 - t^4/4 + t^5/5. */
    const double series = t * (1.0 + t * (-0.5 + t * (0.333333333 + t * (-0.25 + t * 0.2))));

    return series + exponent * 0.6931471805599453; /* exponent * ln(2) */
}
|
| 26 |
+
|
| 27 |
+
/*
 * Fast exp(x) approximation.
 *
 * Writes exp(x) as 2^(k + f) with integer k and f in [0, 1), evaluates a
 * degree-5 polynomial for 2^f and scales by 2^k by building the IEEE 754
 * exponent field directly.
 *
 * Returns 0.0 for x < -700, 1e308 for x > 709, and x itself when x is NaN.
 */
static inline double fast_exp(double x) {
    /* NaN fails both range checks below and would reach the int64_t cast,
     * which is undefined behaviour for NaN. Propagate it instead. */
    if (x != x) return x;
    if (x < -700.0) return 0.0;
    if (x > 709.0) return 1e308;

    /* exp(x) = 2^(x / ln 2) = 2^(k + f), k = floor, f = fractional part. */
    double t = x * 1.4426950408889634; /* x / ln(2) */
    int64_t k = (int64_t)t;            /* truncates toward zero ...      */
    if (t < k) k--;                    /* ... so step down for negatives */
    double f = t - k;

    /* 2^f for f in [0, 1): coefficients are ln(2)^n / n!. */
    double p = 1.0 + f * (0.6931471805599453 + f * (0.24022650695910071
             + f * (0.05550410866482158 + f * (0.009618129107628477
             + f * 0.0013333558146428443))));

    /* 2^k as a double: biased exponent k + 1023, zero mantissa. The range
     * checks above keep k + 1023 inside the normal exponent range. */
    union { double d; uint64_t u; } scale;
    scale.u = (uint64_t)(k + 1023) << 52;
    return p * scale.d;
}
|
| 45 |
+
|
| 46 |
+
/*
 * Approximate logit(p) = log(p / (1 - p)).
 *
 * p is first clamped to [1e-8, 1 - 1e-8] so the ratio stays finite and
 * positive; the logarithm of the ratio is then taken with fast_log(), so the
 * result carries fast_log()'s approximation error.
 */
static inline double fast_logit(double p) {
    const double lo = 1e-8;
    const double hi = 1.0 - 1e-8;

    double q = p;
    if (q < lo) q = lo;
    if (q > hi) q = hi;

    const double odds = q / (1.0 - q);
    return fast_log(odds);
}
|
| 56 |
+
|
| 57 |
+
/*
 * Precomputed logit lookup table.
 *
 * Slot i corresponds to the probability i / 65536, for i = 0 .. 65536
 * inclusive (hence 65537 slots). Filled by logit_table_init() and read by
 * logit_table_lookup(), so no logarithm is evaluated per lookup.
 */
#define LOGIT_TABLE_SIZE 65537
typedef struct {
    double table[LOGIT_TABLE_SIZE]; /* logit value per quantised probability */
    int initialized;                /* set to 1 by logit_table_init()        */
} LogitTable;
|
| 67 |
+
|
| 68 |
+
/*
 * Fill every slot of *lt with log(p / (1 - p)) for p = i / 65536, using the
 * exact libm log(). p is clamped to [1e-8, 1 - 1e-8] so the two end slots
 * hold finite values. Marks the table as initialized when done.
 */
static inline void logit_table_init(LogitTable *lt) {
    const double p_min = 1e-8;
    const double p_max = 1.0 - 1e-8;

    double *slot = lt->table;
    for (int i = 0; i < LOGIT_TABLE_SIZE; i++, slot++) {
        double p = (double)i / 65536.0;
        if (p < p_min)
            p = p_min;
        else if (p > p_max)
            p = p_max;
        *slot = log(p / (1.0 - p));
    }
    lt->initialized = 1;
}
|
| 77 |
+
|
| 78 |
+
static inline double logit_table_lookup(const LogitTable *lt, double p) {
|
| 79 |
+
int idx = (int)(p * 65536.0 + 0.5);
|
| 80 |
+
if (idx < 0) idx = 0;
|
| 81 |
+
if (idx >= LOGIT_TABLE_SIZE) idx = LOGIT_TABLE_SIZE - 1;
|
| 82 |
+
return lt->table[idx];
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
/* Fast sqrt (just use hardware — it's already fast) */
|
| 86 |
+
/* static inline double fast_sqrt(double x) { return sqrt(x); } */
|
| 87 |
+
|
| 88 |
+
#endif /* FASTMATH_H */
|
highctx.h
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef HIGHCTX_H
|
| 2 |
+
#define HIGHCTX_H
|
| 3 |
+
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <stdlib.h>
|
| 6 |
+
#include <string.h>
|
| 7 |
+
|
| 8 |
+
/*
|
| 9 |
+
* High-Order Context Model (orders 5-8)
|
| 10 |
+
*
|
| 11 |
+
* Extends effective context beyond PPM's order-4 limit without modifying PPM.
|
| 12 |
+
* Uses hash tables mapping context_hash → count[256] for orders 5, 6, 7, 8.
|
| 13 |
+
* Unlike the match model (which finds one position, predicts one byte),
|
| 14 |
+
* this aggregates ALL matching positions into a full probability distribution.
|
| 15 |
+
*
|
| 16 |
+
* Blended after SSE in the pipeline, preserving diffusion's contribution.
|
| 17 |
+
*/
|
| 18 |
+
|
| 19 |
+
#define HCTX_NSYM 256
#define HCTX_N_ORDERS 4 /* orders 5, 6, 7, 8 */
#define HCTX_MIN_ORDER 5
/* Capacity cap per table (1M slots). Each slot holds 256 16-bit counters
 * plus key and total, i.e. roughly 0.5 KiB, so a table at the cap is about
 * 0.5 GiB and all four tables together about 2 GiB. */
#define HCTX_MAX_ENTRIES (1 << 20)

/* Hash table slot: context hash -> per-byte occurrence counts.
 * key == 0 marks an empty slot. */
typedef struct {
    uint64_t key;               /* context hash; 0 means "unused"          */
    uint16_t counts[HCTX_NSYM]; /* how often each byte followed the context */
    uint32_t total;             /* number of updates recorded in counts[]   */
} HCtxEntry;
|
| 30 |
+
|
| 31 |
+
typedef struct {
|
| 32 |
+
HCtxEntry *entries;
|
| 33 |
+
uint32_t capacity;
|
| 34 |
+
uint32_t mask;
|
| 35 |
+
uint32_t used;
|
| 36 |
+
} HCtxTable;
|
| 37 |
+
|
| 38 |
+
/*
 * Initialise *t as an empty table with `cap` zeroed slots.
 *
 * cap must be a power of two: slots are selected with `key & (cap - 1)`.
 * Aborts if the allocation fails, because every later operation on the table
 * dereferences t->entries unconditionally.
 */
static inline void hctx_table_init(HCtxTable *t, uint32_t cap) {
    HCtxEntry *slots = (HCtxEntry *)calloc(cap, sizeof(HCtxEntry));
    if (!slots)
        abort();

    t->entries = slots;
    t->capacity = cap;
    t->mask = cap - 1;
    t->used = 0;
}
|
| 44 |
+
|
| 45 |
+
/* Release the slot array of *t and clear the pointer so that a repeated
 * call is harmless. capacity/mask/used are left untouched. */
static inline void hctx_table_free(HCtxTable *t) {
    HCtxEntry *owned = t->entries;
    t->entries = NULL;
    free(owned);
}
|
| 49 |
+
|
| 50 |
+
/*
 * Double the capacity of *t and re-insert every occupied slot.
 *
 * The new array is allocated before the table is touched. If that
 * allocation fails the process aborts; previously the rehash loop below
 * would have written through a NULL pointer.
 */
static inline void hctx_table_grow(HCtxTable *t) {
    const uint32_t old_cap = t->capacity;
    HCtxEntry *old = t->entries;
    const uint32_t new_cap = old_cap * 2;

    HCtxEntry *fresh = (HCtxEntry *)calloc(new_cap, sizeof(HCtxEntry));
    if (!fresh)
        abort();

    t->entries = fresh;
    t->capacity = new_cap;
    t->mask = new_cap - 1;
    t->used = 0;

    for (uint32_t i = 0; i < old_cap; i++) {
        if (old[i].key == 0)
            continue; /* empty slot */

        /* Linear probing from the key's home slot in the new array. */
        uint32_t idx = (uint32_t)(old[i].key & t->mask);
        while (t->entries[idx].key != 0)
            idx = (idx + 1) & t->mask;
        t->entries[idx] = old[i];
        t->used++;
    }
    free(old);
}
|
| 69 |
+
|
| 70 |
+
/* Find or create entry. Returns pointer to entry (NULL if full and create). */
|
| 71 |
+
static inline HCtxEntry *hctx_table_get(HCtxTable *t, uint64_t key, int create) {
|
| 72 |
+
if (create && t->used * 5 > t->capacity * 3) {
|
| 73 |
+
if (t->capacity < HCTX_MAX_ENTRIES)
|
| 74 |
+
hctx_table_grow(t);
|
| 75 |
+
else
|
| 76 |
+
create = 0; /* at max capacity, only look up existing */
|
| 77 |
+
}
|
| 78 |
+
uint32_t idx = (uint32_t)(key & t->mask);
|
| 79 |
+
for (;;) {
|
| 80 |
+
if (t->entries[idx].key == key)
|
| 81 |
+
return &t->entries[idx];
|
| 82 |
+
if (t->entries[idx].key == 0) {
|
| 83 |
+
if (!create) return NULL;
|
| 84 |
+
t->entries[idx].key = key;
|
| 85 |
+
memset(t->entries[idx].counts, 0, sizeof(t->entries[idx].counts));
|
| 86 |
+
t->entries[idx].total = 0;
|
| 87 |
+
t->used++;
|
| 88 |
+
return &t->entries[idx];
|
| 89 |
+
}
|
| 90 |
+
idx = (idx + 1) & t->mask;
|
| 91 |
+
}
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
/*
 * 64-bit FNV-1a hash of data[0 .. len-1].
 * A result of 0 is remapped to 1 because 0 is the "empty slot" marker in the
 * hash tables.
 */
static inline uint64_t hctx_hash(const uint8_t *data, int len) {
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t acc = 14695981039346656037ULL; /* FNV offset basis */

    const uint8_t *p = data;
    for (int left = len; left > 0; left--)
        acc = (acc ^ *p++) * fnv_prime;

    return acc ? acc : 1;
}
|
| 104 |
+
|
| 105 |
+
typedef struct {
|
| 106 |
+
HCtxTable tables[HCTX_N_ORDERS]; /* orders 5, 6, 7, 8 */
|
| 107 |
+
uint8_t *history;
|
| 108 |
+
int hist_len;
|
| 109 |
+
int hist_cap;
|
| 110 |
+
} HighCtxModel;
|
| 111 |
+
|
| 112 |
+
/*
 * Initialise *m: one 8192-slot table per order and an empty 4096-byte
 * history buffer. Aborts if the history buffer cannot be allocated, since
 * highctx_update() writes into it unconditionally.
 */
static inline void highctx_init(HighCtxModel *m) {
    for (int i = 0; i < HCTX_N_ORDERS; i++)
        hctx_table_init(&m->tables[i], 8192);

    m->hist_cap = 4096;
    m->hist_len = 0;
    m->history = (uint8_t *)malloc((size_t)m->hist_cap);
    if (!m->history)
        abort();
}
|
| 119 |
+
|
| 120 |
+
/* Release all tables and the history buffer owned by *m. */
static inline void highctx_free(HighCtxModel *m) {
    int i = HCTX_N_ORDERS;
    while (i-- > 0)
        hctx_table_free(&m->tables[i]);

    uint8_t *hist = m->history;
    m->history = NULL;
    free(hist);
}
|
| 126 |
+
|
| 127 |
+
/*
|
| 128 |
+
* Predict: try highest order first (8, 7, 6, 5).
|
| 129 |
+
* Use the highest order that has a context with total >= min_count.
|
| 130 |
+
* Returns 1 if prediction available, fills probs[256] and *out_conf.
|
| 131 |
+
*/
|
| 132 |
+
static inline int highctx_predict(HighCtxModel *m, double *probs, double *out_conf) {
|
| 133 |
+
int n = m->hist_len;
|
| 134 |
+
|
| 135 |
+
for (int oidx = HCTX_N_ORDERS - 1; oidx >= 0; oidx--) {
|
| 136 |
+
int order = HCTX_MIN_ORDER + oidx; /* 8, 7, 6, 5 */
|
| 137 |
+
if (n < order) continue;
|
| 138 |
+
|
| 139 |
+
uint64_t key = hctx_hash(m->history + n - order, order);
|
| 140 |
+
HCtxEntry *e = hctx_table_get(&m->tables[oidx], key, 0);
|
| 141 |
+
if (!e || e->total < 4) continue;
|
| 142 |
+
|
| 143 |
+
/* Build distribution: sparse smoothing to avoid zero probs */
|
| 144 |
+
double smooth = 1e-4;
|
| 145 |
+
double total_smooth = e->total + smooth * HCTX_NSYM;
|
| 146 |
+
double inv = 1.0 / total_smooth;
|
| 147 |
+
for (int s = 0; s < HCTX_NSYM; s++)
|
| 148 |
+
probs[s] = (e->counts[s] + smooth) * inv;
|
| 149 |
+
|
| 150 |
+
/* Confidence: ramps slowly, requires real data */
|
| 151 |
+
double count_conf = (e->total - 4.0) / (e->total + 8.0); /* 0 at total=4, ~0.7 at 20 */
|
| 152 |
+
if (count_conf < 0) count_conf = 0;
|
| 153 |
+
double order_factor = 0.4 + (order - HCTX_MIN_ORDER) * 0.1; /* 0.4 for o5, 0.7 for o8 */
|
| 154 |
+
*out_conf = count_conf * order_factor;
|
| 155 |
+
|
| 156 |
+
return 1;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
*out_conf = 0.0;
|
| 160 |
+
return 0;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
/*
|
| 164 |
+
* Update: increment counts for all available orders.
|
| 165 |
+
*/
|
| 166 |
+
static inline void highctx_update(HighCtxModel *m, uint8_t byte) {
|
| 167 |
+
int n = m->hist_len;
|
| 168 |
+
|
| 169 |
+
for (int oidx = 0; oidx < HCTX_N_ORDERS; oidx++) {
|
| 170 |
+
int order = HCTX_MIN_ORDER + oidx;
|
| 171 |
+
if (n >= order) {
|
| 172 |
+
uint64_t key = hctx_hash(m->history + n - order, order);
|
| 173 |
+
HCtxEntry *e = hctx_table_get(&m->tables[oidx], key, 1);
|
| 174 |
+
if (e) {
|
| 175 |
+
e->counts[byte]++;
|
| 176 |
+
e->total++;
|
| 177 |
+
}
|
| 178 |
+
}
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
/* Append to history */
|
| 182 |
+
if (m->hist_len >= m->hist_cap) {
|
| 183 |
+
m->hist_cap *= 2;
|
| 184 |
+
m->history = (uint8_t *)realloc(m->history, m->hist_cap);
|
| 185 |
+
}
|
| 186 |
+
m->history[m->hist_len++] = byte;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
/*
|
| 190 |
+
* Blend high-context prediction into existing probability distribution.
|
| 191 |
+
*/
|
| 192 |
+
static inline void blend_highctx(double *probs, const double *hctx_probs,
|
| 193 |
+
double hctx_conf) {
|
| 194 |
+
if (hctx_conf < 0.01) return;
|
| 195 |
+
double weight = hctx_conf * 2.0;
|
| 196 |
+
if (weight > 0.60) weight = 0.60;
|
| 197 |
+
double sum = 0.0;
|
| 198 |
+
for (int i = 0; i < HCTX_NSYM; i++) {
|
| 199 |
+
probs[i] = probs[i] * (1.0 - weight) + hctx_probs[i] * weight;
|
| 200 |
+
if (probs[i] < 1e-10) probs[i] = 1e-10;
|
| 201 |
+
sum += probs[i];
|
| 202 |
+
}
|
| 203 |
+
double inv = 1.0 / sum;
|
| 204 |
+
for (int i = 0; i < HCTX_NSYM; i++)
|
| 205 |
+
probs[i] *= inv;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
#endif /* HIGHCTX_H */
|
match.h
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef MATCH_H
|
| 2 |
+
#define MATCH_H
|
| 3 |
+
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <stdlib.h>
|
| 6 |
+
#include <string.h>
|
| 7 |
+
|
| 8 |
+
#define MATCH_NSYM 256
#define MATCH_N_CTX 5 /* context lengths: 4, 6, 8, 12, 16 */

/* Hash table slot: context hash -> position in history.
 * key == 0 marks an empty slot. */
typedef struct {
    uint64_t key; /* context hash; 0 means "unused"                  */
    uint32_t pos; /* history index of the byte following the context */
} MatchHTEntry;
|
| 16 |
+
|
| 17 |
+
typedef struct {
|
| 18 |
+
MatchHTEntry *entries;
|
| 19 |
+
uint32_t capacity;
|
| 20 |
+
uint32_t mask;
|
| 21 |
+
} MatchHT;
|
| 22 |
+
|
| 23 |
+
/*
 * Initialise *t with `cap` zeroed slots.
 *
 * cap must be a power of two: slots are selected with `key & (cap - 1)`.
 * Aborts if the allocation fails, because mht_set()/mht_get() dereference
 * t->entries unconditionally.
 */
static inline void mht_init(MatchHT *t, uint32_t cap) {
    MatchHTEntry *slots = (MatchHTEntry *)calloc(cap, sizeof(MatchHTEntry));
    if (!slots)
        abort();

    t->entries = slots;
    t->capacity = cap;
    t->mask = cap - 1;
}
|
| 28 |
+
|
| 29 |
+
/* Release the slot array of *t. The pointer is cleared afterwards so that a
 * second call, or a stray use after free, hits NULL instead of freed memory
 * (this mirrors what hctx_table_free() does for its table). */
static inline void mht_free(MatchHT *t) {
    free(t->entries);
    t->entries = NULL;
}
|
| 32 |
+
|
| 33 |
+
/*
 * Double the capacity of *t and re-insert every occupied slot.
 *
 * Aborts when the doubled capacity would not fit in uint32_t (it would wrap
 * to 0) or when the new array cannot be allocated; previously the rehash
 * loop would have written through a NULL pointer in the latter case.
 */
static inline void mht_grow(MatchHT *t) {
    const uint32_t old_cap = t->capacity;
    MatchHTEntry *old = t->entries;

    if (old_cap > UINT32_MAX / 2)
        abort();
    const uint32_t new_cap = old_cap * 2;

    MatchHTEntry *fresh = (MatchHTEntry *)calloc(new_cap, sizeof(MatchHTEntry));
    if (!fresh)
        abort();

    t->entries = fresh;
    t->capacity = new_cap;
    t->mask = new_cap - 1;

    for (uint32_t i = 0; i < old_cap; i++) {
        if (old[i].key == 0)
            continue; /* empty slot */

        /* Linear probing from the key's home slot in the new array. */
        uint32_t idx = (uint32_t)(old[i].key & t->mask);
        while (t->entries[idx].key != 0)
            idx = (idx + 1) & t->mask;
        t->entries[idx] = old[i];
    }
    free(old);
}
|
| 50 |
+
|
| 51 |
+
/*
 * Store key -> pos, overwriting the position of an existing key.
 *
 * *used is the caller-owned count of occupied slots; it is incremented when
 * a previously empty slot is taken. The table is grown first when it is more
 * than 3/5 full.
 */
static inline void mht_set(MatchHT *t, uint64_t key, uint32_t pos,
                           uint32_t *used) {
    if (*used * 5 > t->capacity * 3)
        mht_grow(t);

    /* Probe until we reach either the key itself or an empty slot. */
    uint32_t slot = (uint32_t)(key & t->mask);
    while (t->entries[slot].key != key && t->entries[slot].key != 0)
        slot = (slot + 1) & t->mask;

    MatchHTEntry *e = &t->entries[slot];
    if (e->key == 0)
        (*used)++;
    e->key = key;
    e->pos = pos;
}
|
| 65 |
+
|
| 66 |
+
/*
 * Look up `key`. On a hit the stored position is written to *out_pos and 1
 * is returned; on a miss 0 is returned and *out_pos is left untouched.
 */
static inline int mht_get(MatchHT *t, uint64_t key, uint32_t *out_pos) {
    for (uint32_t slot = (uint32_t)(key & t->mask);; slot = (slot + 1) & t->mask) {
        const MatchHTEntry *e = &t->entries[slot];
        if (e->key == key) {
            *out_pos = e->pos;
            return 1;
        }
        if (e->key == 0)
            return 0; /* empty slot ends the probe sequence */
    }
}
|
| 77 |
+
|
| 78 |
+
typedef struct {
|
| 79 |
+
int ctx_lens[MATCH_N_CTX];
|
| 80 |
+
MatchHT tables[MATCH_N_CTX];
|
| 81 |
+
uint32_t table_used[MATCH_N_CTX];
|
| 82 |
+
uint8_t *history;
|
| 83 |
+
int hist_len;
|
| 84 |
+
int hist_cap;
|
| 85 |
+
|
| 86 |
+
/* active match state */
|
| 87 |
+
int match_read_pos;
|
| 88 |
+
int match_active;
|
| 89 |
+
int match_streak;
|
| 90 |
+
|
| 91 |
+
/* adaptive accuracy */
|
| 92 |
+
double hits;
|
| 93 |
+
double total;
|
| 94 |
+
} MatchModel;
|
| 95 |
+
|
| 96 |
+
/*
 * 64-bit FNV-1a hash of data[0 .. len-1].
 * A result of 0 is remapped to 1 because 0 is the "empty slot" marker in the
 * match hash tables.
 */
static inline uint64_t match_ctx_hash(const uint8_t *data, int len) {
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t acc = 14695981039346656037ULL; /* FNV offset basis */

    const uint8_t *p = data;
    for (int left = len; left > 0; left--)
        acc = (acc ^ *p++) * fnv_prime;

    return acc ? acc : 1;
}
|
| 105 |
+
|
| 106 |
+
/*
 * Initialise *m: context lengths 4, 6, 8, 12 and 16 with one 4096-slot table
 * each, an empty 4096-byte history, no active match, and an accuracy prior
 * of 1 hit out of 2. Aborts if the history buffer cannot be allocated, since
 * match_update() writes into it unconditionally.
 */
static inline void match_init(MatchModel *m) {
    static const int lens[MATCH_N_CTX] = { 4, 6, 8, 12, 16 };

    for (int i = 0; i < MATCH_N_CTX; i++) {
        m->ctx_lens[i] = lens[i];
        mht_init(&m->tables[i], 4096);
        m->table_used[i] = 0;
    }

    m->hist_cap = 4096;
    m->hist_len = 0;
    m->history = (uint8_t *)malloc((size_t)m->hist_cap);
    if (!m->history)
        abort();

    m->match_read_pos = -1;
    m->match_active = 0;
    m->match_streak = 0;
    m->hits = 1.0;
    m->total = 2.0;
}
|
| 125 |
+
|
| 126 |
+
/* Release all tables and the history buffer owned by *m. */
static inline void match_free(MatchModel *m) {
    int i = MATCH_N_CTX;
    while (i-- > 0)
        mht_free(&m->tables[i]);

    uint8_t *hist = m->history;
    m->history = NULL;
    free(hist);
}
|
| 132 |
+
|
| 133 |
+
/*
|
| 134 |
+
* predict: returns predicted byte via *out_byte, confidence via *out_conf.
|
| 135 |
+
* Returns 1 if prediction available, 0 otherwise.
|
| 136 |
+
*/
|
| 137 |
+
static inline int match_predict(MatchModel *m, int *out_byte, double *out_conf) {
|
| 138 |
+
/* 1. Continue active match */
|
| 139 |
+
if (m->match_active && m->match_read_pos >= 0
|
| 140 |
+
&& m->match_read_pos < m->hist_len) {
|
| 141 |
+
*out_byte = m->history[m->match_read_pos];
|
| 142 |
+
double base = m->hits / m->total;
|
| 143 |
+
double conf = base * (0.65 + m->match_streak * 0.04);
|
| 144 |
+
if (conf > 0.96) conf = 0.96;
|
| 145 |
+
*out_conf = conf;
|
| 146 |
+
return 1;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
m->match_active = 0;
|
| 150 |
+
|
| 151 |
+
/* 2. Try new match (longest context first) */
|
| 152 |
+
for (int idx = MATCH_N_CTX - 1; idx >= 0; idx--) {
|
| 153 |
+
int ctx_len = m->ctx_lens[idx];
|
| 154 |
+
int n = m->hist_len;
|
| 155 |
+
if (n < ctx_len) continue;
|
| 156 |
+
|
| 157 |
+
uint64_t key = match_ctx_hash(m->history + n - ctx_len, ctx_len);
|
| 158 |
+
uint32_t pos;
|
| 159 |
+
if (mht_get(&m->tables[idx], key, &pos) && pos < (uint32_t)n) {
|
| 160 |
+
*out_byte = m->history[pos];
|
| 161 |
+
m->match_active = 1;
|
| 162 |
+
m->match_read_pos = (int)pos;
|
| 163 |
+
m->match_streak = 0;
|
| 164 |
+
double base = m->hits / m->total;
|
| 165 |
+
double conf = base * (ctx_len / 6.0);
|
| 166 |
+
if (conf > base * 0.9) conf = base * 0.9;
|
| 167 |
+
*out_conf = conf;
|
| 168 |
+
return 1;
|
| 169 |
+
}
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
*out_byte = -1;
|
| 173 |
+
*out_conf = 0.0;
|
| 174 |
+
return 0;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
/*
 * Feed the byte that actually occurred into the model.
 *
 * 1. If a match is being followed, score its prediction: a hit extends the
 *    streak and advances the read position, a miss drops the match. The
 *    accuracy counters are decayed by 1% once `total` exceeds 500.
 * 2. Register the current context of every length as pointing at the
 *    position this byte is about to occupy.
 * 3. Append the byte to the history, doubling the buffer when it is full.
 *
 * History growth aborts on allocation failure or when doubling the capacity
 * would overflow int, instead of overwriting the only pointer to the buffer
 * with NULL and then writing through it.
 */
static inline void match_update(MatchModel *m, uint8_t actual_byte) {
    /* track accuracy of active match */
    if (m->match_active && m->match_read_pos >= 0
        && m->match_read_pos < m->hist_len) {
        const int predicted = m->history[m->match_read_pos];
        m->total += 1.0;
        if (predicted == actual_byte) {
            m->hits += 1.0;
            m->match_streak++;
            m->match_read_pos++;
        } else {
            m->match_active = 0;
            m->match_streak = 0;
        }
        if (m->total > 500.0) {
            m->hits *= 0.99;
            m->total *= 0.99;
        }
    }

    /* store context -> position */
    const int n = m->hist_len;
    for (int tidx = 0; tidx < MATCH_N_CTX; tidx++) {
        const int ctx_len = m->ctx_lens[tidx];
        if (n < ctx_len)
            continue;
        const uint64_t key = match_ctx_hash(m->history + n - ctx_len, ctx_len);
        mht_set(&m->tables[tidx], key, (uint32_t)n, &m->table_used[tidx]);
    }

    /* append to history */
    if (m->hist_len >= m->hist_cap) {
        if (m->hist_cap >= (1 << 30))
            abort(); /* hist_cap * 2 would overflow int */
        const int new_cap = m->hist_cap * 2;
        uint8_t *grown = (uint8_t *)realloc(m->history, (size_t)new_cap);
        if (!grown)
            abort(); /* m->history is still valid here, but we cannot go on */
        m->history = grown;
        m->hist_cap = new_cap;
    }
    m->history[m->hist_len++] = actual_byte;
}
|
| 215 |
+
|
| 216 |
+
static inline void blend_match(double *probs, int match_byte,
|
| 217 |
+
double match_confidence) {
|
| 218 |
+
if (match_byte < 0) return;
|
| 219 |
+
double weight = match_confidence * 0.85;
|
| 220 |
+
if (weight > 0.95) weight = 0.95;
|
| 221 |
+
for (int i = 0; i < MATCH_NSYM; i++)
|
| 222 |
+
probs[i] *= (1.0 - weight);
|
| 223 |
+
probs[match_byte] += weight;
|
| 224 |
+
double sum = 0.0;
|
| 225 |
+
for (int i = 0; i < MATCH_NSYM; i++) {
|
| 226 |
+
if (probs[i] < 1e-10) probs[i] = 1e-10;
|
| 227 |
+
sum += probs[i];
|
| 228 |
+
}
|
| 229 |
+
double inv = 1.0 / sum;
|
| 230 |
+
for (int i = 0; i < MATCH_NSYM; i++)
|
| 231 |
+
probs[i] *= inv;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
#endif /* MATCH_H */
|
mdc.c
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Midicoth Compressor — C implementation
|
| 3 |
+
* Pipeline: PPM + Tweedie Denoising + Match + Word + HighCtx
|
| 4 |
+
*
|
| 5 |
+
* Usage:
|
| 6 |
+
* ./mdc compress <input> <output>
|
| 7 |
+
* ./mdc decompress <input> <output>
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <stdio.h>
|
| 11 |
+
#include <stdlib.h>
|
| 12 |
+
#include <string.h>
|
| 13 |
+
#include <math.h>
|
| 14 |
+
#include <time.h>
|
| 15 |
+
|
| 16 |
+
#include "fastmath.h"
|
| 17 |
+
#include "arith.h"
|
| 18 |
+
#include "ppm.h"
|
| 19 |
+
#include "tweedie.h"
|
| 20 |
+
#include "match.h"
|
| 21 |
+
#include "word.h"
|
| 22 |
+
#include "highctx.h"
|
| 23 |
+
|
| 24 |
+
#define MAGIC "MDC7"
#define SCALE (1 << 14)

/* ── Helpers ── */

/*
 * Quantise probs[0..255] into integer frequencies and write their running
 * sums to cumfreqs[0..256] (cumfreqs[0] is 0). Each frequency is
 * round(probs[i] * SCALE) but never less than 1, so every symbol stays
 * codable; as a consequence the total may exceed SCALE. The total is also
 * returned through *out_total.
 */
static void probs_to_cumfreqs(const double *probs, int64_t *cumfreqs,
                              int64_t *out_total) {
    int64_t running = 0;
    for (int sym = 0; sym < 256; sym++) {
        cumfreqs[sym] = running;
        const int64_t freq = (int64_t)(probs[sym] * SCALE + 0.5);
        running += (freq < 1) ? 1 : freq;
    }
    cumfreqs[256] = running;
    *out_total = running;
}
|
| 39 |
+
|
| 40 |
+
/*
 * Floor every entry of probs[0..255] at 1e-10, then rescale the vector so
 * that it sums to 1.
 */
static void clamp_normalize(double *probs) {
    const double floor_p = 1e-10;
    double *const end = probs + 256;

    double total = 0.0;
    for (double *p = probs; p != end; ++p) {
        if (*p < floor_p)
            *p = floor_p;
        total += *p;
    }

    const double scale = 1.0 / total;
    for (double *p = probs; p != end; ++p)
        *p *= scale;
}
|
| 50 |
+
|
| 51 |
+
/* ── Compress ── */
|
| 52 |
+
|
| 53 |
+
static int do_compress(const char *input_path, const char *output_path) {
|
| 54 |
+
FILE *fin = fopen(input_path, "rb");
|
| 55 |
+
if (!fin) { perror(input_path); return 1; }
|
| 56 |
+
fseek(fin, 0, SEEK_END);
|
| 57 |
+
long file_size = ftell(fin);
|
| 58 |
+
fseek(fin, 0, SEEK_SET);
|
| 59 |
+
uint8_t *data = (uint8_t *)malloc(file_size);
|
| 60 |
+
if (fread(data, 1, file_size, fin) != (size_t)file_size) {
|
| 61 |
+
fprintf(stderr, "Read error\n"); fclose(fin); return 1;
|
| 62 |
+
}
|
| 63 |
+
fclose(fin);
|
| 64 |
+
|
| 65 |
+
uint64_t original_size = (uint64_t)file_size;
|
| 66 |
+
printf(" Input: %s (%lu bytes)\n", input_path, (unsigned long)original_size);
|
| 67 |
+
|
| 68 |
+
if (original_size == 0) {
|
| 69 |
+
FILE *fout = fopen(output_path, "wb");
|
| 70 |
+
fwrite(MAGIC, 1, 4, fout);
|
| 71 |
+
uint64_t zero = 0;
|
| 72 |
+
fwrite(&zero, 8, 1, fout);
|
| 73 |
+
fclose(fout);
|
| 74 |
+
printf(" Empty file -> 12 bytes\n");
|
| 75 |
+
free(data);
|
| 76 |
+
return 0;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
PPMModel ppm; ppm_init(&ppm);
|
| 80 |
+
MatchModel match; match_init(&match);
|
| 81 |
+
WordModel word; word_init(&word);
|
| 82 |
+
HighCtxModel hctx; highctx_init(&hctx);
|
| 83 |
+
ArithEncoder enc; ae_init(&enc);
|
| 84 |
+
TweedieDenoiser twd; tweedie_init(&twd);
|
| 85 |
+
|
| 86 |
+
double probs[256], word_probs[256], hctx_probs[256];
|
| 87 |
+
int64_t cumfreqs[257];
|
| 88 |
+
int64_t total;
|
| 89 |
+
|
| 90 |
+
struct timespec t0, t1;
|
| 91 |
+
clock_gettime(CLOCK_MONOTONIC, &t0);
|
| 92 |
+
|
| 93 |
+
for (uint64_t i = 0; i < original_size; i++) {
|
| 94 |
+
uint8_t byte = data[i];
|
| 95 |
+
|
| 96 |
+
double confidence;
|
| 97 |
+
int order;
|
| 98 |
+
ppm_predict(&ppm, probs, &confidence, &order);
|
| 99 |
+
|
| 100 |
+
tweedie_denoise(&twd, probs, order, confidence);
|
| 101 |
+
clamp_normalize(probs);
|
| 102 |
+
|
| 103 |
+
int match_byte;
|
| 104 |
+
double match_conf;
|
| 105 |
+
match_predict(&match, &match_byte, &match_conf);
|
| 106 |
+
blend_match(probs, match_byte, match_conf);
|
| 107 |
+
|
| 108 |
+
double w_conf;
|
| 109 |
+
if (word_predict_cached(&word, word_probs, &w_conf))
|
| 110 |
+
blend_word_model(probs, word_probs, w_conf);
|
| 111 |
+
|
| 112 |
+
double hctx_conf;
|
| 113 |
+
if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
|
| 114 |
+
blend_highctx(probs, hctx_probs, hctx_conf);
|
| 115 |
+
|
| 116 |
+
probs_to_cumfreqs(probs, cumfreqs, &total);
|
| 117 |
+
ae_encode(&enc, cumfreqs, byte, total);
|
| 118 |
+
|
| 119 |
+
tweedie_update(&twd, byte);
|
| 120 |
+
match_update(&match, byte);
|
| 121 |
+
word_update(&word, byte);
|
| 122 |
+
highctx_update(&hctx, byte);
|
| 123 |
+
ppm_update(&ppm, byte);
|
| 124 |
+
|
| 125 |
+
if ((i + 1) % 50000 == 0) {
|
| 126 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 127 |
+
double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
|
| 128 |
+
double pct = (i + 1) * 100.0 / original_size;
|
| 129 |
+
double speed = (i + 1) / elapsed;
|
| 130 |
+
fprintf(stderr, "\r %5.1f%% (%lu/%lu) %.0f B/s",
|
| 131 |
+
pct, (unsigned long)(i + 1), (unsigned long)original_size, speed);
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
ae_finish(&enc);
|
| 136 |
+
|
| 137 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 138 |
+
double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
|
| 139 |
+
fprintf(stderr, "\r \r");
|
| 140 |
+
|
| 141 |
+
FILE *fout = fopen(output_path, "wb");
|
| 142 |
+
if (!fout) { perror(output_path); return 1; }
|
| 143 |
+
fwrite(MAGIC, 1, 4, fout);
|
| 144 |
+
fwrite(&original_size, 8, 1, fout);
|
| 145 |
+
fwrite(enc.buf, 1, enc.buf_len, fout);
|
| 146 |
+
fclose(fout);
|
| 147 |
+
|
| 148 |
+
uint64_t total_size = 4 + 8 + enc.buf_len;
|
| 149 |
+
double ratio = (double)total_size / original_size;
|
| 150 |
+
printf(" Output: %s (%lu bytes)\n", output_path, (unsigned long)total_size);
|
| 151 |
+
printf(" Ratio: %.4f (%.2f%%)\n", ratio, ratio * 100.0);
|
| 152 |
+
printf(" Time: %.1fs (%.0f B/s)\n", elapsed, original_size / elapsed);
|
| 153 |
+
|
| 154 |
+
ae_free(&enc);
|
| 155 |
+
ppm_free(&ppm);
|
| 156 |
+
match_free(&match);
|
| 157 |
+
word_free(&word);
|
| 158 |
+
highctx_free(&hctx);
|
| 159 |
+
free(data);
|
| 160 |
+
return 0;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
/* ── Decompress ── */
|
| 164 |
+
|
| 165 |
+
static int do_decompress(const char *input_path, const char *output_path) {
|
| 166 |
+
FILE *fin = fopen(input_path, "rb");
|
| 167 |
+
if (!fin) { perror(input_path); return 1; }
|
| 168 |
+
|
| 169 |
+
char magic[4];
|
| 170 |
+
if (fread(magic, 1, 4, fin) != 4 || memcmp(magic, MAGIC, 4) != 0) {
|
| 171 |
+
fprintf(stderr, "Error: not a MDC7 file\n");
|
| 172 |
+
fclose(fin);
|
| 173 |
+
return 1;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
uint64_t original_size;
|
| 177 |
+
if (fread(&original_size, 8, 1, fin) != 1) {
|
| 178 |
+
fprintf(stderr, "Read error\n"); fclose(fin); return 1;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
fseek(fin, 0, SEEK_END);
|
| 182 |
+
long fsize = ftell(fin);
|
| 183 |
+
fseek(fin, 12, SEEK_SET);
|
| 184 |
+
size_t comp_len = (size_t)(fsize - 12);
|
| 185 |
+
uint8_t *compressed = (uint8_t *)malloc(comp_len);
|
| 186 |
+
if (fread(compressed, 1, comp_len, fin) != comp_len) {
|
| 187 |
+
fprintf(stderr, "Read error\n"); fclose(fin); return 1;
|
| 188 |
+
}
|
| 189 |
+
fclose(fin);
|
| 190 |
+
|
| 191 |
+
printf(" Input: %s (%ld bytes)\n", input_path, fsize);
|
| 192 |
+
printf(" Original size: %lu bytes\n", (unsigned long)original_size);
|
| 193 |
+
|
| 194 |
+
if (original_size == 0) {
|
| 195 |
+
FILE *fout = fopen(output_path, "wb");
|
| 196 |
+
fclose(fout);
|
| 197 |
+
printf(" Empty file\n");
|
| 198 |
+
free(compressed);
|
| 199 |
+
return 0;
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
PPMModel ppm; ppm_init(&ppm);
|
| 203 |
+
MatchModel match; match_init(&match);
|
| 204 |
+
WordModel word; word_init(&word);
|
| 205 |
+
HighCtxModel hctx; highctx_init(&hctx);
|
| 206 |
+
ArithDecoder dec; ad_init(&dec, compressed, comp_len);
|
| 207 |
+
TweedieDenoiser twd; tweedie_init(&twd);
|
| 208 |
+
|
| 209 |
+
uint8_t *result = (uint8_t *)malloc(original_size);
|
| 210 |
+
|
| 211 |
+
double probs[256], word_probs[256], hctx_probs[256];
|
| 212 |
+
int64_t cumfreqs[257];
|
| 213 |
+
int64_t total;
|
| 214 |
+
|
| 215 |
+
struct timespec t0, t1;
|
| 216 |
+
clock_gettime(CLOCK_MONOTONIC, &t0);
|
| 217 |
+
|
| 218 |
+
for (uint64_t i = 0; i < original_size; i++) {
|
| 219 |
+
double confidence;
|
| 220 |
+
int order;
|
| 221 |
+
ppm_predict(&ppm, probs, &confidence, &order);
|
| 222 |
+
|
| 223 |
+
tweedie_denoise(&twd, probs, order, confidence);
|
| 224 |
+
clamp_normalize(probs);
|
| 225 |
+
|
| 226 |
+
int match_byte;
|
| 227 |
+
double match_conf;
|
| 228 |
+
match_predict(&match, &match_byte, &match_conf);
|
| 229 |
+
blend_match(probs, match_byte, match_conf);
|
| 230 |
+
|
| 231 |
+
double w_conf;
|
| 232 |
+
if (word_predict_cached(&word, word_probs, &w_conf))
|
| 233 |
+
blend_word_model(probs, word_probs, w_conf);
|
| 234 |
+
|
| 235 |
+
double hctx_conf;
|
| 236 |
+
if (highctx_predict(&hctx, hctx_probs, &hctx_conf))
|
| 237 |
+
blend_highctx(probs, hctx_probs, hctx_conf);
|
| 238 |
+
|
| 239 |
+
probs_to_cumfreqs(probs, cumfreqs, &total);
|
| 240 |
+
int sym = ad_decode(&dec, cumfreqs, total);
|
| 241 |
+
result[i] = (uint8_t)sym;
|
| 242 |
+
|
| 243 |
+
tweedie_update(&twd, (uint8_t)sym);
|
| 244 |
+
match_update(&match, (uint8_t)sym);
|
| 245 |
+
word_update(&word, (uint8_t)sym);
|
| 246 |
+
highctx_update(&hctx, (uint8_t)sym);
|
| 247 |
+
ppm_update(&ppm, (uint8_t)sym);
|
| 248 |
+
|
| 249 |
+
if ((i + 1) % 50000 == 0) {
|
| 250 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 251 |
+
double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
|
| 252 |
+
double pct = (i + 1) * 100.0 / original_size;
|
| 253 |
+
double speed = (i + 1) / elapsed;
|
| 254 |
+
fprintf(stderr, "\r %5.1f%% (%lu/%lu) %.0f B/s",
|
| 255 |
+
pct, (unsigned long)(i + 1), (unsigned long)original_size, speed);
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 260 |
+
double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
|
| 261 |
+
fprintf(stderr, "\r \r");
|
| 262 |
+
|
| 263 |
+
FILE *fout = fopen(output_path, "wb");
|
| 264 |
+
fwrite(result, 1, original_size, fout);
|
| 265 |
+
fclose(fout);
|
| 266 |
+
|
| 267 |
+
printf(" Output: %s (%lu bytes)\n", output_path, (unsigned long)original_size);
|
| 268 |
+
printf(" Time: %.1fs (%.0f B/s)\n", elapsed, original_size / elapsed);
|
| 269 |
+
|
| 270 |
+
ppm_free(&ppm);
|
| 271 |
+
match_free(&match);
|
| 272 |
+
word_free(&word);
|
| 273 |
+
highctx_free(&hctx);
|
| 274 |
+
free(compressed);
|
| 275 |
+
free(result);
|
| 276 |
+
return 0;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
/* ── Main ── */
|
| 280 |
+
|
| 281 |
+
/*
 * Command-line entry point.
 *
 * Expects exactly three arguments: the command ("compress" or
 * "decompress"), the input path and the output path. Returns the status of
 * the selected operation, or 1 on a usage error / unknown command.
 */
int main(int argc, char **argv) {
    if (argc != 4) {
        fprintf(stderr, "Usage: %s compress|decompress <input> <output>\n", argv[0]);
        return 1;
    }

    const char *command = argv[1];
    const char *src_path = argv[2];
    const char *dst_path = argv[3];

    if (!strcmp(command, "compress"))
        return do_compress(src_path, dst_path);

    if (!strcmp(command, "decompress"))
        return do_decompress(src_path, dst_path);

    fprintf(stderr, "Unknown command: %s\n", command);
    return 1;
}
|
ppm.h
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef PPM_H
|
| 2 |
+
#define PPM_H
|
| 3 |
+
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <stdlib.h>
|
| 6 |
+
#include <string.h>
|
| 7 |
+
#include <math.h>
|
| 8 |
+
|
| 9 |
+
#define PPM_MAX_ORDER 4
|
| 10 |
+
#define PPM_NSYM 256
|
| 11 |
+
#define PPM_PRIOR 0.5
|
| 12 |
+
|
| 13 |
+
/*
|
| 14 |
+
* Hash table entry: maps a 64-bit context hash to a count array.
|
| 15 |
+
* counts[i] stores the (float) count for symbol i.
|
| 16 |
+
* total caches sum(counts).
|
| 17 |
+
* key == 0 means empty slot.
|
| 18 |
+
*/
|
| 19 |
+
typedef struct {
|
| 20 |
+
uint64_t key; /* context hash (0 = empty) */
|
| 21 |
+
double counts[PPM_NSYM];
|
| 22 |
+
double total;
|
| 23 |
+
} PPMEntry;
|
| 24 |
+
|
| 25 |
+
typedef struct {
|
| 26 |
+
PPMEntry *entries;
|
| 27 |
+
uint32_t capacity; /* power of 2 */
|
| 28 |
+
uint32_t used;
|
| 29 |
+
} PPMTable;
|
| 30 |
+
|
| 31 |
+
typedef struct {
|
| 32 |
+
PPMTable tables[PPM_MAX_ORDER + 1]; /* order 0..4 */
|
| 33 |
+
uint8_t *history;
|
| 34 |
+
int hist_len;
|
| 35 |
+
int hist_cap;
|
| 36 |
+
} PPMModel;
|
| 37 |
+
|
| 38 |
+
/* ── Hash helper ── */
|
| 39 |
+
|
| 40 |
+
/*
 * Hash `len` context bytes into a table key using the FNV-1a recurrence.
 *
 * The result is never 0, because 0 is what the tables use to mark an empty
 * slot. The empty context (len == 0, i.e. order 0) maps to the fixed key 1.
 */
static inline uint64_t ppm_hash_context(const uint8_t *ctx, int len) {
    const uint64_t fnv_offset = 14695981039346656037ULL;
    const uint64_t fnv_prime = 1099511628211ULL;

    if (len == 0)
        return 1;

    uint64_t hash = fnv_offset;
    const uint8_t *cursor = ctx;
    for (int left = len; left > 0; left--) {
        hash ^= *cursor++;
        hash *= fnv_prime;
    }

    /* Remap the one forbidden value. */
    return hash != 0 ? hash : 1;
}
|
| 52 |
+
|
| 53 |
+
/* ── Table operations ── */
|
| 54 |
+
|
| 55 |
+
/*
 * Initialise an empty table with `capacity` zeroed slots.
 *
 * `capacity` must be a power of two: lookups mask indices with
 * (capacity - 1). The function returns void, so an allocation failure
 * cannot be reported to the caller; it aborts instead of leaving a NULL
 * table behind that would be dereferenced by the first lookup.
 */
static inline void ppm_table_init(PPMTable *t, uint32_t capacity) {
    t->capacity = capacity;
    t->used = 0;
    t->entries = (PPMEntry *)calloc(capacity, sizeof(PPMEntry));
    if (t->entries == NULL && capacity != 0)
        abort();
}
|
| 60 |
+
|
| 61 |
+
/* Release the slot array and leave the table with a NULL entries pointer. */
static inline void ppm_table_free(PPMTable *t) {
    PPMEntry *slots = t->entries;
    t->entries = NULL;
    free(slots);
}
|
| 65 |
+
|
| 66 |
+
static inline void ppm_table_grow(PPMTable *t);
|
| 67 |
+
|
| 68 |
+
/*
 * Look up `key` by linear probing from (key & mask).
 * Returns the matching slot, or NULL as soon as an empty slot (key 0) is
 * reached first.
 */
static inline PPMEntry *ppm_table_find(PPMTable *t, uint64_t key) {
    const uint32_t mask = t->capacity - 1;
    uint32_t pos = (uint32_t)(key & mask);

    while (t->entries[pos].key != key) {
        if (t->entries[pos].key == 0)
            return NULL;
        pos = (pos + 1) & mask;
    }
    return &t->entries[pos];
}
|
| 78 |
+
|
| 79 |
+
/*
 * Return the slot for `key`, creating it if it does not exist yet.
 *
 * The table is grown first when it is more than 60% full. A newly created
 * slot starts with every symbol count at PPM_PRIOR and total set to
 * PPM_NSYM * PPM_PRIOR.
 */
static inline PPMEntry *ppm_table_insert(PPMTable *t, uint64_t key) {
    if (t->used * 5 > t->capacity * 3)
        ppm_table_grow(t);

    const uint32_t mask = t->capacity - 1;
    uint32_t pos = (uint32_t)(key & mask);
    PPMEntry *slot = &t->entries[pos];

    /* Probe until we meet either the key itself or a free slot. */
    while (slot->key != key && slot->key != 0) {
        pos = (pos + 1) & mask;
        slot = &t->entries[pos];
    }

    if (slot->key == key)
        return slot;

    /* Free slot: claim it and install the prior. */
    slot->key = key;
    for (int sym = 0; sym < PPM_NSYM; sym++)
        slot->counts[sym] = PPM_PRIOR;
    slot->total = PPM_NSYM * PPM_PRIOR;
    t->used++;
    return slot;
}
|
| 101 |
+
|
| 102 |
+
/*
 * Double the table's capacity and rehash every occupied slot.
 *
 * Entries are copied straight into their new position (same linear-probe
 * placement an insert would choose), so counts and totals are carried over
 * without first being overwritten by the prior.
 *
 * The function returns void, so failures cannot be reported: it aborts if
 * the doubled capacity would wrap 32 bits or if the new array cannot be
 * allocated. The old table is only released after the move has completed.
 */
static inline void ppm_table_grow(PPMTable *t) {
    const uint32_t old_cap = t->capacity;
    PPMEntry *old = t->entries;

    if (old_cap > UINT32_MAX / 2)
        abort();
    const uint32_t new_cap = old_cap * 2;

    PPMEntry *fresh = (PPMEntry *)calloc(new_cap, sizeof(PPMEntry));
    if (fresh == NULL)
        abort();

    const uint32_t mask = new_cap - 1;
    uint32_t moved = 0;
    for (uint32_t i = 0; i < old_cap; i++) {
        if (old[i].key == 0)
            continue;

        /* Keys in the old table are unique, so only look for a free slot. */
        uint32_t idx = (uint32_t)(old[i].key & mask);
        while (fresh[idx].key != 0)
            idx = (idx + 1) & mask;

        fresh[idx] = old[i];
        moved++;
    }

    t->entries = fresh;
    t->capacity = new_cap;
    t->used = moved;
    free(old);
}
|
| 119 |
+
|
| 120 |
+
/* ── PPM Model ── */
|
| 121 |
+
|
| 122 |
+
/*
 * Prepare a PPM model: one 1024-slot table per order and an empty
 * 4096-byte history buffer.
 *
 * Aborts if the history buffer cannot be allocated; the function has no
 * way to report failure and every later update writes through the pointer.
 */
static inline void ppm_init(PPMModel *m) {
    for (int o = 0; o <= PPM_MAX_ORDER; o++)
        ppm_table_init(&m->tables[o], 1024);

    m->hist_cap = 4096;
    m->hist_len = 0;
    m->history = (uint8_t *)malloc((size_t)m->hist_cap);
    if (m->history == NULL)
        abort();
}
|
| 129 |
+
|
| 130 |
+
/* Release all per-order tables and the history buffer of a PPM model. */
static inline void ppm_free(PPMModel *m) {
    int order = 0;
    while (order <= PPM_MAX_ORDER) {
        ppm_table_free(&m->tables[order]);
        order++;
    }

    uint8_t *past = m->history;
    m->history = NULL;
    free(past);
}
|
| 136 |
+
|
| 137 |
+
/*
|
| 138 |
+
* predict_with_confidence: fills probs[256] and returns confidence + order.
|
| 139 |
+
* Matches Python: fallback from max_order down to 0, first context with total > 1.
|
| 140 |
+
* If nothing found, returns uniform.
|
| 141 |
+
*/
|
| 142 |
+
static inline void ppm_predict(PPMModel *m, double *probs,
|
| 143 |
+
double *out_confidence, int *out_order) {
|
| 144 |
+
for (int order = PPM_MAX_ORDER; order >= 0; order--) {
|
| 145 |
+
const uint8_t *ctx_start;
|
| 146 |
+
int ctx_len = order;
|
| 147 |
+
|
| 148 |
+
if (ctx_len > m->hist_len) continue;
|
| 149 |
+
ctx_start = m->history + m->hist_len - ctx_len;
|
| 150 |
+
|
| 151 |
+
uint64_t key = ppm_hash_context(ctx_start, ctx_len);
|
| 152 |
+
PPMEntry *e = ppm_table_find(&m->tables[order], key);
|
| 153 |
+
if (e == NULL) continue;
|
| 154 |
+
if (e->total <= 1.0) continue;
|
| 155 |
+
|
| 156 |
+
double inv_total = 1.0 / e->total;
|
| 157 |
+
for (int i = 0; i < PPM_NSYM; i++)
|
| 158 |
+
probs[i] = e->counts[i] * inv_total;
|
| 159 |
+
|
| 160 |
+
*out_confidence = e->total;
|
| 161 |
+
*out_order = order;
|
| 162 |
+
return;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
/* uniform fallback */
|
| 166 |
+
double u = 1.0 / 256.0;
|
| 167 |
+
for (int i = 0; i < PPM_NSYM; i++)
|
| 168 |
+
probs[i] = u;
|
| 169 |
+
*out_confidence = 0.0;
|
| 170 |
+
*out_order = -1;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
/*
|
| 174 |
+
* update: add symbol count to all orders (0..4) where context is available.
|
| 175 |
+
* Then append symbol to history.
|
| 176 |
+
*/
|
| 177 |
+
static inline void ppm_update(PPMModel *m, uint8_t symbol) {
|
| 178 |
+
for (int order = 0; order <= PPM_MAX_ORDER; order++) {
|
| 179 |
+
int ctx_len = order;
|
| 180 |
+
if (ctx_len > m->hist_len) continue;
|
| 181 |
+
|
| 182 |
+
const uint8_t *ctx_start = m->history + m->hist_len - ctx_len;
|
| 183 |
+
uint64_t key = ppm_hash_context(ctx_start, ctx_len);
|
| 184 |
+
|
| 185 |
+
PPMEntry *e = ppm_table_insert(&m->tables[order], key);
|
| 186 |
+
e->counts[symbol] += 1.0;
|
| 187 |
+
e->total += 1.0;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
/* append to history */
|
| 191 |
+
if (m->hist_len >= m->hist_cap) {
|
| 192 |
+
m->hist_cap *= 2;
|
| 193 |
+
m->history = (uint8_t *)realloc(m->history, m->hist_cap);
|
| 194 |
+
}
|
| 195 |
+
m->history[m->hist_len++] = symbol;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
#endif /* PPM_H */
|
tweedie.h
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef TWEEDIE_H
|
| 2 |
+
#define TWEEDIE_H
|
| 3 |
+
|
| 4 |
+
/*
|
| 5 |
+
* Binary Tree Tweedie Denoiser — score-based reverse diffusion.
|
| 6 |
+
*
|
| 7 |
+
* Forward process (PPM Jeffreys prior):
|
| 8 |
+
* p̂(s) = (n·q(s) + 0.5) / (n + 128) = (1-γ)q(s) + γ·u(s)
|
| 9 |
+
* where γ = 128/(n+128) is the noise level, u = 1/256 uniform.
|
| 10 |
+
*
|
| 11 |
+
* Tweedie's formula gives the optimal denoiser:
|
| 12 |
+
* θ̂ = p̂ + σ² · s(p̂)
|
| 13 |
+
* where s(p̂) = ∇ log m(p̂) is the score of the marginal density.
|
| 14 |
+
*
|
| 15 |
+
* The score is estimated empirically via calibration tables that track
|
| 16 |
+
* the additive correction δ = E[θ|p̂] - E[p̂] = hit_rate - avg_pred.
|
| 17 |
+
* This δ equals σ²·s(p̂) — the full Tweedie correction term.
|
| 18 |
+
*
|
| 19 |
+
* Binary tree decomposition: 256-way → 8 binary decisions (MSB to LSB).
|
| 20 |
+
* Multi-step: K=3 denoising steps with independent score tables.
|
| 21 |
+
* Calibration context: (step, bit_context, order, shape, confidence, prob_bin)
|
| 22 |
+
*/
|
| 23 |
+
|
| 24 |
+
#include <stdint.h>
|
| 25 |
+
#include <string.h>
|
| 26 |
+
#include <math.h>
|
| 27 |
+
#include "fastmath.h"
|
| 28 |
+
|
| 29 |
+
#define TWD_NSYM 256
|
| 30 |
+
|
| 31 |
+
/* Number of reverse diffusion steps */
|
| 32 |
+
#define TWD_STEPS 3
|
| 33 |
+
|
| 34 |
+
/* Binary tree: 8 levels for 256 symbols */
|
| 35 |
+
#define TWD_N_LEVELS 8
|
| 36 |
+
|
| 37 |
+
/* 255 internal nodes: 1 + 2 + 4 + ... + 128 */
|
| 38 |
+
#define TWD_N_NODES 255
|
| 39 |
+
|
| 40 |
+
/* Bit context: encodes level + parent bit values. 27 total. */
|
| 41 |
+
#define TWD_N_BCTX 27
|
| 42 |
+
|
| 43 |
+
/* Calibration dimensions */
|
| 44 |
+
#define TWD_N_ORD 3 /* order groups: {-1,0,1}, {2,3}, {4+} */
|
| 45 |
+
#define TWD_N_SHAPE 4 /* distribution shape bins by max_p */
|
| 46 |
+
#define TWD_N_CONF 8 /* confidence bins (log-spaced) */
|
| 47 |
+
#define TWD_N_PROB 20 /* binary probability bins (logit-spaced) */
|
| 48 |
+
|
| 49 |
+
/* Smoothing pseudo-observations per bucket */
|
| 50 |
+
#define TWD_PRIOR_WEIGHT 32.0
|
| 51 |
+
|
| 52 |
+
/* Logit range for binary probability mapping */
|
| 53 |
+
#define TWD_LOGIT_RANGE 8.0
|
| 54 |
+
|
| 55 |
+
/*
 * Accumulators for one calibration bucket.
 *
 *   sum_pred  sum of the predicted P(right) values that fell in the bucket
 *   hits      how many of those times the true symbol was in the right half
 *   total     number of observations (prior pseudo-observations included)
 */
typedef struct {
    double sum_pred;
    double hits;
    double total;
} TwdCalibEntry;
|
| 60 |
+
|
| 61 |
+
typedef struct {
|
| 62 |
+
/* Calibration table: [step][bctx][order][shape][conf][prob_bin]
|
| 63 |
+
* Total entries: 3 × 27 × 3 × 4 × 8 × 20 = 155,520
|
| 64 |
+
* Memory: 155,520 × 24 bytes = 3.6 MB */
|
| 65 |
+
TwdCalibEntry table[TWD_STEPS][TWD_N_BCTX][TWD_N_ORD][TWD_N_SHAPE][TWD_N_CONF][TWD_N_PROB];
|
| 66 |
+
|
| 67 |
+
/* Cached from denoise, reused by update */
|
| 68 |
+
double cached_p_right[TWD_STEPS][TWD_N_NODES];
|
| 69 |
+
int cached_prob_bin[TWD_STEPS][TWD_N_NODES];
|
| 70 |
+
int cached_bctx[TWD_STEPS][TWD_N_NODES];
|
| 71 |
+
int cached_ord;
|
| 72 |
+
int cached_shape;
|
| 73 |
+
int cached_conf;
|
| 74 |
+
} TweedieDenoiser;
|
| 75 |
+
|
| 76 |
+
/* ── Bucket mapping functions ── */
|
| 77 |
+
|
| 78 |
+
/*
 * Collapse a PPM order into one of three calibration groups:
 * orders up to 1 (including the -1 "no context" case) -> 0,
 * orders 2 and 3 -> 1, anything higher -> 2.
 */
static inline int twd_order_group(int ppm_order) {
    return (ppm_order <= 1) ? 0
         : (ppm_order <= 3) ? 1
         : 2;
}
|
| 83 |
+
|
| 84 |
+
/*
 * Classify a distribution by its largest probability:
 * 0 = very flat (< 0.05), 1 = moderately flat (< 0.15),
 * 2 = moderate peak (< 0.40), 3 = peaked (everything else).
 */
static inline int twd_shape_bin(double max_p) {
    static const double upper_bound[3] = { 0.05, 0.15, 0.40 };

    for (int bin = 0; bin < 3; bin++) {
        if (max_p < upper_bound[bin])
            return bin;
    }
    return 3;
}
|
| 90 |
+
|
| 91 |
+
static inline int twd_conf_bin(double confidence) {
|
| 92 |
+
if (confidence < 4.0) return 0;
|
| 93 |
+
int bin = (int)(fast_log(confidence) * (1.0 / 1.3862943611198906));
|
| 94 |
+
if (bin < 0) bin = 0;
|
| 95 |
+
if (bin > TWD_N_CONF - 1) bin = TWD_N_CONF - 1;
|
| 96 |
+
return bin;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
/* Binary probability bin: logit-spaced in [-8, 8]. */
|
| 100 |
+
static inline int twd_prob_bin(double p) {
|
| 101 |
+
if (p < 1e-8) p = 1e-8;
|
| 102 |
+
if (p > 1.0 - 1e-8) p = 1.0 - 1e-8;
|
| 103 |
+
double logit = fast_log(p / (1.0 - p));
|
| 104 |
+
int bin = (int)((logit + TWD_LOGIT_RANGE) / (2.0 * TWD_LOGIT_RANGE) * TWD_N_PROB);
|
| 105 |
+
if (bin < 0) bin = 0;
|
| 106 |
+
if (bin > TWD_N_PROB - 1) bin = TWD_N_PROB - 1;
|
| 107 |
+
return bin;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
/* Bin center for prior initialization */
|
| 111 |
+
static inline double twd_bin_center(int bin) {
|
| 112 |
+
double logit = ((bin + 0.5) / TWD_N_PROB) * 2.0 * TWD_LOGIT_RANGE - TWD_LOGIT_RANGE;
|
| 113 |
+
return 1.0 / (1.0 + fast_exp(-logit));
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/*
 * Map a tree node, given as (level, index within that level), to a
 * calibration context id in 0..26.
 *
 *   level 0      -> id 0
 *   level 1      -> ids 1..2  (one per node)
 *   level 2      -> ids 3..6  (one per node)
 *   levels 3..7  -> four ids per level; the node index is hashed
 *                   multiplicatively and the top two bits of the 32-bit
 *                   product select the group
 */
static inline int twd_bit_context(int level, int node_at_level) {
    switch (level) {
    case 0:
        return 0;
    case 1:
        return 1 + node_at_level;
    case 2:
        return 3 + node_at_level;
    default:
        break;
    }

    const unsigned hashed = (unsigned)node_at_level * 2654435761U;
    const int group = (int)(hashed >> 30);
    return 7 + (level - 3) * 4 + group;
}
|
| 125 |
+
|
| 126 |
+
/* ── Initialization ── */
|
| 127 |
+
|
| 128 |
+
/*
 * Reset a denoiser: zero everything, then seed every calibration bucket
 * with TWD_PRIOR_WEIGHT pseudo-observations placed at the centre of its
 * probability bin. sum_pred and hits start equal, so the initial
 * correction (hits - sum_pred) / total is zero.
 */
static inline void tweedie_init(TweedieDenoiser *td) {
    memset(td, 0, sizeof(*td));

    /* The prior depends only on the probability bin: build one row once. */
    TwdCalibEntry prior_row[TWD_N_PROB];
    for (int p = 0; p < TWD_N_PROB; p++) {
        const double center = twd_bin_center(p);
        prior_row[p].sum_pred = center * TWD_PRIOR_WEIGHT;
        prior_row[p].hits = center * TWD_PRIOR_WEIGHT;
        prior_row[p].total = TWD_PRIOR_WEIGHT;
    }

    for (int t = 0; t < TWD_STEPS; t++) {
        for (int b = 0; b < TWD_N_BCTX; b++) {
            for (int o = 0; o < TWD_N_ORD; o++) {
                for (int s = 0; s < TWD_N_SHAPE; s++) {
                    for (int c = 0; c < TWD_N_CONF; c++)
                        memcpy(td->table[t][b][o][s][c], prior_row, sizeof prior_row);
                }
            }
        }
    }
}
|
| 143 |
+
|
| 144 |
+
/* ── Denoise: multi-step Tweedie reverse diffusion ──
|
| 145 |
+
*
|
| 146 |
+
* Additive Tweedie correction: p' = p + δ
|
| 147 |
+
* where δ = hits/total - sum_pred/total estimates the Tweedie term σ²·s(p̂).
|
| 148 |
+
*
|
| 149 |
+
* This is the nonparametric Tweedie estimator: within each calibration bin,
|
| 150 |
+
* the empirical hit rate is the posterior mean E[θ|p̂], and the additive
|
| 151 |
+
* correction δ = E[θ|p̂] - E[p̂] equals σ²·∇log m(p̂). */
|
| 152 |
+
|
| 153 |
+
static inline void tweedie_denoise(TweedieDenoiser *td, double *probs,
|
| 154 |
+
int ppm_order, double confidence) {
|
| 155 |
+
int og = twd_order_group(ppm_order);
|
| 156 |
+
int cb = twd_conf_bin(confidence);
|
| 157 |
+
|
| 158 |
+
/* Shape from the 256-way distribution (before any correction) */
|
| 159 |
+
double max_p = 0.0;
|
| 160 |
+
for (int i = 0; i < TWD_NSYM; i++)
|
| 161 |
+
if (probs[i] > max_p) max_p = probs[i];
|
| 162 |
+
int sb = twd_shape_bin(max_p);
|
| 163 |
+
|
| 164 |
+
td->cached_ord = og;
|
| 165 |
+
td->cached_shape = sb;
|
| 166 |
+
td->cached_conf = cb;
|
| 167 |
+
|
| 168 |
+
double stree[512];
|
| 169 |
+
double scale[512];
|
| 170 |
+
|
| 171 |
+
for (int step = 0; step < TWD_STEPS; step++) {
|
| 172 |
+
|
| 173 |
+
/* 1. Build sum tree bottom-up */
|
| 174 |
+
for (int i = 0; i < TWD_NSYM; i++)
|
| 175 |
+
stree[TWD_NSYM + i] = probs[i];
|
| 176 |
+
for (int i = TWD_NSYM - 1; i >= 1; i--)
|
| 177 |
+
stree[i] = stree[2 * i] + stree[2 * i + 1];
|
| 178 |
+
|
| 179 |
+
/* 2. Process all nodes: compute P(right), apply Tweedie correction */
|
| 180 |
+
scale[1] = 1.0;
|
| 181 |
+
|
| 182 |
+
for (int level = 0; level < TWD_N_LEVELS; level++) {
|
| 183 |
+
int level_start = 1 << level;
|
| 184 |
+
int level_end = 1 << (level + 1);
|
| 185 |
+
|
| 186 |
+
for (int ni = level_start; ni < level_end; ni++) {
|
| 187 |
+
double node_total = stree[ni];
|
| 188 |
+
int node_id = ni - 1;
|
| 189 |
+
int node_at_level = ni - level_start;
|
| 190 |
+
|
| 191 |
+
if (node_total < 1e-15) {
|
| 192 |
+
scale[2 * ni] = scale[ni];
|
| 193 |
+
scale[2 * ni + 1] = scale[ni];
|
| 194 |
+
td->cached_p_right[step][node_id] = 0.5;
|
| 195 |
+
td->cached_prob_bin[step][node_id] = twd_prob_bin(0.5);
|
| 196 |
+
td->cached_bctx[step][node_id] = twd_bit_context(level, node_at_level);
|
| 197 |
+
continue;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
double sum_right = stree[2 * ni + 1];
|
| 201 |
+
double p_right = sum_right / node_total;
|
| 202 |
+
if (p_right < 1e-8) p_right = 1e-8;
|
| 203 |
+
if (p_right > 1.0 - 1e-8) p_right = 1.0 - 1e-8;
|
| 204 |
+
|
| 205 |
+
int bctx = twd_bit_context(level, node_at_level);
|
| 206 |
+
int pbin = twd_prob_bin(p_right);
|
| 207 |
+
td->cached_p_right[step][node_id] = p_right;
|
| 208 |
+
td->cached_prob_bin[step][node_id] = pbin;
|
| 209 |
+
td->cached_bctx[step][node_id] = bctx;
|
| 210 |
+
|
| 211 |
+
/* Tweedie additive correction: δ = E[θ|p̂] - E[p̂] */
|
| 212 |
+
TwdCalibEntry *e = &td->table[step][bctx][og][sb][cb][pbin];
|
| 213 |
+
double avg_pred = e->sum_pred / e->total;
|
| 214 |
+
double emp_rate = e->hits / e->total;
|
| 215 |
+
double delta = emp_rate - avg_pred;
|
| 216 |
+
|
| 217 |
+
double p_right_corr = p_right + delta;
|
| 218 |
+
if (p_right_corr < 1e-8) p_right_corr = 1e-8;
|
| 219 |
+
if (p_right_corr > 1.0 - 1e-8) p_right_corr = 1.0 - 1e-8;
|
| 220 |
+
|
| 221 |
+
double sl = (1.0 - p_right_corr) / (1.0 - p_right);
|
| 222 |
+
double sr = p_right_corr / p_right;
|
| 223 |
+
scale[2 * ni] = scale[ni] * sl;
|
| 224 |
+
scale[2 * ni + 1] = scale[ni] * sr;
|
| 225 |
+
}
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
/* 3. Apply accumulated leaf scales */
|
| 229 |
+
for (int i = 0; i < TWD_NSYM; i++)
|
| 230 |
+
probs[i] *= scale[TWD_NSYM + i];
|
| 231 |
+
|
| 232 |
+
/* 4. Renormalize */
|
| 233 |
+
double sum = 0.0;
|
| 234 |
+
for (int i = 0; i < TWD_NSYM; i++) {
|
| 235 |
+
if (probs[i] < 1e-10) probs[i] = 1e-10;
|
| 236 |
+
sum += probs[i];
|
| 237 |
+
}
|
| 238 |
+
double inv = 1.0 / sum;
|
| 239 |
+
for (int i = 0; i < TWD_NSYM; i++)
|
| 240 |
+
probs[i] *= inv;
|
| 241 |
+
|
| 242 |
+
/* Recompute shape after correction for next step */
|
| 243 |
+
max_p = 0.0;
|
| 244 |
+
for (int i = 0; i < TWD_NSYM; i++)
|
| 245 |
+
if (probs[i] > max_p) max_p = probs[i];
|
| 246 |
+
sb = twd_shape_bin(max_p);
|
| 247 |
+
}
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
/* ── Update ── */
|
| 251 |
+
|
| 252 |
+
/*
 * Credit the observed symbol to the calibration buckets recorded by the
 * preceding tweedie_denoise call.
 *
 * For every pass and every tree level, the node on the path from the root
 * to true_symbol is located; its bucket receives the cached P(right), one
 * observation, and one hit if the symbol lies in the node's right half.
 *
 * The node index and direction are read from the bits of true_symbol,
 * which relies on TWD_NSYM == 1 << TWD_N_LEVELS.
 */
static inline void tweedie_update(TweedieDenoiser *td, uint8_t true_symbol) {
    const int ord_grp = td->cached_ord;
    const int shape_idx = td->cached_shape;
    const int conf_idx = td->cached_conf;

    for (int step = 0; step < TWD_STEPS; step++) {
        for (int level = 0; level < TWD_N_LEVELS; level++) {
            /* Blocks at this level span 2^shift symbols. */
            const int shift = TWD_N_LEVELS - level;
            const int node_at_level = true_symbol >> shift;
            const int went_right = (true_symbol >> (shift - 1)) & 1;

            const int node_id = ((1 << level) - 1) + node_at_level;
            const int pbin = td->cached_prob_bin[step][node_id];
            const int bctx = td->cached_bctx[step][node_id];

            TwdCalibEntry *bucket = &td->table[step][bctx][ord_grp][shape_idx][conf_idx][pbin];
            bucket->sum_pred += td->cached_p_right[step][node_id];
            bucket->total += 1.0;
            if (went_right)
                bucket->hits += 1.0;
        }
    }
}
|
| 279 |
+
|
| 280 |
+
#endif /* TWEEDIE_H */
|
word.h
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef WORD_H
|
| 2 |
+
#define WORD_H
|
| 3 |
+
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <stdlib.h>
|
| 6 |
+
#include <string.h>
|
| 7 |
+
|
| 8 |
+
#define WORD_NSYM 256
|
| 9 |
+
|
| 10 |
+
/* ── Word character set ── */
|
| 11 |
+
/*
 * Return 1 if c belongs to the word alphabet: ASCII letters, ASCII digits,
 * apostrophe or hyphen. Return 0 for everything else.
 */
static inline int is_word_char(int c) {
    const int is_lower = (c >= 'a' && c <= 'z');
    const int is_upper = (c >= 'A' && c <= 'Z');
    const int is_digit = (c >= '0' && c <= '9');
    const int is_joiner = (c == '\'' || c == '-');

    return is_lower || is_upper || is_digit || is_joiner;
}
|
| 18 |
+
|
| 19 |
+
/* ── Trie node ── */
|
| 20 |
+
/*
 * Trie node.
 *
 * cont_keys / cont_vals form a small association list of at most 64
 * (byte value, count) pairs; cont_count is the number of pairs in use.
 * children holds one child pointer per possible byte value (NULL = none).
 */
typedef struct TrieNode {
    int cont_keys[64];
    int cont_vals[64];
    int cont_count;

    struct TrieNode *children[256];
} TrieNode;
|
| 29 |
+
|
| 30 |
+
static inline TrieNode *trie_new(void) {
|
| 31 |
+
TrieNode *n = (TrieNode *)calloc(1, sizeof(TrieNode));
|
| 32 |
+
return n;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
/*
 * Release a node together with every node reachable through its child
 * links.  Passing NULL is allowed and does nothing.
 */
static inline void trie_free(TrieNode *n) {
    if (n != NULL) {
        int slot = 0;
        while (slot < 256) {
            trie_free(n->children[slot]);
            slot++;
        }
        free(n);
    }
}
|
| 41 |
+
|
| 42 |
+
/*
 * Record one occurrence of byte_val as a continuation of node n.
 *
 * An existing entry has its count incremented; otherwise a new entry with
 * count 1 is appended.  When all 64 slots are taken the new continuation
 * is silently dropped.
 */
static inline void trie_add_cont(TrieNode *n, int byte_val) {
    const int used = n->cont_count;
    int slot = 0;

    /* linear search for an existing entry */
    while (slot < used && n->cont_keys[slot] != byte_val)
        slot++;

    if (slot < used) {
        n->cont_vals[slot] += 1;
        return;
    }
    if (used >= 64)
        return; /* table full */

    n->cont_keys[used] = byte_val;
    n->cont_vals[used] = 1;
    n->cont_count = used + 1;
}
|
| 55 |
+
|
| 56 |
+
/* ── Word counts hash table ── */
|
| 57 |
+
/*
 * One slot of the word-count hash table.
 * A key of 0 marks an unused slot; otherwise count is the value
 * accumulated for that key.
 */
typedef struct {
    uint64_t key; /* hash of the word, 0 = empty slot */
    int count;    /* accumulated count for this key */
} WordCountEntry;
|
| 61 |
+
|
| 62 |
+
typedef struct {
|
| 63 |
+
WordCountEntry *entries;
|
| 64 |
+
uint32_t capacity;
|
| 65 |
+
uint32_t mask;
|
| 66 |
+
uint32_t used;
|
| 67 |
+
} WordCountHT;
|
| 68 |
+
|
| 69 |
+
/*
 * Set up an empty word-count table with `cap` zeroed slots.
 *
 * The index mask is cap - 1, so cap is expected to be a power of two.
 * t->entries is NULL afterwards if calloc fails.
 */
static inline void wcht_init(WordCountHT *t, uint32_t cap) {
    t->entries = (WordCountEntry *)calloc(cap, sizeof(WordCountEntry));
    t->used = 0;
    t->capacity = cap;
    t->mask = cap - 1;
}
|
| 75 |
+
|
| 76 |
+
/* Release the slot array of a word-count table; the struct itself is not freed. */
static inline void wcht_free(WordCountHT *t) {
    WordCountEntry *slots = t->entries;
    free(slots);
}
|
| 79 |
+
|
| 80 |
+
/*
 * 64-bit FNV-1a hash of the first `len` bytes of w.
 *
 * The result is never 0 (a computed 0 is replaced by 1) because the hash
 * tables in this file use key 0 to mark an empty slot.
 */
static inline uint64_t word_hash(const uint8_t *w, int len) {
    uint64_t acc = 14695981039346656037ULL; /* FNV offset basis */
    const uint8_t *cursor = w;
    int remaining = len;

    while (remaining > 0) {
        acc = (acc ^ *cursor) * 1099511628211ULL; /* xor, then FNV prime */
        cursor++;
        remaining--;
    }
    return (acc != 0) ? acc : 1;
}
|
| 89 |
+
|
| 90 |
+
/*
 * Double the capacity of a word-count table and re-insert every entry.
 *
 * The replacement array is built completely before the table is touched.
 * If the capacity cannot be doubled without wrapping 32 bits, or the
 * allocation fails, the process is aborted: this function has no way to
 * report an error, and carrying on with a table that can no longer grow
 * would eventually leave no empty slot, at which point the probe loops in
 * wcht_get()/wcht_add() would never terminate.
 */
static inline void wcht_grow(WordCountHT *t) {
    const uint32_t old_cap = t->capacity;
    WordCountEntry *old = t->entries;

    /* doubling must not wrap the 32-bit capacity */
    if (old_cap > UINT32_MAX / 2)
        abort();

    const uint32_t new_cap = old_cap * 2;
    const uint32_t new_mask = new_cap - 1;
    WordCountEntry *fresh =
        (WordCountEntry *)calloc(new_cap, sizeof(WordCountEntry));
    if (fresh == NULL)
        abort();

    uint32_t moved = 0;
    for (uint32_t i = 0; i < old_cap; i++) {
        if (old[i].key == 0)
            continue; /* empty slot */

        /* linear probing in the new array */
        uint32_t idx = (uint32_t)(old[i].key & new_mask);
        while (fresh[idx].key != 0)
            idx = (idx + 1) & new_mask;
        fresh[idx] = old[i];
        moved++;
    }

    t->entries = fresh;
    t->capacity = new_cap;
    t->mask = new_mask;
    t->used = moved;
    free(old);
}
|
| 108 |
+
|
| 109 |
+
/*
 * Look up the count stored for `key`.
 * Returns the stored count, or 0 when the probe sequence reaches an empty
 * slot without finding the key.
 */
static inline int wcht_get(WordCountHT *t, uint64_t key) {
    const WordCountEntry *slots = t->entries;
    const uint32_t wrap = t->mask;
    uint32_t pos = (uint32_t)(key & wrap);

    while (slots[pos].key != key) {
        if (slots[pos].key == 0)
            return 0; /* hit an empty slot: key is absent */
        pos = (pos + 1) & wrap;
    }
    return slots[pos].count;
}
|
| 117 |
+
|
| 118 |
+
/*
 * Add `delta` to the count stored for `key`, inserting the key with count
 * `delta` when it is not present yet.
 *
 * The table is grown first whenever more than 3/5 of the slots are in use.
 */
static inline void wcht_add(WordCountHT *t, uint64_t key, int delta) {
    if (t->used * 5 > t->capacity * 3)
        wcht_grow(t);

    uint32_t pos = (uint32_t)(key & t->mask);
    WordCountEntry *slot = &t->entries[pos];

    /* probe until the key or an empty slot is found */
    while (slot->key != key && slot->key != 0) {
        pos = (pos + 1) & t->mask;
        slot = &t->entries[pos];
    }

    if (slot->key == key) {
        slot->count += delta;
        return;
    }

    /* empty slot: claim it for this key */
    slot->key = key;
    slot->count = delta;
    t->used++;
}
|
| 135 |
+
|
| 136 |
+
/* ── Bigram table: word_hash → { byte → count } ── */
|
| 137 |
+
/*
 * One slot of the bigram table: per-byte counts recorded for one key.
 * A key of 0 marks an unused slot.
 */
typedef struct {
    uint64_t key;    /* hash of the preceding word, 0 = empty slot */
    int counts[256]; /* occurrences per byte value */
    int total;       /* number of recorded occurrences */
} BigramEntry;
|
| 142 |
+
|
| 143 |
+
typedef struct {
|
| 144 |
+
BigramEntry *entries;
|
| 145 |
+
uint32_t capacity;
|
| 146 |
+
uint32_t mask;
|
| 147 |
+
uint32_t used;
|
| 148 |
+
} BigramHT;
|
| 149 |
+
|
| 150 |
+
/*
 * Set up an empty bigram table with `cap` zeroed slots.
 *
 * The index mask is cap - 1, so cap is expected to be a power of two.
 * t->entries is NULL afterwards if calloc fails.
 */
static inline void bht_init(BigramHT *t, uint32_t cap) {
    t->entries = (BigramEntry *)calloc(cap, sizeof(BigramEntry));
    t->used = 0;
    t->capacity = cap;
    t->mask = cap - 1;
}
|
| 156 |
+
|
| 157 |
+
/* Release the slot array of a bigram table; the struct itself is not freed. */
static inline void bht_free(BigramHT *t) {
    BigramEntry *slots = t->entries;
    free(slots);
}
|
| 160 |
+
|
| 161 |
+
/*
 * Double the capacity of a bigram table and re-insert every entry.
 *
 * The replacement array is built completely before the table is touched.
 * If the capacity cannot be doubled without wrapping 32 bits, or the
 * allocation fails, the process is aborted: this function has no way to
 * report an error, and carrying on with a table that can no longer grow
 * would eventually leave no empty slot, at which point the probe loops
 * that search this table would never terminate.
 */
static inline void bht_grow(BigramHT *t) {
    const uint32_t old_cap = t->capacity;
    BigramEntry *old = t->entries;

    /* doubling must not wrap the 32-bit capacity */
    if (old_cap > UINT32_MAX / 2)
        abort();

    const uint32_t new_cap = old_cap * 2;
    const uint32_t new_mask = new_cap - 1;
    BigramEntry *fresh = (BigramEntry *)calloc(new_cap, sizeof(BigramEntry));
    if (fresh == NULL)
        abort();

    uint32_t moved = 0;
    for (uint32_t i = 0; i < old_cap; i++) {
        if (old[i].key == 0)
            continue; /* empty slot */

        /* linear probing in the new array */
        uint32_t idx = (uint32_t)(old[i].key & new_mask);
        while (fresh[idx].key != 0)
            idx = (idx + 1) & new_mask;
        fresh[idx] = old[i];
        moved++;
    }

    t->entries = fresh;
    t->capacity = new_cap;
    t->mask = new_mask;
    t->used = moved;
    free(old);
}
|
| 179 |
+
|
| 180 |
+
/*
 * Return the bigram entry for `key`, creating a zeroed one if necessary.
 *
 * The table is grown first whenever more than 3/5 of the slots are in use.
 * The returned pointer refers into the table's slot array.
 */
static inline BigramEntry *bht_get_or_create(BigramHT *t, uint64_t key) {
    if (t->used * 5 > t->capacity * 3)
        bht_grow(t);

    uint32_t pos = (uint32_t)(key & t->mask);
    BigramEntry *slot = &t->entries[pos];

    while (slot->key != key) {
        if (slot->key == 0) {
            /* empty slot: claim it and start from zeroed counts */
            slot->key = key;
            memset(slot->counts, 0, sizeof(slot->counts));
            slot->total = 0;
            t->used++;
            return slot;
        }
        pos = (pos + 1) & t->mask;
        slot = &t->entries[pos];
    }
    return slot;
}
|
| 195 |
+
|
| 196 |
+
/* ── Word Model ── */
|
| 197 |
+
|
| 198 |
+
typedef struct {
|
| 199 |
+
TrieNode *trie;
|
| 200 |
+
WordCountHT word_counts;
|
| 201 |
+
BigramHT bigrams;
|
| 202 |
+
|
| 203 |
+
uint8_t current_word[256];
|
| 204 |
+
int current_word_len;
|
| 205 |
+
|
| 206 |
+
uint8_t last_word[256];
|
| 207 |
+
int last_word_len;
|
| 208 |
+
int has_last_word;
|
| 209 |
+
|
| 210 |
+
int in_word;
|
| 211 |
+
double hits;
|
| 212 |
+
double attempts;
|
| 213 |
+
|
| 214 |
+
/* prediction cache to avoid double trie traversal */
|
| 215 |
+
double cached_probs[WORD_NSYM];
|
| 216 |
+
double cached_conf;
|
| 217 |
+
int cache_valid;
|
| 218 |
+
} WordModel;
|
| 219 |
+
|
| 220 |
+
/*
 * Initialise a word model: empty trie, empty hash tables, no current or
 * previous word, and an accuracy estimate of hits/attempts = 1/2.
 *
 * The prediction cache is fully initialised as well.  word_update() reads
 * cached_conf before any prediction has necessarily been made, so leaving
 * it unset would be a read of an uninitialised value.
 */
static inline void word_init(WordModel *w) {
    w->trie = trie_new();
    wcht_init(&w->word_counts, 4096);
    bht_init(&w->bigrams, 2048);

    w->current_word_len = 0;
    w->last_word_len = 0;
    w->has_last_word = 0;
    w->in_word = 0;

    w->hits = 1.0;
    w->attempts = 2.0;

    /* empty cache: no probabilities, zero confidence */
    memset(w->cached_probs, 0, sizeof(w->cached_probs));
    w->cached_conf = 0.0;
    w->cache_valid = 0;
}
|
| 232 |
+
|
| 233 |
+
/*
 * Release everything owned by a word model: both hash tables and the
 * trie.  The WordModel struct itself is not freed.
 */
static inline void word_free(WordModel *w) {
    bht_free(&w->bigrams);
    wcht_free(&w->word_counts);
    trie_free(w->trie);
}
|
| 238 |
+
|
| 239 |
+
/*
 * Record one occurrence of a word.
 *
 * The word's hash count is incremented, then the word is walked into the
 * trie: missing nodes are created, and every node on the path records the
 * byte that follows it in this word as a continuation.
 *
 * If a trie node cannot be allocated (or the model has no root node), the
 * insertion stops at that point.  The trie stays consistent because
 * lookups already treat a missing child as "no prediction".
 */
static inline void word_add_to_trie(WordModel *w, const uint8_t *word, int len) {
    wcht_add(&w->word_counts, word_hash(word, len), 1);

    TrieNode *node = w->trie;
    for (int i = 0; node != NULL && i < len; i++) {
        const int b = word[i];
        TrieNode *child = node->children[b];

        if (child == NULL) {
            child = trie_new();
            if (child == NULL)
                return; /* out of memory: keep what was inserted so far */
            node->children[b] = child;
        }
        if (i + 1 < len)
            trie_add_cont(child, word[i + 1]);
        node = child;
    }
}
|
| 254 |
+
|
| 255 |
+
/* Get continuations for a prefix. Returns count of distinct continuations.
|
| 256 |
+
Fills keys[] and vals[] arrays. */
|
| 257 |
+
static inline int word_get_continuations(WordModel *w, const uint8_t *prefix,
|
| 258 |
+
int prefix_len, int *keys, int *vals) {
|
| 259 |
+
if (prefix_len == 0) return 0;
|
| 260 |
+
TrieNode *node = w->trie;
|
| 261 |
+
for (int i = 0; i < prefix_len; i++) {
|
| 262 |
+
int b = prefix[i];
|
| 263 |
+
if (!node->children[b]) return 0;
|
| 264 |
+
TrieNode *entry = node->children[b];
|
| 265 |
+
if (i == prefix_len - 1) {
|
| 266 |
+
int n = entry->cont_count;
|
| 267 |
+
for (int j = 0; j < n; j++) {
|
| 268 |
+
keys[j] = entry->cont_keys[j];
|
| 269 |
+
vals[j] = entry->cont_vals[j];
|
| 270 |
+
}
|
| 271 |
+
return n;
|
| 272 |
+
}
|
| 273 |
+
node = entry;
|
| 274 |
+
}
|
| 275 |
+
return 0;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
/*
|
| 279 |
+
* predict: fills probs[256] if prediction available.
|
| 280 |
+
* Returns 1 with confidence in *out_conf, or 0 if no prediction.
|
| 281 |
+
*/
|
| 282 |
+
static inline int word_predict(WordModel *w, double *probs, double *out_conf) {
|
| 283 |
+
static const int boundary_chars[] = {32, 10, 13, 44, 46, 59, 58, 33, 63, 41, 93};
|
| 284 |
+
static const int n_boundary = 11;
|
| 285 |
+
|
| 286 |
+
if (w->in_word && w->current_word_len >= 1) {
|
| 287 |
+
int keys[64], vals[64];
|
| 288 |
+
int nc = word_get_continuations(w, w->current_word,
|
| 289 |
+
w->current_word_len, keys, vals);
|
| 290 |
+
if (nc > 0) {
|
| 291 |
+
memset(probs, 0, WORD_NSYM * sizeof(double));
|
| 292 |
+
int total = 0;
|
| 293 |
+
for (int i = 0; i < nc; i++) total += vals[i];
|
| 294 |
+
double inv_total = 1.0 / total;
|
| 295 |
+
for (int i = 0; i < nc; i++)
|
| 296 |
+
probs[keys[i]] += vals[i] * inv_total;
|
| 297 |
+
|
| 298 |
+
/* word boundary probability */
|
| 299 |
+
uint64_t wh = word_hash(w->current_word, w->current_word_len);
|
| 300 |
+
int wcount = wcht_get(&w->word_counts, wh);
|
| 301 |
+
if (wcount > 0) {
|
| 302 |
+
double bw = (double)wcount / (wcount + total);
|
| 303 |
+
for (int i = 0; i < WORD_NSYM; i++)
|
| 304 |
+
probs[i] *= (1.0 - bw);
|
| 305 |
+
for (int i = 0; i < n_boundary; i++)
|
| 306 |
+
probs[boundary_chars[i]] += bw / n_boundary;
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
int plen = w->current_word_len;
|
| 310 |
+
double confidence = (plen / 3.0 < 1.0 ? plen / 3.0 : 1.0);
|
| 311 |
+
double cont_factor = nc * 0.5;
|
| 312 |
+
if (cont_factor > 1.0) cont_factor = 1.0;
|
| 313 |
+
confidence *= cont_factor;
|
| 314 |
+
confidence *= (w->hits / w->attempts);
|
| 315 |
+
|
| 316 |
+
double sum = 0.0;
|
| 317 |
+
for (int i = 0; i < WORD_NSYM; i++) sum += probs[i];
|
| 318 |
+
if (sum > 0.0) {
|
| 319 |
+
double inv = 1.0 / sum;
|
| 320 |
+
for (int i = 0; i < WORD_NSYM; i++) probs[i] *= inv;
|
| 321 |
+
*out_conf = confidence;
|
| 322 |
+
return 1;
|
| 323 |
+
}
|
| 324 |
+
}
|
| 325 |
+
} else if (!w->in_word && w->has_last_word) {
|
| 326 |
+
uint64_t wh = word_hash(w->last_word, w->last_word_len);
|
| 327 |
+
BigramEntry *be = NULL;
|
| 328 |
+
/* look up without creating */
|
| 329 |
+
uint32_t idx = (uint32_t)(wh & w->bigrams.mask);
|
| 330 |
+
for (;;) {
|
| 331 |
+
if (w->bigrams.entries[idx].key == wh) {
|
| 332 |
+
be = &w->bigrams.entries[idx];
|
| 333 |
+
break;
|
| 334 |
+
}
|
| 335 |
+
if (w->bigrams.entries[idx].key == 0) break;
|
| 336 |
+
idx = (idx + 1) & w->bigrams.mask;
|
| 337 |
+
}
|
| 338 |
+
if (be && be->total > 0) {
|
| 339 |
+
memset(probs, 0, WORD_NSYM * sizeof(double));
|
| 340 |
+
double inv = 1.0 / be->total;
|
| 341 |
+
for (int i = 0; i < WORD_NSYM; i++)
|
| 342 |
+
if (be->counts[i] > 0)
|
| 343 |
+
probs[i] = be->counts[i] * inv;
|
| 344 |
+
|
| 345 |
+
double confidence = (be->total / 5.0 < 1.0 ? be->total / 5.0 : 1.0);
|
| 346 |
+
confidence *= 0.3 * (w->hits / w->attempts);
|
| 347 |
+
|
| 348 |
+
double sum = 0.0;
|
| 349 |
+
for (int i = 0; i < WORD_NSYM; i++) sum += probs[i];
|
| 350 |
+
if (sum > 0.0) {
|
| 351 |
+
double inv2 = 1.0 / sum;
|
| 352 |
+
for (int i = 0; i < WORD_NSYM; i++) probs[i] *= inv2;
|
| 353 |
+
*out_conf = confidence;
|
| 354 |
+
return 1;
|
| 355 |
+
}
|
| 356 |
+
}
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
*out_conf = 0.0;
|
| 360 |
+
return 0;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
/* Predict with caching: compute once, reuse in update */
|
| 364 |
+
static inline int word_predict_cached(WordModel *w, double *probs, double *out_conf) {
|
| 365 |
+
if (w->cache_valid) {
|
| 366 |
+
memcpy(probs, w->cached_probs, sizeof(w->cached_probs));
|
| 367 |
+
*out_conf = w->cached_conf;
|
| 368 |
+
return (*out_conf > 0.0) ? 1 : 0;
|
| 369 |
+
}
|
| 370 |
+
int ret = word_predict(w, probs, out_conf);
|
| 371 |
+
if (ret) {
|
| 372 |
+
memcpy(w->cached_probs, probs, sizeof(w->cached_probs));
|
| 373 |
+
w->cached_conf = *out_conf;
|
| 374 |
+
} else {
|
| 375 |
+
w->cached_conf = 0.0;
|
| 376 |
+
}
|
| 377 |
+
w->cache_valid = 1;
|
| 378 |
+
return ret;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
/*
 * Feed the next byte into the word model.
 *
 * First the cached prediction (if any, with confidence above 0.01) is
 * scored against the byte to maintain the hits/attempts estimate.  Then
 * the word state advances: a word character extends the current word (and
 * at the start of a word records a bigram from the previous word to this
 * byte); any other byte ends the current word, which becomes the last
 * word and, when at least two bytes long, is added to the trie.
 * The prediction cache is invalidated on every call.
 */
static inline void word_update(WordModel *w, uint8_t byte_val) {
    /* track accuracy using cached prediction */
    if (w->cache_valid && w->cached_conf > 0.01) {
        w->attempts += 1.0;
        if (w->cached_probs[byte_val] > 0.05)
            w->hits += 1.0;
        if (w->attempts > 500.0) {
            /* decay both counters so old history fades */
            w->hits *= 0.99;
            w->attempts *= 0.99;
        }
    }

    if (!is_word_char(byte_val)) {
        /* boundary byte: close the word in progress, if there is one */
        if (w->in_word) {
            if (w->current_word_len >= 2)
                word_add_to_trie(w, w->current_word, w->current_word_len);
            w->last_word_len = w->current_word_len;
            memcpy(w->last_word, w->current_word, w->current_word_len);
            w->has_last_word = 1;
        }
        w->in_word = 0;
        w->current_word_len = 0;
    } else {
        if (!w->in_word) {
            w->current_word_len = 0;
            w->in_word = 1;
            /* bigram: last_word -> first byte of new word */
            if (w->has_last_word) {
                const uint64_t prev_hash = word_hash(w->last_word, w->last_word_len);
                BigramEntry *entry = bht_get_or_create(&w->bigrams, prev_hash);
                entry->counts[byte_val]++;
                entry->total++;
            }
        }
        /* bytes beyond position 254 of a word are not stored */
        if (w->current_word_len < 255) {
            w->current_word[w->current_word_len] = byte_val;
            w->current_word_len++;
        }
    }

    w->cache_valid = 0; /* invalidate cache after state change */
}
|
| 426 |
+
|
| 427 |
+
static inline void blend_word_model(double *probs, const double *word_probs,
|
| 428 |
+
double word_confidence) {
|
| 429 |
+
if (word_confidence < 0.01) return;
|
| 430 |
+
double weight = word_confidence * 0.35;
|
| 431 |
+
if (weight > 0.45) weight = 0.45;
|
| 432 |
+
double sum = 0.0;
|
| 433 |
+
for (int i = 0; i < WORD_NSYM; i++) {
|
| 434 |
+
probs[i] = probs[i] * (1.0 - weight) + word_probs[i] * weight;
|
| 435 |
+
if (probs[i] < 1e-10) probs[i] = 1e-10;
|
| 436 |
+
sum += probs[i];
|
| 437 |
+
}
|
| 438 |
+
double inv = 1.0 / sum;
|
| 439 |
+
for (int i = 0; i < WORD_NSYM; i++)
|
| 440 |
+
probs[i] *= inv;
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
#endif /* WORD_H */
|