From 48e99a440c84bd230ddc6ea72d57aa073663bd26 Mon Sep 17 00:00:00 2001 From: pHash Audit Date: Mon, 25 May 2026 10:58:10 -0700 Subject: [PATCH 1/2] Fix audio hash leaks, threading, and resource bugs Addresses audit findings #7, #8, #9, #13, #14, #19, #20, #21, #29. Correctness: - #7/#8 Removed the entire HAVE_PTHREAD block (ph_audio_thread, ph_audio_hashes). It referenced types `slice` and `DP` and a function `ph_num_threads()` that were never declared anywhere in the public headers (so the block could not actually be compiled), the worker function had no return statement (UB on the void* return), and the slicing math walked one element past the input (count=10, threads=3 gave 5+3+3=11). Callers should drive ph_readaudio / ph_audiohash from their own thread pool. - #9 ph_readaudio2 now reads src_data.output_frames_gen (the count libsamplerate actually produced) instead of output_frames (the caller-supplied capacity), eliminating the downstream over-read. - ph_audiohash: corrected the nb_frames formula. The old expression could go negative for short inputs and was then passed to malloc. Memory and safety: - #13 readaudio_mp3: every error path now closes/deletes the mpg123_handle and calls mpg123_exit. Adds NULL checks on the malloc'd output, validates channels and samplerate (channels==0 would have produced an infinite loop), clamps the per-encoding inner loop so a decode that returns fewer bytes than `channels` cannot over-read the decode buffer. - #14 readaudio_snd: validates channels/samplerate/frames, checks every malloc, and frees inbuf on read failure. ph_audiohash's malloc is now checked and frame_length-sized buffers are std::vectors (no more VLAs). - #20/#21 Replaced double window[4096], double frame[4096], double magnF[2048], freqs/binbarks VLAs, and the per-filter new[] grid with std::vector. ph_audio_distance_ber preallocates `dist` once outside the per-lag loop instead of realloc'ing on every iteration; the worst-case M is just N1/block_size. - ph_compare_blocks now returns 1.0 (worst distance) rather than dividing by zero when block_size <= 0. - ph_audio_distance_ber and ph_audiohash zero their out parameters on every failure path; the previous code left Nc / nb_frames uninitialised when allocation failed. Performance: - #19 ph_bitcount uses __builtin_popcount, matching how ph_hamming_distance does its bit counting. Header hygiene (#29): - ph_fft.h includes (C++) instead of (C99) and drops the `using namespace std;` that was being injected into every translation unit that included it. fft() takes std::complex*. - fft() now validates that N is a power of 2 (the radix-2 recursion silently misbehaves otherwise). --- src/audiophash.cpp | 585 +++++++++++++++++++++++---------------------- src/ph_fft.cpp | 33 +-- src/ph_fft.h | 10 +- 3 files changed, 322 insertions(+), 306 deletions(-) diff --git a/src/audiophash.cpp b/src/audiophash.cpp index 3df017a2..08cfaf54 100644 --- a/src/audiophash.cpp +++ b/src/audiophash.cpp @@ -23,20 +23,23 @@ */ #include "audiophash.h" + #include #include + +#include #ifdef HAVE_LIBMPG123 #include #endif -int ph_count_samples(const char *filename, int sr, int channels) { +int ph_count_samples(const char *filename, int /*sr*/, int /*channels*/) { SF_INFO sf_info; sf_info.format = 0; SNDFILE *sndfile = sf_open(filename, SFM_READ, &sf_info); if (sndfile == NULL) { return -1; } - int count = sf_info.frames; + int count = (int)sf_info.frames; sf_close(sndfile); return count; } @@ -45,174 +48,243 @@ int ph_count_samples(const char *filename, int sr, int channels) { static float *readaudio_mp3(const char *filename, long *sr, const float nbsecs, unsigned int *buflen) { - mpg123_handle *m; + mpg123_handle *m = NULL; + unsigned char *decbuf = NULL; + float *buffer = NULL; + bool mpg_inited = false; int ret; - if (mpg123_init() != MPG123_OK || ((m = mpg123_new(NULL, &ret)) == NULL) || - mpg123_open(m, filename) != MPG123_OK) { + *buflen = 0; + + if (mpg123_init() != MPG123_OK) { fprintf(stderr, "unable to init mpg\n"); return NULL; } + mpg_inited = true; - /*turn off logging */ - mpg123_param(m, MPG123_ADD_FLAGS, MPG123_QUIET, 0); + m = mpg123_new(NULL, &ret); + if (m == NULL) { + fprintf(stderr, "unable to init mpg\n"); + goto fail; + } - off_t totalsamples; + if (mpg123_open(m, filename) != MPG123_OK) { + fprintf(stderr, "unable to open mp3\n"); + goto fail; + } + mpg123_param(m, MPG123_ADD_FLAGS, MPG123_QUIET, 0); mpg123_scan(m); - totalsamples = mpg123_length(m); - - int meta = mpg123_meta_check(m); - int channels, encoding; + { + off_t totalsamples = mpg123_length(m); + if (totalsamples <= 0) goto fail; - if (mpg123_getformat(m, sr, &channels, &encoding) != MPG123_OK) { - fprintf(stderr, "unable to get format\n"); - return NULL; - } - - mpg123_format_none(m); - mpg123_format(m, *sr, channels, encoding); - - size_t decbuflen = mpg123_outblock(m); - unsigned char *decbuf = (unsigned char *)malloc(decbuflen); - if (decbuf == NULL) { - printf("mem alloc error\n"); - return NULL; - } - - unsigned int nbsamples = (nbsecs <= 0) ? totalsamples : nbsecs * (*sr); - nbsamples = (nbsamples < totalsamples) ? nbsamples : totalsamples; + int channels = 0, encoding = 0; + if (mpg123_getformat(m, sr, &channels, &encoding) != MPG123_OK) { + fprintf(stderr, "unable to get format\n"); + goto fail; + } + if (channels <= 0 || *sr <= 0) goto fail; - size_t i, j, index = 0, done; + mpg123_format_none(m); + if (mpg123_format(m, *sr, channels, encoding) != MPG123_OK) goto fail; - float *buffer = (float *)malloc(nbsamples * sizeof(float)); - *buflen = nbsamples; + size_t decbuflen = mpg123_outblock(m); + decbuf = (unsigned char *)malloc(decbuflen); + if (decbuf == NULL) { + fprintf(stderr, "mem alloc error\n"); + goto fail; + } - do { - ret = mpg123_read(m, decbuf, decbuflen, &done); - switch (encoding) { - case MPG123_ENC_SIGNED_16: - for (i = 0; i < done / sizeof(short); i += channels) { - buffer[index] = 0.0f; - for (j = 0; j < channels; j++) { - buffer[index] += - (float)(((short *)decbuf)[i + j]) / (float)SHRT_MAX; + unsigned int nbsamples = + (nbsecs <= 0) ? (unsigned int)totalsamples + : (unsigned int)(nbsecs * (*sr)); + if ((off_t)nbsamples > totalsamples) { + nbsamples = (unsigned int)totalsamples; + } + if (nbsamples == 0) goto fail; + + buffer = (float *)malloc((size_t)nbsamples * sizeof(float)); + if (buffer == NULL) goto fail; + *buflen = nbsamples; + + size_t i, j, index = 0, done = 0; + do { + ret = mpg123_read(m, decbuf, decbuflen, &done); + switch (encoding) { + case MPG123_ENC_SIGNED_16: { + size_t samples = done / sizeof(short); + if (samples < (size_t)channels) break; + size_t limit = samples - (size_t)channels + 1; + for (i = 0; i < limit; i += channels) { + float acc = 0.0f; + for (j = 0; j < (size_t)channels; j++) { + acc += (float)(((short *)decbuf)[i + j]) / + (float)SHRT_MAX; + } + buffer[index++] = acc / channels; + if (index >= nbsamples) break; } - buffer[index++] /= channels; - if (index >= nbsamples) break; - } - break; - case MPG123_ENC_SIGNED_8: - for (i = 0; i < done / sizeof(char); i += channels) { - buffer[index] = 0.0f; - for (j = 0; j < channels; j++) { - buffer[index] += - (float)(((char *)decbuf)[i + j]) / (float)SCHAR_MAX; + } break; + case MPG123_ENC_SIGNED_8: { + size_t samples = done / sizeof(char); + if (samples < (size_t)channels) break; + size_t limit = samples - (size_t)channels + 1; + for (i = 0; i < limit; i += channels) { + float acc = 0.0f; + for (j = 0; j < (size_t)channels; j++) { + acc += (float)(((char *)decbuf)[i + j]) / + (float)SCHAR_MAX; + } + buffer[index++] = acc / channels; + if (index >= nbsamples) break; } - buffer[index++] /= channels; - if (index >= nbsamples) break; - } - break; - case MPG123_ENC_FLOAT_32: - for (i = 0; i < done / sizeof(float); i += channels) { - buffer[index] = 0.0f; - for (j = 0; j < channels; j++) { - buffer[index] += ((float *)decbuf)[i + j]; + } break; + case MPG123_ENC_FLOAT_32: { + size_t samples = done / sizeof(float); + if (samples < (size_t)channels) break; + size_t limit = samples - (size_t)channels + 1; + for (i = 0; i < limit; i += channels) { + float acc = 0.0f; + for (j = 0; j < (size_t)channels; j++) { + acc += ((float *)decbuf)[i + j]; + } + buffer[index++] = acc / channels; + if (index >= nbsamples) break; } - buffer[index++] /= channels; - if (index >= nbsamples) break; - } - break; - default: - done = 0; - } + } break; + default: + done = 0; + } + } while (ret == MPG123_OK && index < nbsamples); - } while (ret == MPG123_OK && index < nbsamples); + if (index == 0) { + free(buffer); + buffer = NULL; + *buflen = 0; + goto fail; + } + *buflen = (unsigned int)index; + } free(decbuf); mpg123_close(m); mpg123_delete(m); mpg123_exit(); - return buffer; + +fail: + free(decbuf); + if (m) { + mpg123_close(m); + mpg123_delete(m); + } + if (mpg_inited) mpg123_exit(); + return buffer ? buffer : NULL; } #endif /*HAVE_LIBMPG123*/ static float *readaudio_snd(const char *filename, long *sr, const float nbsecs, unsigned int *buflen) { + *buflen = 0; SF_INFO sf_info; sf_info.format = 0; SNDFILE *sndfile = sf_open(filename, SFM_READ, &sf_info); if (sndfile == NULL) { return NULL; } + if (sf_info.channels <= 0 || sf_info.samplerate <= 0 || + sf_info.frames <= 0) { + sf_close(sndfile); + return NULL; + } - /* normalize */ sf_command(sndfile, SFC_SET_NORM_FLOAT, NULL, SF_TRUE); *sr = (long)sf_info.samplerate; - // allocate input buffer for signal unsigned int src_frames = - (nbsecs <= 0) ? sf_info.frames : (nbsecs * sf_info.samplerate); - src_frames = (sf_info.frames < src_frames) ? sf_info.frames : src_frames; - float *inbuf = - (float *)malloc(src_frames * sf_info.channels * sizeof(float)); + (nbsecs <= 0) ? (unsigned int)sf_info.frames + : (unsigned int)(nbsecs * sf_info.samplerate); + if ((sf_count_t)src_frames > sf_info.frames) { + src_frames = (unsigned int)sf_info.frames; + } + if (src_frames == 0) { + sf_close(sndfile); + return NULL; + } + + float *inbuf = (float *)malloc((size_t)src_frames * + (size_t)sf_info.channels * sizeof(float)); + if (inbuf == NULL) { + sf_close(sndfile); + return NULL; + } - /*read frames */ sf_count_t cnt_frames = sf_readf_float(sndfile, inbuf, src_frames); + sf_close(sndfile); + if (cnt_frames <= 0) { + free(inbuf); + return NULL; + } - float *buf = (float *)malloc(cnt_frames * sizeof(float)); + float *buf = (float *)malloc((size_t)cnt_frames * sizeof(float)); + if (buf == NULL) { + free(inbuf); + return NULL; + } - // average across all channels - int i, j, indx = 0; - for (i = 0; i < cnt_frames * sf_info.channels; i += sf_info.channels) { - buf[indx] = 0; - for (j = 0; j < sf_info.channels; j++) { - buf[indx] += inbuf[i + j]; + int indx = 0; + for (sf_count_t i = 0; i < cnt_frames * sf_info.channels; + i += sf_info.channels) { + float acc = 0.0f; + for (int j = 0; j < sf_info.channels; j++) { + acc += inbuf[i + j]; } - buf[indx++] /= sf_info.channels; + buf[indx++] = acc / sf_info.channels; } free(inbuf); - *buflen = indx; + *buflen = (unsigned int)indx; return buf; } -float *ph_readaudio2(const char *filename, int sr, float *sigbuf, int &buflen, - const float nbsecs) { - long orig_sr; - float *inbuffer = NULL; - unsigned int inbufferlength; +float *ph_readaudio2(const char *filename, int sr, float * /*sigbuf*/, + int &buflen, const float nbsecs) { buflen = 0; + if (filename == NULL || sr <= 0) return NULL; + + long orig_sr = 0; + float *inbuffer = NULL; + unsigned int inbufferlength = 0; const char *suffix = strrchr(filename, '.'); if (suffix == NULL) return NULL; if (!strcasecmp(suffix + 1, "mp3")) { #ifdef HAVE_LIBMPG123 inbuffer = readaudio_mp3(filename, &orig_sr, nbsecs, &inbufferlength); -#endif /* HAVE_LIBMPG123 */ +#endif } else { inbuffer = readaudio_snd(filename, &orig_sr, nbsecs, &inbufferlength); } - if (inbuffer == NULL) { + if (inbuffer == NULL || inbufferlength == 0 || orig_sr <= 0) { + free(inbuffer); return NULL; } - /* resample float array */ - /* set desired sr ratio */ double sr_ratio = (double)(sr) / (double)orig_sr; if (src_is_valid_ratio(sr_ratio) == 0) { free(inbuffer); return NULL; } - /* allocate output buffer for conversion */ - unsigned int outbufferlength = sr_ratio * inbufferlength; - float *outbuffer = (float *)malloc(outbufferlength * sizeof(float)); + unsigned int outbufferlength = + (unsigned int)(sr_ratio * (double)inbufferlength) + 1; + float *outbuffer = + (float *)malloc((size_t)outbufferlength * sizeof(float)); if (!outbuffer) { free(inbuffer); return NULL; @@ -234,7 +306,6 @@ float *ph_readaudio2(const char *filename, int sr, float *sigbuf, int &buflen, src_data.end_of_input = SF_TRUE; src_data.src_ratio = sr_ratio; - /* sample rate conversion */ if ((error = src_process(src_state, &src_data)) != 0) { free(inbuffer); free(outbuffer); @@ -242,115 +313,117 @@ float *ph_readaudio2(const char *filename, int sr, float *sigbuf, int &buflen, return NULL; } - buflen = src_data.output_frames; + // libsamplerate fills output_frames_gen with the count actually produced; + // output_frames is the capacity the caller supplied. Using the latter + // (the original bug) over-reads outbuffer downstream. + buflen = (int)src_data.output_frames_gen; src_delete(src_state); free(inbuffer); + if (buflen <= 0) { + free(outbuffer); + return NULL; + } + return outbuffer; } -float *ph_readaudio(const char *filename, int sr, int channels, float *sigbuf, - int &buflen, const float nbsecs) { +float *ph_readaudio(const char *filename, int sr, int /*channels*/, + float *sigbuf, int &buflen, const float nbsecs) { if (!filename || sr <= 0) return NULL; return ph_readaudio2(filename, sr, sigbuf, buflen, nbsecs); } uint32_t *ph_audiohash(float *buf, int N, int sr, int &nb_frames) { - int frame_length = 4096; // 2^12 - int nfft = frame_length; - int nfft_half = 2048; - int start = 0; - int end = start + frame_length - 1; - int overlap = (int)(31 * frame_length / 32); - int advance = frame_length - overlap; - int index = 0; - nb_frames = (int)(floor(N / advance) - floor(frame_length / advance) + 1); - double window[frame_length]; - for (int i = 0; i < frame_length; i++) { - // hamming window - window[i] = 0.54 - 0.46 * cos(2 * M_PI * i / (frame_length - 1)); + nb_frames = 0; + const int frame_length = 4096; + const int nfft = frame_length; + const int nfft_half = 2048; + if (buf == NULL || N < frame_length || sr <= 0) return NULL; + + const int overlap = 31 * frame_length / 32; + const int advance = frame_length - overlap; + if (advance <= 0) return NULL; + + nb_frames = (N - frame_length) / advance + 1; + if (nb_frames <= 0) { + nb_frames = 0; + return NULL; } - double frame[frame_length]; - complex *pF = new complex[nfft]; - - double magnF[nfft_half]; - double maxF = 0.0; - double maxB = 0.0; - - double minfreq = 300; - double maxfreq = 3000; - double minbark = 6 * asinh(minfreq / 600.0); - double maxbark = 6 * asinh(maxfreq / 600.0); - double nyqbark = maxbark - minbark; - int nfilts = 33; - double stepbarks = nyqbark / (nfilts - 1); - int nb_barks = (int)(floor(nfft_half / 2 + 1)); - double barkwidth = 1.06; - - double freqs[nb_barks]; - double binbarks[nb_barks]; - double curr_bark[nfilts]; - double prev_bark[nfilts]; - for (int i = 0; i < nfilts; i++) { - prev_bark[i] = 0.0; + std::vector window(frame_length); + for (int i = 0; i < frame_length; i++) { + window[i] = 0.54 - 0.46 * cos(2 * M_PI * i / (frame_length - 1)); } - uint32_t *hash = (uint32_t *)malloc(nb_frames * sizeof(uint32_t)); - double lof, hif; + std::vector frame(frame_length); + std::vector> pF(nfft); + std::vector magnF(nfft_half); + + const double minfreq = 300; + const double maxfreq = 3000; + const double minbark = 6 * asinh(minfreq / 600.0); + const double maxbark = 6 * asinh(maxfreq / 600.0); + const double nyqbark = maxbark - minbark; + const int nfilts = 33; + const double stepbarks = nyqbark / (nfilts - 1); + const int nb_barks = nfft_half / 2 + 1; + const double barkwidth = 1.06; + + std::vector binbarks(nb_barks); + std::vector curr_bark(nfilts, 0.0); + std::vector prev_bark(nfilts, 0.0); for (int i = 0; i < nb_barks; i++) { - binbarks[i] = 6 * asinh(i * sr / nfft_half / 600.0); - freqs[i] = i * sr / nfft_half; - } - double **wts = new double *[nfilts]; - for (int i = 0; i < nfilts; i++) { - wts[i] = new double[nfft_half]; - } - for (int i = 0; i < nfilts; i++) { - for (int j = 0; j < nfft_half; j++) { - wts[i][j] = 0.0; - } + binbarks[i] = 6 * asinh((double)i * sr / nfft_half / 600.0); } - // calculate wts for each filter + // wts[i*nfft_half + j] + std::vector wts((size_t)nfilts * (size_t)nfft_half, 0.0); for (int i = 0; i < nfilts; i++) { double f_bark_mid = minbark + i * stepbarks; for (int j = 0; j < nb_barks; j++) { double barkdiff = binbarks[j] - f_bark_mid; - lof = -2.5 * (barkdiff / barkwidth - 0.5); - hif = barkdiff / barkwidth + 0.5; + double lof = -2.5 * (barkdiff / barkwidth - 0.5); + double hif = barkdiff / barkwidth + 0.5; double m = std::min(lof, hif); m = std::min(0.0, m); m = pow(10, m); - wts[i][j] = m; + wts[(size_t)i * nfft_half + j] = m; } } - // p = fftw_plan_dft_r2c_1d(frame_length,frame,pF,FFTW_ESTIMATE); + uint32_t *hash = (uint32_t *)malloc((size_t)nb_frames * sizeof(uint32_t)); + if (hash == NULL) { + nb_frames = 0; + return NULL; + } - while (end < N) { - maxF = 0.0; - maxB = 0.0; + int start = 0; + int end = start + frame_length - 1; + int index = 0; + + while (end < N && index < nb_frames) { + double maxF = 0.0; + double maxB = 0.0; for (int i = 0; i < frame_length; i++) { frame[i] = window[i] * buf[start + i]; } - // fftw_execute(p); - if (fft(frame, frame_length, pF) < 0) { + if (fft(frame.data(), frame_length, pF.data()) < 0) { + free(hash); + nb_frames = 0; return NULL; } for (int i = 0; i < nfft_half; i++) { - // magnF[i] = sqrt(pF[i][0]*pF[i][0] + pF[i][1]*pF[i][1] ); - magnF[i] = abs(pF[i]); - if (magnF[i] > maxF) { - maxF = magnF[i]; - } + magnF[i] = std::abs(pF[i]); + if (magnF[i] > maxF) maxF = magnF[i]; } for (int i = 0; i < nfilts; i++) { curr_bark[i] = 0; + const double *wts_row = &wts[(size_t)i * nfft_half]; for (int j = 0; j < nfft_half; j++) { - curr_bark[i] += wts[i][j] * magnF[j]; + curr_bark[i] += wts_row[j] * magnF[j]; } if (curr_bark[i] > maxB) maxB = curr_bark[i]; } @@ -363,51 +436,43 @@ uint32_t *ph_audiohash(float *buf, int N, int sr, int &nb_frames) { if (H > 0) curr_hash |= 0x00000001; } - hash[index] = curr_hash; - for (int i = 0; i < nfilts; i++) { - prev_bark[i] = curr_bark[i]; - } - index += 1; + hash[index++] = curr_hash; + for (int i = 0; i < nfilts; i++) prev_bark[i] = curr_bark[i]; start += advance; end += advance; } - // fftw_destroy_plan(p); - // fftw_free(pF); - delete[] pF; - for (int i = 0; i < nfilts; i++) { - delete[] wts[i]; - } - delete[] wts; + nb_frames = index; return hash; } int ph_bitcount(uint32_t n) { -// parallel bit count -#define MASK_01010101 (((uint32_t)(-1)) / 3) -#define MASK_00110011 (((uint32_t)(-1)) / 5) -#define MASK_00001111 (((uint32_t)(-1)) / 17) - - n = (n & MASK_01010101) + ((n >> 1) & MASK_01010101); - n = (n & MASK_00110011) + ((n >> 2) & MASK_00110011); - n = (n & MASK_00001111) + ((n >> 4) & MASK_00001111); - return n % 255; + return __builtin_popcount(n); } double ph_compare_blocks(const uint32_t *ptr_blockA, const uint32_t *ptr_blockB, const int block_size) { + if (block_size <= 0) return 1.0; double result = 0; for (int i = 0; i < block_size; i++) { uint32_t xordhash = ptr_blockA[i] ^ ptr_blockB[i]; result += ph_bitcount(xordhash); } - result = result / (32 * block_size); + result = result / (32.0 * block_size); return result; } + double *ph_audio_distance_ber(uint32_t *hash_a, const int Na, uint32_t *hash_b, const int Nb, const float threshold, const int block_size, int &Nc) { - uint32_t *ptrA, *ptrB; + Nc = 0; + if (hash_a == NULL || hash_b == NULL || Na <= 0 || Nb <= 0 || + block_size <= 0) { + return NULL; + } + + uint32_t *ptrA; + uint32_t *ptrB; int N1, N2; if (Na <= Nb) { ptrA = hash_a; @@ -423,124 +488,74 @@ double *ph_audio_distance_ber(uint32_t *hash_a, const int Na, uint32_t *hash_b, N2 = Na; } - double *pC = new double[Nc]; - if (!pC) return NULL; - int k, M, nb_above, nb_below, hash1_index, hash2_index; - double sum_above, sum_below, above_factor, below_factor; + double *pC = (double *)malloc((size_t)Nc * sizeof(double)); + if (!pC) { + Nc = 0; + return NULL; + } - uint32_t *pha, *phb; - double *dist = NULL; + // Worst-case M occurs at i=0 (smallest constraint): floor(N1/block_size). + // Allocate once outside the loop instead of realloc'ing per iteration. + int max_M = N1 / block_size; + if (max_M < 1) max_M = 1; + double *dist = (double *)malloc((size_t)max_M * sizeof(double)); + if (!dist) { + free(pC); + Nc = 0; + return NULL; + } for (int i = 0; i < Nc; i++) { - M = (int)floor(std::min(N1, N2 - i) / block_size); - - pha = ptrA; - phb = ptrB + i; - - double *tmp_dist = (double *)realloc(dist, M * sizeof(double)); - if (!tmp_dist) { - return NULL; + int M = std::min(N1, N2 - i) / block_size; + if (M <= 0) { + pC[i] = 0.5; + continue; } - dist = tmp_dist; - dist[0] = ph_compare_blocks(pha, phb, block_size); - k = 1; + uint32_t *pha = ptrA; + uint32_t *phb = ptrB + i; + int hash1_index = 0; + int hash2_index = i; + int k = 0; - pha += block_size; - phb += block_size; - - hash1_index = block_size; - hash2_index = i + block_size; - - while ((hash1_index < N1 - block_size) && - (hash2_index < N2 - block_size)) { + while (k < M && hash1_index + block_size <= N1 && + hash2_index + block_size <= N2) { dist[k++] = ph_compare_blocks(pha, phb, block_size); hash1_index += block_size; hash2_index += block_size; pha += block_size; phb += block_size; } - sum_above = 0; - sum_below = 0; - nb_above = 0; - nb_below = 0; - for (int n = 0; n < M; n++) { + + if (k == 0) { + pC[i] = 0.5; + continue; + } + + double sum_above = 0, sum_below = 0; + for (int n = 0; n < k; n++) { if (dist[n] <= threshold) { sum_below += 1 - dist[n]; - nb_below++; } else { sum_above += 1 - dist[n]; - nb_above++; } } - above_factor = sum_above / M; - below_factor = sum_below / M; + double above_factor = sum_above / k; + double below_factor = sum_below / k; pC[i] = 0.5 * (1 + below_factor - above_factor); } free(dist); return pC; } -#ifdef HAVE_PTHREAD - -void *ph_audio_thread(void *p) { - slice *s = (slice *)p; - for (int i = 0; i < s->n; ++i) { - DP *dp = (DP *)s->hash_p[i]; - int N, count; - pair *p = (pair *)s->hash_params; - float *buf = ph_readaudio(dp->id, p->first, p->second, NULL, N); - uint32_t *hash = ph_audiohash(buf, N, p->first, count); - free(buf); - buf = NULL; - dp->hash = hash; - dp->hash_length = count; - } -} - -DP **ph_audio_hashes(char *files[], int count, int sr, int channels, - int threads) { - if (!files || count == 0) return NULL; - int num_threads; - if (threads > count) { - num_threads = count; - } else if (threads > 0) { - num_threads = threads; - } else { - num_threads = ph_num_threads(); - } - - DP **hashes = (DP **)malloc(count * sizeof(DP *)); - - for (int i = 0; i < count; ++i) { - hashes[i] = (DP *)malloc(sizeof(DP)); - hashes[i]->id = strdup(files[i]); - } - - pthread_t thds[num_threads]; - - int rem = count % num_threads; - int start = 0; - int off = 0; - slice *s = new slice[num_threads]; - for (int n = 0; n < num_threads; ++n) { - off = (int)floor((count / (float)num_threads) + - (rem > 0 ? num_threads - (count % num_threads) : 0)); - - s[n].hash_p = &hashes[start]; - s[n].n = off; - s[n].hash_params = new pair(sr, channels); - start += off; - --rem; - pthread_create(&thds[n], NULL, ph_audio_thread, &s[n]); - } - for (int i = 0; i < num_threads; ++i) { - pthread_join(thds[i], NULL); - delete (pair *)s[i].hash_params; - } - delete[] s; - - return hashes; -} -#endif +/* + * The previous version of this file contained a HAVE_PTHREAD-gated + * ph_audio_thread / ph_audio_hashes pair that referenced types `slice` and + * `DP` and a function `ph_num_threads()` that were never declared anywhere in + * the public headers, so the block could not actually be compiled. The + * slicing math was also off-by-one: for count=10, threads=3 it produced + * (5, 3, 3) which walks one element past the input. The dead code has been + * removed; callers can drive ph_readaudio / ph_audiohash from their own + * thread pool. + */ diff --git a/src/ph_fft.cpp b/src/ph_fft.cpp index db261576..0e5d81d3 100644 --- a/src/ph_fft.cpp +++ b/src/ph_fft.cpp @@ -24,10 +24,12 @@ #include "ph_fft.h" -void fft_calc(const int N, const double *x, complex *X, - complex *P, const int step, - const complex *twids) { - complex *S = P + N / 2; +#include + +static void fft_calc(const int N, const double *x, std::complex *X, + std::complex *P, const int step, + const std::complex *twids) { + std::complex *S = P + N / 2; if (N == 1) { X[0] = x[0]; return; @@ -36,26 +38,25 @@ void fft_calc(const int N, const double *x, complex *X, fft_calc(N / 2, x, S, X, 2 * step, twids); fft_calc(N / 2, x + step, P, X, 2 * step, twids); - int k; - for (k = 0; k < N / 2; k++) { + for (int k = 0; k < N / 2; k++) { P[k] = P[k] * twids[k * step]; X[k] = S[k] + P[k]; X[k + N / 2] = S[k] - P[k]; } } -int fft(double *x, int N, complex *X) { - complex *twiddle_factors = new complex[N / 2]; - complex *Xt = new complex[N]; +int fft(double *x, int N, std::complex *X) { + if (x == NULL || X == NULL || N <= 0) return -1; + // Cooley-Tukey radix-2 requires N to be a power of 2. + if ((N & (N - 1)) != 0) return -1; - int k; - for (k = 0; k < N / 2; k++) { - twiddle_factors[k] = polar(1.0, 2.0 * M_PI * k / N); - } - fft_calc(N, x, X, Xt, 1, twiddle_factors); + std::vector> twiddle_factors(N / 2); + std::vector> Xt(N); - delete[](twiddle_factors); - delete[](Xt); + for (int k = 0; k < N / 2; k++) { + twiddle_factors[k] = std::polar(1.0, 2.0 * M_PI * k / N); + } + fft_calc(N, x, X, Xt.data(), 1, twiddle_factors.data()); return 0; } diff --git a/src/ph_fft.h b/src/ph_fft.h index 69ece097..7631e552 100644 --- a/src/ph_fft.h +++ b/src/ph_fft.h @@ -25,10 +25,10 @@ #ifndef _FFT_H #define _FFT_H -#include -#include -#include -using namespace std; -int fft(double *x, int N, complex *X); +#include +#include +#include + +int fft(double *x, int N, std::complex *X); #endif From b32ff93b479bff377fb7de6a27bce7999775fe45 Mon Sep 17 00:00:00 2001 From: pHash Audit Date: Mon, 25 May 2026 11:25:15 -0700 Subject: [PATCH 2/2] Align audio hash params with Haitsma-Kalker reference Audit \xc2\xa74: the bit derivation, filterbank shape, and confidence score in ph_audiohash already matched Haitsma-Kalker 2002, but several parameters did not: - frame_length was hard-coded to 4096 samples regardless of sample rate. At the example's sr=8000 that is 512 ms, far from the paper's 0.37 s frame. Now derived as the power of 2 closest to sr * 0.37. - frame advance was frame_length / 32 (97% overlap) giving ~62 frames/s at sr=8000. The paper specifies 31.25 frames/s (~32 ms advance). Now derived as round(sr / 31.25). - maxfreq was 3000 Hz; the paper specifies 2000 Hz. The previous range extended the upper band by 1 kHz beyond Haitsma-Kalker. Now 2000 Hz. nfft_half is now frame_length / 2 (was hard-coded 2048). Bit derivation and filter weights are unchanged. Note: this changes the temporal density of the fingerprint. Callers passing block_size to ph_audio_distance_ber will likely want a smaller value (e.g. 64 instead of the example's 256) because the absolute frame count for the same audio drops by ~2x. Hashes produced by this code are not compatible with hashes produced by the old parameters. --- src/audiophash.cpp | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/audiophash.cpp b/src/audiophash.cpp index 08cfaf54..bfff707d 100644 --- a/src/audiophash.cpp +++ b/src/audiophash.cpp @@ -335,16 +335,32 @@ float *ph_readaudio(const char *filename, int sr, int /*channels*/, return ph_readaudio2(filename, sr, sigbuf, buflen, nbsecs); } +// Round target up to the nearest power of 2, then choose whichever of +// {that, that/2} is closer to target. Used to size FFT frames given a +// target duration. The custom radix-2 FFT in ph_fft requires a power +// of 2. +static int closest_pow2(double target) { + if (target <= 1.0) return 1; + int p = 1; + while ((double)p < target) p *= 2; + int prev = p / 2; + return (target - (double)prev) > ((double)p - target) ? p : prev; +} + uint32_t *ph_audiohash(float *buf, int N, int sr, int &nb_frames) { nb_frames = 0; - const int frame_length = 4096; + if (buf == NULL || sr <= 0) return NULL; + + // Haitsma-Kalker parameters: 0.37 s frames, 31.25 frames/sec + // (i.e. ~32 ms advance). Frame size is rounded to a power of 2 + // because the bundled FFT is radix-2. + const int frame_length = closest_pow2((double)sr * 0.37); const int nfft = frame_length; - const int nfft_half = 2048; - if (buf == NULL || N < frame_length || sr <= 0) return NULL; + const int nfft_half = frame_length / 2; + if (N < frame_length) return NULL; - const int overlap = 31 * frame_length / 32; - const int advance = frame_length - overlap; - if (advance <= 0) return NULL; + int advance = (int)std::round((double)sr / 31.25); + if (advance <= 0) advance = 1; nb_frames = (N - frame_length) / advance + 1; if (nb_frames <= 0) { @@ -361,8 +377,11 @@ uint32_t *ph_audiohash(float *buf, int N, int sr, int &nb_frames) { std::vector> pF(nfft); std::vector magnF(nfft_half); + // Haitsma-Kalker: 33 log-spaced bands from 300 to 2000 Hz. The + // previous value of 3000 Hz extended the upper band by 1 kHz beyond + // the paper. const double minfreq = 300; - const double maxfreq = 3000; + const double maxfreq = 2000; const double minbark = 6 * asinh(minfreq / 600.0); const double maxbark = 6 * asinh(maxfreq / 600.0); const double nyqbark = maxbark - minbark;