aboutsummaryrefslogtreecommitdiffstats
path: root/squeeze.c
diff options
context:
space:
mode:
Diffstat (limited to 'squeeze.c')
-rw-r--r--squeeze.c546
1 files changed, 0 insertions, 546 deletions
diff --git a/squeeze.c b/squeeze.c
deleted file mode 100644
index 09e7e2e..0000000
--- a/squeeze.c
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
-Copyright 2011 Google Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-Author: lode.vandevenne@gmail.com (Lode Vandevenne)
-Author: jyrki.alakuijala@gmail.com (Jyrki Alakuijala)
-*/
-
-#include "squeeze.h"
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-
-#include "blocksplitter.h"
-#include "deflate.h"
-#include "tree.h"
-#include "util.h"
-
-typedef struct SymbolStats {
- /* The literal and length symbols. */
- size_t litlens[288];
- /* The 32 unique dist symbols, not the 32768 possible dists. */
- size_t dists[32];
-
- double ll_symbols[288]; /* Length of each lit/len symbol in bits. */
- double d_symbols[32]; /* Length of each dist symbol in bits. */
-} SymbolStats;
-
-/* Sets everything to 0. */
-static void InitStats(SymbolStats* stats) {
- memset(stats->litlens, 0, 288 * sizeof(stats->litlens[0]));
- memset(stats->dists, 0, 32 * sizeof(stats->dists[0]));
-
- memset(stats->ll_symbols, 0, 288 * sizeof(stats->ll_symbols[0]));
- memset(stats->d_symbols, 0, 32 * sizeof(stats->d_symbols[0]));
-}
-
-static void CopyStats(SymbolStats* source, SymbolStats* dest) {
- memcpy(dest->litlens, source->litlens, 288 * sizeof(dest->litlens[0]));
- memcpy(dest->dists, source->dists, 32 * sizeof(dest->dists[0]));
-
- memcpy(dest->ll_symbols, source->ll_symbols,
- 288 * sizeof(dest->ll_symbols[0]));
- memcpy(dest->d_symbols, source->d_symbols, 32 * sizeof(dest->d_symbols[0]));
-}
-
-/* Adds the bit lengths. */
-static void AddWeighedStatFreqs(const SymbolStats* stats1, double w1,
- const SymbolStats* stats2, double w2,
- SymbolStats* result) {
- size_t i;
- for (i = 0; i < 288; i++) {
- result->litlens[i] =
- (size_t) (stats1->litlens[i] * w1 + stats2->litlens[i] * w2);
- }
- for (i = 0; i < 32; i++) {
- result->dists[i] =
- (size_t) (stats1->dists[i] * w1 + stats2->dists[i] * w2);
- }
- result->litlens[256] = 1; /* End symbol. */
-}
-
-typedef struct RanState {
- unsigned int m_w, m_z;
-} RanState;
-
-static void InitRanState(RanState* state) {
- state->m_w = 1;
- state->m_z = 2;
-}
-
-/* Get random number: "Multiply-With-Carry" generator of G. Marsaglia */
-static unsigned int Ran(RanState* state) {
- state->m_z = 36969 * (state->m_z & 65535) + (state->m_z >> 16);
- state->m_w = 18000 * (state->m_w & 65535) + (state->m_w >> 16);
- return (state->m_z << 16) + state->m_w; /* 32-bit result. */
-}
-
-static void RandomizeFreqs(RanState* state, size_t* freqs, int n) {
- int i;
- for (i = 0; i < n; i++) {
- if ((Ran(state) >> 4) % 3 == 0) freqs[i] = freqs[Ran(state) % n];
- }
-}
-
-static void RandomizeStatFreqs(RanState* state, SymbolStats* stats) {
- RandomizeFreqs(state, stats->litlens, 288);
- RandomizeFreqs(state, stats->dists, 32);
- stats->litlens[256] = 1; /* End symbol. */
-}
-
-static void ClearStatFreqs(SymbolStats* stats) {
- size_t i;
- for (i = 0; i < 288; i++) stats->litlens[i] = 0;
- for (i = 0; i < 32; i++) stats->dists[i] = 0;
-}
-
-/*
-Function that calculates a cost based on a model for the given LZ77 symbol.
-litlen: means literal symbol if dist is 0, length otherwise.
-*/
-typedef double CostModelFun(unsigned litlen, unsigned dist, void* context);
-
-/*
-Cost model which should exactly match fixed tree.
-type: CostModelFun
-*/
-static double GetCostFixed(unsigned litlen, unsigned dist, void* unused) {
- (void)unused;
- if (dist == 0) {
- if (litlen <= 143) return 8;
- else return 9;
- } else {
- int dbits = ZopfliGetDistExtraBits(dist);
- int lbits = ZopfliGetLengthExtraBits(litlen);
- int lsym = ZopfliGetLengthSymbol(litlen);
- double cost = 0;
- if (lsym <= 279) cost += 7;
- else cost += 8;
- cost += 5; /* Every dist symbol has length 5. */
- return cost + dbits + lbits;
- }
-}
-
-/*
-Cost model based on symbol statistics.
-type: CostModelFun
-*/
-static double GetCostStat(unsigned litlen, unsigned dist, void* context) {
- SymbolStats* stats = (SymbolStats*)context;
- if (dist == 0) {
- return stats->ll_symbols[litlen];
- } else {
- int lsym = ZopfliGetLengthSymbol(litlen);
- int lbits = ZopfliGetLengthExtraBits(litlen);
- int dsym = ZopfliGetDistSymbol(dist);
- int dbits = ZopfliGetDistExtraBits(dist);
- return stats->ll_symbols[lsym] + lbits + stats->d_symbols[dsym] + dbits;
- }
-}
-
-/*
-Finds the minimum possible cost this cost model can return for valid length and
-distance symbols.
-*/
-static double GetCostModelMinCost(CostModelFun* costmodel, void* costcontext) {
- double mincost;
- int bestlength = 0; /* length that has lowest cost in the cost model */
- int bestdist = 0; /* distance that has lowest cost in the cost model */
- int i;
- /*
- Table of distances that have a different distance symbol in the deflate
- specification. Each value is the first distance that has a new symbol. Only
- different symbols affect the cost model so only these need to be checked.
- See RFC 1951 section 3.2.5. Compressed blocks (length and distance codes).
- */
- static const int dsymbols[30] = {
- 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
- 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577
- };
-
- mincost = ZOPFLI_LARGE_FLOAT;
- for (i = 3; i < 259; i++) {
- double c = costmodel(i, 1, costcontext);
- if (c < mincost) {
- bestlength = i;
- mincost = c;
- }
- }
-
- mincost = ZOPFLI_LARGE_FLOAT;
- for (i = 0; i < 30; i++) {
- double c = costmodel(3, dsymbols[i], costcontext);
- if (c < mincost) {
- bestdist = dsymbols[i];
- mincost = c;
- }
- }
-
- return costmodel(bestlength, bestdist, costcontext);
-}
-
-/*
-Performs the forward pass for "squeeze". Gets the most optimal length to reach
-every byte from a previous byte, using cost calculations.
-s: the ZopfliBlockState
-in: the input data array
-instart: where to start
-inend: where to stop (not inclusive)
-costmodel: function to calculate the cost of some lit/len/dist pair.
-costcontext: abstract context for the costmodel function
-length_array: output array of size (inend - instart) which will receive the best
- length to reach this byte from a previous byte.
-returns the cost that was, according to the costmodel, needed to get to the end.
-*/
-static double GetBestLengths(ZopfliBlockState *s,
- const unsigned char* in,
- size_t instart, size_t inend,
- CostModelFun* costmodel, void* costcontext,
- unsigned short* length_array) {
- /* Best cost to get here so far. */
- size_t blocksize = inend - instart;
- float* costs;
- size_t i = 0, k;
- unsigned short leng;
- unsigned short dist;
- unsigned short sublen[259];
- size_t windowstart = instart > ZOPFLI_WINDOW_SIZE
- ? instart - ZOPFLI_WINDOW_SIZE : 0;
- ZopfliHash hash;
- ZopfliHash* h = &hash;
- double result;
- double mincost = GetCostModelMinCost(costmodel, costcontext);
-
- if (instart == inend) return 0;
-
- costs = (float*)malloc(sizeof(float) * (blocksize + 1));
- if (!costs) exit(-1); /* Allocation failed. */
-
- ZopfliInitHash(ZOPFLI_WINDOW_SIZE, h);
- ZopfliWarmupHash(in, windowstart, inend, h);
- for (i = windowstart; i < instart; i++) {
- ZopfliUpdateHash(in, i, inend, h);
- }
-
- for (i = 1; i < blocksize + 1; i++) costs[i] = ZOPFLI_LARGE_FLOAT;
- costs[0] = 0; /* Because it's the start. */
- length_array[0] = 0;
-
- for (i = instart; i < inend; i++) {
- size_t j = i - instart; /* Index in the costs array and length_array. */
- ZopfliUpdateHash(in, i, inend, h);
-
-#ifdef ZOPFLI_SHORTCUT_LONG_REPETITIONS
- /* If we're in a long repetition of the same character and have more than
- ZOPFLI_MAX_MATCH characters before and after our position. */
- if (h->same[i & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH * 2
- && i > instart + ZOPFLI_MAX_MATCH + 1
- && i + ZOPFLI_MAX_MATCH * 2 + 1 < inend
- && h->same[(i - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK]
- > ZOPFLI_MAX_MATCH) {
- double symbolcost = costmodel(ZOPFLI_MAX_MATCH, 1, costcontext);
- /* Set the length to reach each one to ZOPFLI_MAX_MATCH, and the cost to
- the cost corresponding to that length. Doing this, we skip
- ZOPFLI_MAX_MATCH values to avoid calling ZopfliFindLongestMatch. */
- for (k = 0; k < ZOPFLI_MAX_MATCH; k++) {
- costs[j + ZOPFLI_MAX_MATCH] = costs[j] + symbolcost;
- length_array[j + ZOPFLI_MAX_MATCH] = ZOPFLI_MAX_MATCH;
- i++;
- j++;
- ZopfliUpdateHash(in, i, inend, h);
- }
- }
-#endif
-
- ZopfliFindLongestMatch(s, h, in, i, inend, ZOPFLI_MAX_MATCH, sublen,
- &dist, &leng);
-
- /* Literal. */
- if (i + 1 <= inend) {
- double newCost = costs[j] + costmodel(in[i], 0, costcontext);
- assert(newCost >= 0);
- if (newCost < costs[j + 1]) {
- costs[j + 1] = newCost;
- length_array[j + 1] = 1;
- }
- }
- /* Lengths. */
- for (k = 3; k <= leng && i + k <= inend; k++) {
- double newCost;
-
- /* Calling the cost model is expensive, avoid this if we are already at
- the minimum possible cost that it can return. */
- if (costs[j + k] - costs[j] <= mincost) continue;
-
- newCost = costs[j] + costmodel(k, sublen[k], costcontext);
- assert(newCost >= 0);
- if (newCost < costs[j + k]) {
- assert(k <= ZOPFLI_MAX_MATCH);
- costs[j + k] = newCost;
- length_array[j + k] = k;
- }
- }
- }
-
- assert(costs[blocksize] >= 0);
- result = costs[blocksize];
-
- ZopfliCleanHash(h);
- free(costs);
-
- return result;
-}
-
-/*
-Calculates the optimal path of lz77 lengths to use, from the calculated
-length_array. The length_array must contain the optimal length to reach that
-byte. The path will be filled with the lengths to use, so its data size will be
-the amount of lz77 symbols.
-*/
-static void TraceBackwards(size_t size, const unsigned short* length_array,
- unsigned short** path, size_t* pathsize) {
- size_t index = size;
- if (size == 0) return;
- for (;;) {
- ZOPFLI_APPEND_DATA(length_array[index], path, pathsize);
- assert(length_array[index] <= index);
- assert(length_array[index] <= ZOPFLI_MAX_MATCH);
- assert(length_array[index] != 0);
- index -= length_array[index];
- if (index == 0) break;
- }
-
- /* Mirror result. */
- for (index = 0; index < *pathsize / 2; index++) {
- unsigned short temp = (*path)[index];
- (*path)[index] = (*path)[*pathsize - index - 1];
- (*path)[*pathsize - index - 1] = temp;
- }
-}
-
-static void FollowPath(ZopfliBlockState* s,
- const unsigned char* in, size_t instart, size_t inend,
- unsigned short* path, size_t pathsize,
- ZopfliLZ77Store* store) {
- size_t i, j, pos = 0;
- size_t windowstart = instart > ZOPFLI_WINDOW_SIZE
- ? instart - ZOPFLI_WINDOW_SIZE : 0;
-
- size_t total_length_test = 0;
-
- ZopfliHash hash;
- ZopfliHash* h = &hash;
-
- if (instart == inend) return;
-
- ZopfliInitHash(ZOPFLI_WINDOW_SIZE, h);
- ZopfliWarmupHash(in, windowstart, inend, h);
- for (i = windowstart; i < instart; i++) {
- ZopfliUpdateHash(in, i, inend, h);
- }
-
- pos = instart;
- for (i = 0; i < pathsize; i++) {
- unsigned short length = path[i];
- unsigned short dummy_length;
- unsigned short dist;
- assert(pos < inend);
-
- ZopfliUpdateHash(in, pos, inend, h);
-
- /* Add to output. */
- if (length >= ZOPFLI_MIN_MATCH) {
- /* Get the distance by recalculating longest match. The found length
- should match the length from the path. */
- ZopfliFindLongestMatch(s, h, in, pos, inend, length, 0,
- &dist, &dummy_length);
- assert(!(dummy_length != length && length > 2 && dummy_length > 2));
- ZopfliVerifyLenDist(in, inend, pos, dist, length);
- ZopfliStoreLitLenDist(length, dist, store);
- total_length_test += length;
- } else {
- length = 1;
- ZopfliStoreLitLenDist(in[pos], 0, store);
- total_length_test++;
- }
-
-
- assert(pos + length <= inend);
- for (j = 1; j < length; j++) {
- ZopfliUpdateHash(in, pos + j, inend, h);
- }
-
- pos += length;
- }
-
- ZopfliCleanHash(h);
-}
-
-/* Calculates the entropy of the statistics */
-static void CalculateStatistics(SymbolStats* stats) {
- ZopfliCalculateEntropy(stats->litlens, 288, stats->ll_symbols);
- ZopfliCalculateEntropy(stats->dists, 32, stats->d_symbols);
-}
-
-/* Appends the symbol statistics from the store. */
-static void GetStatistics(const ZopfliLZ77Store* store, SymbolStats* stats) {
- size_t i;
- for (i = 0; i < store->size; i++) {
- if (store->dists[i] == 0) {
- stats->litlens[store->litlens[i]]++;
- } else {
- stats->litlens[ZopfliGetLengthSymbol(store->litlens[i])]++;
- stats->dists[ZopfliGetDistSymbol(store->dists[i])]++;
- }
- }
- stats->litlens[256] = 1; /* End symbol. */
-
- CalculateStatistics(stats);
-}
-
-/*
-Does a single run for ZopfliLZ77Optimal. For good compression, repeated runs
-with updated statistics should be performed.
-
-s: the block state
-in: the input data array
-instart: where to start
-inend: where to stop (not inclusive)
-path: pointer to dynamically allocated memory to store the path
-pathsize: pointer to the size of the dynamic path array
-length_array: array if size (inend - instart) used to store lengths
-costmodel: function to use as the cost model for this squeeze run
-costcontext: abstract context for the costmodel function
-store: place to output the LZ77 data
-returns the cost that was, according to the costmodel, needed to get to the end.
- This is not the actual cost.
-*/
-static double LZ77OptimalRun(ZopfliBlockState* s,
- const unsigned char* in, size_t instart, size_t inend,
- unsigned short** path, size_t* pathsize,
- unsigned short* length_array, CostModelFun* costmodel,
- void* costcontext, ZopfliLZ77Store* store) {
- double cost = GetBestLengths(
- s, in, instart, inend, costmodel, costcontext, length_array);
- free(*path);
- *path = 0;
- *pathsize = 0;
- TraceBackwards(inend - instart, length_array, path, pathsize);
- FollowPath(s, in, instart, inend, *path, *pathsize, store);
- assert(cost < ZOPFLI_LARGE_FLOAT);
- return cost;
-}
-
-void ZopfliLZ77Optimal(ZopfliBlockState *s,
- const unsigned char* in, size_t instart, size_t inend,
- ZopfliLZ77Store* store) {
- /* Dist to get to here with smallest cost. */
- size_t blocksize = inend - instart;
- unsigned short* length_array =
- (unsigned short*)malloc(sizeof(unsigned short) * (blocksize + 1));
- unsigned short* path = 0;
- size_t pathsize = 0;
- ZopfliLZ77Store currentstore;
- SymbolStats stats, beststats, laststats;
- int i;
- double cost;
- double bestcost = ZOPFLI_LARGE_FLOAT;
- double lastcost = 0;
- /* Try randomizing the costs a bit once the size stabilizes. */
- RanState ran_state;
- int lastrandomstep = -1;
-
- if (!length_array) exit(-1); /* Allocation failed. */
-
- InitRanState(&ran_state);
- InitStats(&stats);
- ZopfliInitLZ77Store(&currentstore);
-
- /* Do regular deflate, then loop multiple shortest path runs, each time using
- the statistics of the previous run. */
-
- /* Initial run. */
- ZopfliLZ77Greedy(s, in, instart, inend, &currentstore);
- GetStatistics(&currentstore, &stats);
-
- /* Repeat statistics with each time the cost model from the previous stat
- run. */
- for (i = 0; i < s->options->numiterations; i++) {
- ZopfliCleanLZ77Store(&currentstore);
- ZopfliInitLZ77Store(&currentstore);
- LZ77OptimalRun(s, in, instart, inend, &path, &pathsize,
- length_array, GetCostStat, (void*)&stats,
- &currentstore);
- cost = ZopfliCalculateBlockSize(currentstore.litlens, currentstore.dists,
- 0, currentstore.size, 2);
- if (s->options->verbose_more || (s->options->verbose && cost < bestcost)) {
- fprintf(stderr, "Iteration %d: %d bit\n", i, (int) cost);
- }
- if (cost < bestcost) {
- /* Copy to the output store. */
- ZopfliCopyLZ77Store(&currentstore, store);
- CopyStats(&stats, &beststats);
- bestcost = cost;
- }
- CopyStats(&stats, &laststats);
- ClearStatFreqs(&stats);
- GetStatistics(&currentstore, &stats);
- if (lastrandomstep != -1) {
- /* This makes it converge slower but better. Do it only once the
- randomness kicks in so that if the user does few iterations, it gives a
- better result sooner. */
- AddWeighedStatFreqs(&stats, 1.0, &laststats, 0.5, &stats);
- CalculateStatistics(&stats);
- }
- if (i > 5 && cost == lastcost) {
- CopyStats(&beststats, &stats);
- RandomizeStatFreqs(&ran_state, &stats);
- CalculateStatistics(&stats);
- lastrandomstep = i;
- }
- lastcost = cost;
- }
-
- free(length_array);
- free(path);
- ZopfliCleanLZ77Store(&currentstore);
-}
-
-void ZopfliLZ77OptimalFixed(ZopfliBlockState *s,
- const unsigned char* in,
- size_t instart, size_t inend,
- ZopfliLZ77Store* store)
-{
- /* Dist to get to here with smallest cost. */
- size_t blocksize = inend - instart;
- unsigned short* length_array =
- (unsigned short*)malloc(sizeof(unsigned short) * (blocksize + 1));
- unsigned short* path = 0;
- size_t pathsize = 0;
-
- if (!length_array) exit(-1); /* Allocation failed. */
-
- s->blockstart = instart;
- s->blockend = inend;
-
- /* Shortest path for fixed tree This one should give the shortest possible
- result for fixed tree, no repeated runs are needed since the tree is known. */
- LZ77OptimalRun(s, in, instart, inend, &path, &pathsize,
- length_array, GetCostFixed, 0, store);
-
- free(length_array);
- free(path);
-}