Huffman coding (or encoding) is a lossless data compression technique based on an algorithm developed by David A. Huffman in 1952. In this technique, a variable-length, bit-level code is assigned to each character. The length of a code is related to the frequency of its character: the most frequent character is represented by the shortest code, while the least frequent character gets the longest code. Since character frequencies vary between data sets or files, no single code table works efficiently for every data set.

Implementation of the Huffman algorithm has four main steps:

Here is an example of implementing the Huffman algorithm in C:

First, variable definitions:

// Global working state shared by the compression routines below.
PBYTE psrcbuf;                      // input (source) buffer
PBYTE pdestbuf;                     // output (destination) buffer
UINT uisrcsize;                     // size of the input buffer in bytes
UINT idestsize;                     // size of the output buffer in bytes
UINT i;                             // general-purpose loop index
UINT j;                             // general-purpose loop index
int iinsert;                        // insertion index returned by Insert2Array
DWORD dwtmp;                        // combined frequency of two merged nodes
int inodecnt;                       // number of internal nodes created so far
int iremain;                        // nodes still waiting to be merged
int ileafcnt;                       // number of distinct byte values with freq > 0
DWORD itmpcode;                     // scratch value used while building codes
int ilevel;                         // current depth while walking up the tree
UINT uidestcnt;                     // bytes written to the destination buffer
_Table tree[511];                   // 256 possible leaves + up to 255 internal nodes
_HashTableComp HashTableComp[256];  // per-byte lookup table used while encoding
_HashTable HashTable[256];          // compact code table stored with the output
_Header Header = {"D52Huf", 0, 0};  // output header: signature, size, leaf count

where:

/* One node of the Huffman tree (leaf or internal).
 * NOTE(review): identifiers beginning with '_' + uppercase letter
 * (_Table_, _Header, ...) are reserved for the implementation in C;
 * consider renaming if the interface can be changed. */
struct _Table_ {
    DWORD dwfreq;            /* occurrence count of this byte / subtree */
    UCHAR ucAsciiCode;       /* the byte value this leaf represents */
    WORD wCode;              /* assigned Huffman code bits */
    char cCodeLength;        /* code length in bits; -1 apparently marks
                                internal nodes (see the tree-building pass)
                                — not set in the visible code, TODO confirm */
    struct _Table_ *pParent; /* parent node; NULL at the root */
};
typedef struct _Table_ _Table;

/* Fixed-size header written at the start of the compressed stream. */
typedef struct {
    char sSignature[7]; /* magic string "D52Huf" (6 chars + NUL) */
    DWORD dwSize;       /* original (uncompressed) data size in bytes */
    int iLeafCnt;       /* number of _HashTable entries following the header */
} _Header;

/* Per-byte encoding entry, indexed directly by the raw byte value. */
typedef struct {
    DWORD dwfreq;      /* frequency of this byte in the input */
    UCHAR ucAsciiCode; /* the byte value itself */
    WORD wCode;        /* Huffman code bits (bit-reversed for emission) */
    char cCodeLength;  /* code length in bits */
} _HashTableComp;

/* Compact code-table entry stored in the output file for the decoder. */
typedef struct {
    UCHAR ucAsciiCode; /* the byte value */
    WORD wCode;        /* Huffman code bits */
    char cCodeLength;  /* code length in bits */
} _HashTable;

Calculate the frequency of characters;

/* Count byte frequencies and collect the used byte values.
 * Afterwards tree[0..ileafcnt-1] holds the used bytes sorted by
 * ascending frequency, ready for the tree-building pass. */
ZeroMemory(tree, sizeof(tree));
ZeroMemory(HashTableComp, sizeof(HashTableComp));
ZeroMemory(HashTable, sizeof(HashTable));
// initialize tree with ascii codes
for(j = 0; j < 256; j++) {
    tree[j].ucAsciiCode = j;
    HashTableComp[j].ucAsciiCode = j;
}
// calculate freq of ascii codes
for(i = 0; i < uisrcsize; i++)
    tree[psrcbuf[i]].dwfreq += 1;
// sort in descending order by frequency
qsort(tree, 256, sizeof(_Table), DescendingSortByFrequency);
// find size of freq list to exclude unused chars
// (the scan terminates because tree[256..510] were zeroed above)
ileafcnt = 0;
while(tree[ileafcnt].dwfreq > 0) {
    ileafcnt++;
};
// sort in ascending order by frequency
qsort(tree, ileafcnt, sizeof(_Table), AscendingSortByFrequency);

Build Prefix Tree

example; assume that data set consists of following 10 bytes as input; FF FF 3F 07 15 FF 00 00 00 00

Created binary tree;

Char

Freq

Leaf

3F

1

2

07

15

FF

00

4

3

leaf

5

9

Here is the binary tree;

/* Build the Huffman tree bottom-up: repeatedly take the two
 * lowest-frequency nodes at tree[i] and tree[i+1], give them the
 * child bits 0 and 1, and insert a merged parent node back into the
 * frequency-sorted remainder of the array via Insert2Array. */
iremain = ileafcnt;
i = 0;
inodecnt = 0;
do {
    // combined frequency of the two cheapest remaining nodes
    dwtmp = tree[i].dwfreq + tree[i + 1].dwfreq;
    // reset code lengths on leaves; -1 presumably marks internal
    // nodes (set inside Insert2Array, not visible here) — TODO confirm
    if(tree[i].cCodeLength != -1) { tree[i].cCodeLength = 0; }
    if(tree[i + 1].cCodeLength != -1) { tree[i + 1].cCodeLength = 0; }
    // left child contributes bit 0, right child bit 1
    tree[i].wCode = 0;
    tree[i + 1].wCode = 1;
    iremain--;
    // insert the merged node, keeping the unprocessed range sorted
    iinsert = Insert2Array(i + 1, ileafcnt + inodecnt, dwtmp, tree);
    tree[i].pParent = tree[i + 1].pParent = &tree[iinsert];
    i += 2;
    inodecnt++;
} while(iremain > 1);

Huffman code table;

Code

1111

1110

110

10

0

_Table *root = tree[0].pParent;
// calculation of codes: for every leaf, walk up to the root and OR in
// one bit per level (the leaf's own 0/1 bit already sits at bit 0)
j = 0;
for(i = 0; (int) i < (inodecnt + ileafcnt); i++) {
    itmpcode = 0;
    // only leaves carry a code; cCodeLength == -1 marks internal nodes
    if(tree[i].cCodeLength != -1) {
        root = tree[i].pParent;
        ilevel = 1;
        do {
            // each ancestor's 0/1 bit lands one position higher,
            // so tree[i].wCode ends up LSB = leaf bit, MSB side = root bit
            itmpcode = root->wCode << ilevel++;
            tree[i].wCode |= itmpcode;
            tree[i].cCodeLength++;
            root = root->pParent;
        } while(root != NULL);
        // reject over-long codes.  NOTE(review): wCode is a 16-bit WORD,
        // so lengths above 16 already overflow — this limit looks like it
        // should be 16, confirm
        if(tree[i].cCodeLength > 32) return 0; //does not allow
        HashTableComp[tree[i].ucAsciiCode].dwfreq = tree[i].dwfreq;
        // NOTE(review): this stores the un-reversed code, while
        // HashTableComp below receives the bit-reversed one — verify the
        // decoder expects this orientation
        HashTable[j].wCode = tree[i].wCode;
        // reverse the bit order so the encoder can emit the code LSB-first
        for(int k = 0; k < tree[i].cCodeLength; k++) {
            itmpcode = HashTableComp[tree[i].ucAsciiCode].wCode;
            HashTableComp[tree[i].ucAsciiCode].wCode = (itmpcode << 1) | ((tree[i].wCode >> k) & 1);
        }
        HashTable[j].ucAsciiCode = tree[i].ucAsciiCode;
        HashTable[j].cCodeLength = tree[i].cCodeLength;
        HashTableComp[tree[i].ucAsciiCode].cCodeLength = tree[i].cCodeLength;
        j++;
    }
}

The last step is to convert the input data to encoded data:

80 bits

21 bits

The encoded data in hex is 19 FC A0. Since data is stored on disk in whole bytes, the coded data needs to be padded with “0”s on the MSB or LSB side. This example uses padding on the MSB side. As a result, the input is compressed by a factor of about 3, since it takes 3 bytes while the original is 10 bytes.

/*
 * Bit2Byte — pack the Huffman code of every input byte into destbuf.
 *
 * HashTable  per-byte code table indexed by the raw byte value; wCode holds
 *            the code bits LSB-first, cCodeLength the length in bits.
 * ileafcnt   number of used table entries (not needed by the packing loop).
 * srcbuf     input data, uisrcsize bytes.
 * destbuf    output buffer; caller must size it for the packed stream.
 *
 * Returns the number of bytes written to destbuf.
 *
 * A 16-bit accumulator (wword) collects code bits; whenever it holds at
 * least 16 bits, two bytes are flushed (low byte first) and the bits of
 * the current code that did not fit are re-seeded from wCode.
 */
UINT Bit2Byte(_HashTableComp HashTable[], int ileafcnt, BYTE srcbuf[], UINT uisrcsize, BYTE destbuf[])
{
    UINT i;
    WORD wword = 0;     /* bit accumulator */
    BYTE ucbitcnt = 0;  /* valid bits currently in wword */
    UINT uidestcnt = 0; /* bytes emitted so far */

    for(i = 0; i < uisrcsize; i++) {
        wword |= HashTable[srcbuf[i]].wCode << ucbitcnt;
        ucbitcnt += HashTable[srcbuf[i]].cCodeLength;
        /* BUGFIX: was '> 16'.  With exactly 16 pending bits the next
         * code would be shifted entirely out of the 16-bit accumulator
         * ('wCode << 16' truncated on store) and silently lost.  Flush
         * as soon as 16 bits are ready. */
        if(ucbitcnt >= 16) {
            ucbitcnt -= 16;
            destbuf[uidestcnt++] = (BYTE)(wword & 0x0FF);
            destbuf[uidestcnt++] = (BYTE)((wword >> 8) & 0x0FF);
            wword = 0;
            if(ucbitcnt > 0) /* current code straddled the flush boundary */
                wword = HashTable[srcbuf[i]].wCode >> (HashTable[srcbuf[i]].cCodeLength - ucbitcnt);
        }
    }
    /* Flush the remaining 1..15 bits.  BUGFIX: the second byte is only
     * written when more than 8 bits remain; the old code always wrote
     * two bytes and could append a spurious zero byte (the article's own
     * example — 21 bits — must produce 3 bytes, not 4). */
    if(ucbitcnt > 0) {
        destbuf[uidestcnt++] = (BYTE)(wword & 0x0FF);
        if(ucbitcnt > 8)
            destbuf[uidestcnt++] = (BYTE)((wword >> 8) & 0x0FF);
    }
    /* NOTE(review): the original ended with '*uiprogress = 100;', but
     * 'uiprogress' is declared nowhere (not a parameter, not a listed
     * global) — a compile error.  Removed; progress reporting belongs
     * to the caller. */
    return uidestcnt;
}

However, the Huffman code table will be needed to decompress the data. Therefore, it needs to be stored with the compressed data. The size of the compressed data file will be increased by the size of the table.

Because of this overhead, the size of the compressed file could be larger than the original if the original data is smaller than the Huffman code table.

// save source buffer length and number of Huffman tree leaves Header.dwSize = uisrcsize; Header.iLeafCnt = ileafcnt; PBYTE pdestptr = pdestbuf; memset(pdestptr, 0, idestsize); *(_Header*)pdestbuf = Header; pdestptr += sizeof(Header); // save Huffman tree used leaves for(i = 0; i < (UINT)Header.iLeafCnt; i++) { memcpy(pdestptr, &HashTable[i], sizeof(_HashTable)); pdestptr += sizeof(_HashTable); }