基于哈夫曼编码完成的文件压缩及解压

原创工程项目 2022年1月26日 11:19 夏至未至 1875

当前内容 8797 字，在路上，马上到，马上到

### 压缩步骤
1.获取文件信息（这里采用TXT格式文本）；
2.压缩文件；
3.写配置文件（便于解压时用，无非就是存放原文件的索引之类的，比如说，文件中某个字符出现的个数，记录下来）
4.解压缩，使用压缩后的文件和配置文件解压文件；
5.用比对软件，比对解压后的文件和源文件是否相同；

### 代码结构
#### 文件信息类
    typedef long long LongType;
    struct FileInfo
    {
    	unsigned char _ch;       //字符
    	LongType _count;         //字符出现次数
    	string _code;            //字符对应的哈夫曼编码 
     
    	FileInfo(unsigned char ch = 0)
    		:_ch(ch)
    		,_count(0)
    	{}
     
     
    	FileInfo operator+(const FileInfo& x)
    	{
    		FileInfo tmp;
    		tmp._count = this->_count + x._count;
    		return tmp;
    	}
     
    	bool operator !=(const FileInfo& x) const
    	{
    		return this->_count != x._count;
    	}
    };
     
    bool operator<(const FileInfo info1,const FileInfo info2)
    {
    	return info1._count < info2._count;
    }
此为一个文件信息的类结构，包含字符，字符对应出现的次数，以及这个字符对应的哈夫曼编码
除了统计字符出现的次数及哈夫曼编码，还完成了几个运算符的重载

#### 建小堆
要获取哈夫曼编码，就得建立哈夫曼树，建立哈夫曼树用最小堆取操作，以下是最小堆建立过程

// 小堆
    template<class T>
    struct Less
    {
    	bool operator() (const T& l, const T& r)
    	{
    		return l < r; // operator<
    	}
     
    };
     
    template<class T>
    struct Greater
    {
    	bool operator() (const T& l, const T& r)
    	{
    		return l > r; // operator<
    	}
    };
     
    template<class T, class Compare = Less<T>>
    class Heap
    {
    public:
    	Heap()
    	{}
     
    	Heap(const T* a, size_t size)
    	{
    		for (size_t i = 0; i < size; ++i)
    		{
    			_arrays.push_back(a[i]);
    		}
     
    		// 建堆
    		for(int i = (_arrays.size()-2)/2; i >= 0; --i)
    		{
    			AdjustDown(i);
    		}
    	}
     
    	void Push(const T& x)
    	{
    		_arrays.push_back(x);
    		AdjustUp(_arrays.size()-1);
    	}
     
    	void Pop()
    	{
    		assert(_arrays.size() > 0);
    		swap(_arrays[0], _arrays[_arrays.size() - 1]);
    		_arrays.pop_back();
     
    		AdjustDown(0);
    	}
     
    	T& Top()
    	{
    		assert(_arrays.size() > 0);
    		return _arrays[0];
    	}
     
    	bool Empty()
    	{
    		return _arrays.empty();
    	}
     
    	int Size()
    	{
    		return _arrays.size();
    	}
     
    	void AdjustDown(int root)
    	{
    		int child = root*2 + 1;
    		// 	
    		Compare com;
    		while (child < _arrays.size())
    		{
    			// 比较出左右孩子中小的那个
    			if (child+1<_arrays.size() &&
    				*_arrays[child+1] < _arrays[child])
    			//if(child+1<_arrays.size() &&
    			//	com(_arrays[child+1],_arrays[child]))
    			{
    				++child;
    			}
     
    			if(*_arrays[child] < _arrays[root])
    			//if(com(_arrays[child],_arrays[root]))
    			{
    				swap(_arrays[child], _arrays[root]);
    				root = child;
    				child = 2*root+1;
    			}
    			else
    			{
    				break;
    			}
    		}
    	}
     
    	void AdjustUp(int child)
    	{
    		int parent = (child-1)/2;
     
    		//while (parent >= 0)
    		while (child > 0)
    		{
    			if (*_arrays[child] < _arrays[parent])
    			{
    				swap(_arrays[parent], _arrays[child]);
    				child = parent;
    				parent = (child-1)/2;
    			}
    			else
    			{
    				break;
    			}
    		}
    	}
     
     
    public:
    	vector<T> _arrays;
    };
最小堆里也完成了很多接口，包括push  pop等

#### 获取哈夫曼编码
根据哈夫曼树获取哈夫曼变慢：

void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
	{
		if (root == nullptr)
		{
			return;
		}
 
		_GenerateHuffmanCode(root->_left);
		_GenerateHuffmanCode(root->_right);
 
		//当前节点为叶子节点为空  才生成哈夫曼编码
		if (root->_left == nullptr && root->_right == nullptr)
		{
			HuffmanTreeNode<FileInfo>* cur = root;
			HuffmanTreeNode<FileInfo>* parent = cur->_parent;
			string& code = _infos[cur->_weight._ch]._code;
 
			while (parent)
			{
				if (parent->_left == cur)
				{
					code += '1';
				}
				else if (parent->_right == cur)
				{
					code += '0';
				}
				cur = parent;
				parent = cur->_parent;
			}
			reverse(code.begin(), code.end());
		}
	}

#### 建立哈夫曼树
    void CreateTree(T *a, size_t size, const T& invalid)
	{
		assert(a);
		Heap<HuffmanTreeNode<T>*> s1;  //草 终于发现问题  在这里   （堆里放的是指针，类型一定要对）
 
		//找两个最小的元素
		for (size_t i = 0; i < size; ++i)
		{
			if (a[i] != invalid)
			{
			HuffmanTreeNode<T>* node = new HuffmanTreeNode<T>(a[i]);
				s1.Push(node);
			}
		}
 
		while (s1.Size() > 1)
		{
			HuffmanTreeNode<T>* left = s1.Top();
			s1.Pop();
			HuffmanTreeNode<T>* right = s1.Top();
			s1.Pop();
 
 
			HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);
 
		
			parent->_left = left;
			parent->_right = right;
 
			left->_parent = parent;
			right->_parent = parent;
 
			s1.Push(parent);
		}
		_root = s1.Top();
		s1.Pop();
	}

#### 行级读取文本
	bool _ReadLine(FILE *fOutLogFile, string& line)
	{
		char ch = fgetc(fOutLogFile);
		if (feof(fOutLogFile))
			return false;
		else
		{
			if (ch == '\n')
			{
				line += ch;
				ch = fgetc(fOutLogFile);
			}
 
			while (ch != '\n')
			{
				line += ch;
				ch = fgetc(fOutLogFile);
			}
			return true;
		}
	}

#### 文件压缩
	bool Compress(const char* filename)
	{
		//1.打开一个文件，统计文件字符出现的次数
		//2.生成对应的哈弗曼编码
		//3.压缩文件
		//4.写配置文件，方便解压缩
 
		assert(filename);
		FILE *fOut = fopen(filename, "rb");
		assert(fOut);
 
		//统计文件字符出现的次数
		unsigned char ch = fgetc(fOut);
		while (!feof(fOut))  //文件结束
		{
			_infos[ch]._count++;
			ch = fgetc(fOut);
		}
 
		HuffmanTree<FileInfo> ht;
		FileInfo invalid;
		ht.CreateTree(_infos, 256, invalid);
 
		//哈夫曼编码
		_GenerateHuffmanCode(ht.GetRoot());
 
		string compressFile = filename;
		compressFile += ".huf";
 
		//压缩后的文件名 后缀为《输入文件名+.huf》
		FILE *finCompress = fopen(compressFile.c_str(), "wb"); //获取string中的C字符串
		assert(finCompress);
 
		fseek(fOut, 0, SEEK_SET);//将文件指针移到开头
		char cha = fgetc(fOut);
		unsigned char inch = 0;
		int index = 0;  //一个字节的八位
		while (!feof(fOut))
		{
			string& code = _infos[(unsigned char)cha]._code;
 
			for (size_t i = 0; i < code.size(); ++i)
			{
				inch <<= 1;     //低位向高位进
				if (code[i] == '1')
				{
					inch |= 1;
				}
 
				if (++index == 8)
				{
					fputc(inch, finCompress); //够8位，装进文件
					index = 0;   //重新一轮开始
					inch = 0;
				}
			}
			cha = fgetc(fOut);
		}
 
		fclose(fOut);
 
		//如果index = 0 说明 上边8位刚好存满 不等 下一个自己又出来了
		if (index != 0)   //处理最后一个字符不够的问题
		{
			inch <<= (8 - index); //最高位必须装上 后边的浪费掉
			fputc(inch, finCompress);
		}
 
		fclose(finCompress);
	}

#### 写配置文件
    string logFile = filename;
    logFile += ".log";
    
    FILE *Log = fopen(logFile.c_str(), "wb");
    assert(Log);
    
    string chInfo;
    
    char str[128] = {0}; //没空间 不可以
    
    for (size_t i = 1; i < 256; ++i)
    {
    	if (_infos[i]._count > 0)
    	{
    		chInfo += _infos[i]._ch;
    		chInfo += ',';
    		chInfo += _itoa(_infos[i]._count,str,10);
    		chInfo += '\n';
    		fputs(chInfo.c_str(), Log);
    		chInfo.clear();
    	}
    }
    
    fclose(Log);

#### 文件解压

void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size)
	{
		assert(root);
 
		//原压缩文件
		string name = Fileneme;
		name += ".huf";
		
		FILE* Out = fopen(name.c_str(),"rb");
		assert(Out);
		
		string restorefilename = Fileneme;
		restorefilename += ".over";
		FILE *over = fopen(restorefilename.c_str(),"wb");
		assert(over);
 
		int pos = 8;
		long long poss = size;
 
		unsigned char chz = fgetc(Out);
		while (poss>0)
		{
			HuffmanTreeNode<FileInfo>* cur = nullptr;
			cur = root;
			while (cur->_left != nullptr || cur->_right != nullptr)
			{
				pos--;
				unsigned char temp = chz >> pos;
				int ch = 1 & temp;
				if (ch == 0)
				{
					cur = cur->_right;
				}
 
				else if (ch == 1)
				{
					cur = cur->_left;
				}
 
				if (pos == 0)
				{
					chz = fgetc(Out);
					pos = 8;
				}
			}
			fputc(cur->_weight._ch, over);
 
			poss--;
		}
 
		fclose(Out);
		fclose(over);
	}
 
	void UnCompress(const char* Fileneme)//解压缩
	{
		//1.打开日志文件
		//2.根据信息还原哈夫曼树
		//3.还原信息；
		string UnCompressneme = Fileneme;
		UnCompressneme += ".log";
		FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb");
		assert(fOutLogFile);
 
		string line;
		while (_ReadLine(fOutLogFile, line))
		{
			unsigned char ch = line[0];
			_infos[ch]._count = atoi(line.substr(2).c_str());
			line.clear();
		} 
 
		HuffmanTree<FileInfo> f;
		FileInfo invalid;
		f.CreateTree(_infos, 256, invalid);
 
		//根据重建的哈夫曼树 还原文件；
		long long size = f.GetRoot()->_weight._count;
		_RestoreFiles(f.GetRoot(), Fileneme,size);
	}

本文标题：基于哈夫曼编码完成的文件压缩及解压

本文作者：夏至未至

发布时间： 2022年1月26日 11:19

最近更新： 2022年4月17日 19:44

原文链接：

许可协议：署名-非商业性-禁止演绎 4.0 国际(CC BY-NC-ND 4.0)

请按协议转载并保留原文链接及作者

哈夫曼编码(1)

当前文章评论暂未开放,请移步至留言处留言。

码到城攻以码平川-码到城攻