200字范文,内容丰富有趣,生活中的好帮手!
200字范文 > 利用哈夫曼编码压缩文本

利用哈夫曼编码压缩文本

时间:2021-05-19 06:00:25

相关推荐

利用哈夫曼编码压缩文本

文章目录

使用哈夫曼编码进行压缩文本文本内容读取文件内容至内存中遍历文本内容,获取每个字符对应出现概率值建立哈夫曼树获取哈夫曼编码将转换后的编码写入新文件检测压缩率利用编码文件进行还原文本 完整code

使用哈夫曼编码进行压缩文本

文本内容

Even though neural network-based models have achieved

a lot, there are several important things that have not been

taken into consideration. Firstly, as shown in Figure 1, each

block is represented as a low-dimensional embedding with

manually selected features, which will cause the loss of

much semantic information. Secondly, the order of the nodes

plays an important role in representing binary functions,

while previous approaches did not design methods to extract

it. To solve these two problems, we propose an overall

framework with three components: semantic-aware modeling,

structural-aware modeling, and order-aware modeling.

读取文件内容至内存中

char *read_file(const char *filename)///从文件中读取字符串{FILE *fp = NULL;char buf[256];int len = 0;char *x = new char [10000];*x = '\0';if((fp = fopen(filename, "r"))==NULL){perror("can't open the file");exit(1);}while(fgets(buf, 255, fp) != NULL){len = strlen(buf);//printf("%d", len);if(buf[len-1] == '\n'){buf[len-1] = '\0';}//printf("%s\n", buf);strcat(x, buf);}//printf("%s\n", x);fclose(fp);return x;}

遍历文本内容,获取每个字符对应出现概率值

int samechar(char *x)///该函数将不同字符出现的次数记录下来{int flag;int n=0;tree_huffman[n].c = x[0];tree_huffman[n].w = 1.0;int l=strlen(x);for(int i=1;i<l;i++){int j;flag=0;for(j=0;j<=n;j++){if(x[i]==tree_huffman[j].c){flag=1;tree_huffman[j].w++;break;}}if(!flag){n++;tree_huffman[n].c=x[i];tree_huffman[n].w=1.0;}}return n+1;}

建立哈夫曼树

首先给所有节点的权值由小到大排序;取出最小的两个节点,将它们的权值相加,得到一个新节点使用这个新节点代替两个原来的节点继续排序,替换直到整个队列中只有一个节点这就构成了一棵哈夫曼树

int cmp(Tree a, Tree b){return a.w<b.w;}int create_huffman(int n) ///构建哈夫曼树{int rear = n, font = 0;Tree last;while(rear > font){sort(tree_huffman+font, tree_huffman+rear, cmp);last = tree_huffman[font];last.l = font;font++;if (rear <= font)break;else{Tree t = tree_huffman[font];last.w += t.w;last.c = '\0';last.r = font;last.par = -1;font++;tree_huffman[rear++] = last;}}for(int i=0;i<rear;++i){if(tree_huffman[i].l >= 0){int lchild = tree_huffman[i].l;tree_huffman[lchild].par = i;}if(tree_huffman[i].r >= 0){int rchild = tree_huffman[i].r;tree_huffman[rchild].par = i;}}//for(int i=0;i<rear;++i)//printf("%d, %c,%.2lf,%d,%d,%d\n", i, tree_huffman[i].c, tree_huffman[i].w, tree_huffman[i].l, tree_huffman[i].r, tree_huffman[i].par);return rear;}

获取哈夫曼编码

采用递归获取哈夫曼编码需要注意一个问题:使用char *进行递归时,系统只会给char *开辟一块内存这样造成哈夫曼编码前缀相同解决问题方法:换用string,string每次都开辟新内存来递归;或每次递归时给char *开辟新内存

void output_tree_code(Tree &temp, char *code){if(temp.l == -1 && temp.r == -1){printf("%s\n", code);strcpy(temp.flag, code);return;}char t1[256], t2[256];strcpy(t1, code);strcpy(t2, code);output_tree_code(tree_huffman[temp.l], strcat(t1, "0"));output_tree_code(tree_huffman[temp.r], strcat(t2, "1"));}

将转换后的编码写入新文件

将哈夫曼编码每八位截取,不足8位的后面补零再将单个字节写入二进制文件

void wfile(const char *filename1, const char *filename2, int n){FILE *fp = NULL;fp = fopen(filename2, "wb");char *buf = read_file(filename1);char *temp = new char [10000];temp[0] = '\0';for(int i=0;i<int(strlen(buf));++i){for(int j=0;j<n;++j){if(buf[i] == tree_huffman[j].c){strcat(temp, tree_huffman[j].flag);break;}}}printf("%s\n", temp);int len = 0;if(strlen(temp) % 8 == 0)len = strlen(temp) / 8;else{len = strlen(temp) / 8 + 1;while(strlen(temp) % 8 != 0){strcat(temp, "0");}}for(int i=0;i<len;++i){int a = 0;for(int j=0;j<8;++j){int ca = temp[i*8+j] - '0';a += ca * pow(2, 7-j);//printf("%d\t", ca * pow(2, 8-j));}fwrite(&a, 1, 1, fp);}fclose(fp);}

检测压缩率

查看压缩文件2.huffman的大小 查看原文件大小 342/636 = 53%,也就是说压缩率为47%,压缩了将近一半的大小,就压缩率而言,相当可观。

利用编码文件进行还原文本

读取哈夫曼编码还原文本;从第一个字符开始读,如果为0,则向根节点的左子树走,如果为1,则向根节点的右子树走;每读取一个字符,进行一次移动操作;直到移动至叶子节点,该节点的左右子树均为空,递归结束,输出该节点的字符;继续读取下一个字符;直到所有字符读取完毕;

void unzip_huffman(const char *filename, int root){FILE *fp = NULL;if((fp = fopen(filename, "rb"))==NULL){perror("can't open the file");exit(1);}int cb[10000];int num = 0;while(!feof(fp)){int element = getc(fp);//printf("%d\t", element);if(element != -1){for(int i=0;i<8;++i){int t = element % 2;element = element / 2;cb[num++] = t;}}}int ccb[10000];int xnum = 0;for(int i=0;i<num/8;++i){for(int j=0;j<8;++j){ccb[xnum++] = cb[i*8+7-j];}}fclose(fp);char ss[10000] = "\0";int sr = 0;num = 0;int rt = root;while(num < xnum){while(tree_huffman[rt].l != -1 && tree_huffman[rt].r != -1){if(ccb[num] == 0){rt = tree_huffman[rt].l;}else if(ccb[num] == 1){rt = tree_huffman[rt].r;}num++;}ss[sr++] = tree_huffman[rt].c;//printf("%c", tree_huffman[rt].c);rt = root;}printf("%s\n", ss);}

完整code

#include<iostream>#include<algorithm>#include<cstring>#include<string>#include<stdio.h>#include<stdlib.h>#include<cmath>using namespace std;typedef struct haffman{char c;///标记字符char flag[256]; ///标记哈夫曼编码double w;///标记权值int l, r;///左孩子有孩子int par;///标记是否存在父母节点}Tree;Tree tree_huffman[128]; ///ascii表共128个字符char *read_file(const char *filename)///从文件中读取字符串{FILE *fp = NULL;char buf[256];int len = 0;char *x = new char [10000];*x = '\0';if((fp = fopen(filename, "r"))==NULL){perror("can't open the file");exit(1);}while(fgets(buf, 255, fp) != NULL){len = strlen(buf);//printf("%d", len);if(buf[len-1] == '\n'){buf[len-1] = '\0';}//printf("%s\n", buf);strcat(x, buf);}//printf("%s\n", x);fclose(fp);return x;}int samechar(char *x)///该函数将不同字符出现的次数记录下来{int flag;int n=0;tree_huffman[n].c = x[0];tree_huffman[n].w = 1.0;int l=strlen(x);for(int i=1;i<l;i++){int j;flag=0;for(j=0;j<=n;j++){if(x[i]==tree_huffman[j].c){flag=1;tree_huffman[j].w++;break;}}if(!flag){n++;tree_huffman[n].c=x[i];tree_huffman[n].w=1.0;}}return n+1;}int cmp(Tree a, Tree b){return a.w<b.w;}int create_huffman(int n) ///构建哈夫曼树{int rear = n, font = 0;Tree last;while(rear > font){sort(tree_huffman+font, tree_huffman+rear, cmp);last = tree_huffman[font];last.l = font;font++;if (rear <= font)break;else{Tree t = tree_huffman[font];last.w += t.w;last.c = '\0';last.r = font;last.par = -1;font++;tree_huffman[rear++] = last;}}for(int i=0;i<rear;++i){if(tree_huffman[i].l >= 0){int lchild = tree_huffman[i].l;tree_huffman[lchild].par = i;}if(tree_huffman[i].r >= 0){int rchild = tree_huffman[i].r;tree_huffman[rchild].par = i;}}//for(int i=0;i<rear;++i)//printf("%d, %c,%.2lf,%d,%d,%d\n", i, tree_huffman[i].c, tree_huffman[i].w, tree_huffman[i].l, tree_huffman[i].r, tree_huffman[i].par);return rear;}void output_tree_code(Tree &temp, char *code){if(temp.l == -1 && temp.r == -1){printf("%s\n", code);strcpy(temp.flag, code);return;}char t1[256], t2[256];strcpy(t1, code);strcpy(t2, code);output_tree_code(tree_huffman[temp.l], strcat(t1, "0"));output_tree_code(tree_huffman[temp.r], strcat(t2, "1"));}void wfile(const char *filename1, const char *filename2, int n){FILE *fp = NULL;fp = fopen(filename2, "wb");char *buf = read_file(filename1);char *temp = new char [10000];temp[0] = '\0';for(int i=0;i<int(strlen(buf));++i){for(int j=0;j<n;++j){if(buf[i] == tree_huffman[j].c){//printf("%s\n", tree_huffman[j].flag);strcat(temp, tree_huffman[j].flag);//fwrite(tree_huffman[j].flag, (1.0/8.0)*strlen(tree_huffman[j].flag), 1, fp);break;}}}printf("%s\n", temp);/*for(int i=0;i<8;++i){int len = strlen(temp);if(len % 8 != 0){strcat(temp, "0");}elsebreak;}*///printf("%s\n", temp);int len = 0;if(strlen(temp) % 8 == 0)len = strlen(temp) / 8;else{len = strlen(temp) / 8 + 1;while(strlen(temp) % 8 != 0){strcat(temp, "0");}}for(int i=0;i<len;++i){int a = 0;for(int j=0;j<8;++j){int ca = temp[i*8+j] - '0';a += ca * pow(2, 7-j);//printf("%d\t", ca * pow(2, 8-j));}//printf("\n");//printf("%d\n", a);fwrite(&a, 1, 1, fp);}fclose(fp);}void unzip_huffman(const char *filename, int root){FILE *fp = NULL;if((fp = fopen(filename, "rb"))==NULL){perror("can't open the file");exit(1);}int cb[10000];int num = 0;while(!feof(fp)){int element = getc(fp);//printf("%d\t", element);if(element != -1){for(int i=0;i<8;++i){int t = element % 2;element = element / 2;cb[num++] = t;}}}//printf("\n");//printf("cb:\n");//for(int i=0;i<num;++i)//printf("%d ", cb[i]);//printf("\n");int ccb[10000];int xnum = 0;for(int i=0;i<num/8;++i){for(int j=0;j<8;++j){ccb[xnum++] = cb[i*8+7-j];}}//for(int i=0;i<xnum;++i)//printf("%d ", ccb[i]);fclose(fp);char ss[10000] = "\0";int sr = 0;num = 0;int rt = root;while(num < xnum){while(tree_huffman[rt].l != -1 && tree_huffman[rt].r != -1){if(ccb[num] == 0){rt = tree_huffman[rt].l;}else if(ccb[num] == 1){rt = tree_huffman[rt].r;}num++;}ss[sr++] = tree_huffman[rt].c;//printf("%c", tree_huffman[rt].c);rt = root;}printf("%s\n", ss);}int main(){ios::sync_with_stdio(false);const char *filename = "doc1.txt";char *st = read_file(filename);printf("input file's reading:\n");printf("%s\n", st);int n = samechar(st);printf("the char kinds of %d\n", n);for(int i=0;i<n;++i){//printf("%c", tree_huffman[i].c);tree_huffman[i].w /= strlen(st);tree_huffman[i].l = -1;tree_huffman[i].r = -1;tree_huffman[i].par = -1;memset(tree_huffman[i].flag, 0, sizeof(tree_huffman[i].flag));//printf("%.2lf\n", tree_huffman[i].w);}n = create_huffman(n);char code[256] = "\0";//printf("%d", n);output_tree_code(tree_huffman[n-1], code);for(int i=0;i<n;++i){printf("%d, %c,%.2lf,%d,%d,%d, ", i, tree_huffman[i].c, tree_huffman[i].w, tree_huffman[i].l, tree_huffman[i].r, tree_huffman[i].par);printf("%s\n", tree_huffman[i].flag);//cout<<tree_huffman[i].flag<<endl;}wfile(filename, "2.huffman", n);unzip_huffman("2.huffman", n-1);return 0;}

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。