#include "convert_uni_and_utf.h"

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Conversion between BOM-prefixed UTF-16 ("Unicode" in Windows Notepad
 * terms) and BOM-prefixed UTF-8, plus a small file-based test driver.
 *
 * The constants LITTLE_ENDIAN, BIG_ENDIAN, UTF_7, UTF_8, CONVERT_ONE,
 * CONVERT_TWO, CONVERT_THREE, CONVERT_FAIL, UNICODE_FILE and UTF_8_FILE
 * come from convert_uni_and_utf.h.
 */

/****************** UTF-16 -> UTF-8 conversion: begin ******************/

/*
 * Prepare a UTF output buffer: zero the first i*2 bytes, then store the
 * UTF-8 byte order mark (EF BB BF) at the front.
 *
 * q : output buffer; must hold at least max(i*2, 3) bytes.
 * i : size in bytes of the input that is about to be converted.
 *
 * Returns a pointer to the first byte after the BOM.
 */
unsigned char * init_utf(unsigned char *q, int i)
{
    /* A non-positive size must not reach memset as a huge size_t. */
    if (i > 0) {
        memset(q, 0x00, (size_t)i * 2);
    }

    q[0] = 0xEF;
    q[1] = 0xBB;
    q[2] = 0xBF;

    return q + 3;
}

/*
 * Inspect the first two bytes of a UTF-16 buffer.
 *
 * Returns LITTLE_ENDIAN when they are FF FE; anything else (including a
 * missing BOM) is reported as BIG_ENDIAN.  The caller must guarantee that
 * two bytes are readable.
 */
int confirm_unicode(unsigned char *p)
{
    if (p[0] == 0xFF && p[1] == 0xFE) {
        return LITTLE_ENDIAN;
    }
    return BIG_ENDIAN;
}

/*
 * Combine the two bytes at p into one 16-bit code unit.
 *
 * flag : LITTLE_ENDIAN -> p[0] is the low byte;
 *        any other value -> p[0] is the high byte.
 */
unsigned short make_syllable(unsigned char *p, int flag)
{
    unsigned int high;
    unsigned int low;

    if (flag == LITTLE_ENDIAN) {
        low = p[0];
        high = p[1];
    } else {
        high = p[0];
        low = p[1];
    }

    return (unsigned short)((high << 8) | low);
}

/*
 * Encode one 16-bit code unit k at q.
 *
 * flag only matters for k == 0x0000:
 *   UTF_7 -> the two-byte form C0 80 (the overlong NUL also used by
 *            "modified UTF-8"),
 *   UTF_8 -> a single 00 byte,
 *   other -> nothing is written and CONVERT_FAIL is returned.
 *
 * Returns CONVERT_ONE, CONVERT_TWO or CONVERT_THREE according to the
 * number of bytes stored.  Bytes are assigned outright, so the result
 * does not depend on the buffer having been zeroed beforehand.
 */
int make_utf(unsigned char *q, unsigned short k, int flag)
{
    if (k == 0x0000) {
        if (flag == UTF_7) {
            q[0] = 0xC0;
            q[1] = 0x80;
            return CONVERT_TWO;
        }
        if (flag == UTF_8) {
            q[0] = 0x00;
            return CONVERT_ONE;
        }
        return CONVERT_FAIL;
    }

    /* 0x0001 .. 0x007F : one byte */
    if (k <= 0x007F) {
        q[0] = (unsigned char)k;
        return CONVERT_ONE;
    }

    /* 0x0080 .. 0x07FF : two bytes */
    if (k <= 0x07FF) {
        q[0] = (unsigned char)(0xC0 | ((k >> 6) & 0x1F));
        q[1] = (unsigned char)(0x80 | (k & 0x3F));
        return CONVERT_TWO;
    }

    /* 0x0800 .. 0xFFFF : three bytes */
    q[0] = (unsigned char)(0xE0 | ((k >> 12) & 0x0F));
    q[1] = (unsigned char)(0x80 | ((k >> 6) & 0x3F));
    q[2] = (unsigned char)(0x80 | (k & 0x3F));
    return CONVERT_THREE;
}

/*
 * int uni2utf(unsigned char *unicode, unsigned char *utf, int i, int flag)
 *
 * unicode : buffer holding UTF-16 text.  Its first two bytes are always
 *           treated as the byte order mark and are never converted.
 * utf     : output buffer; must hold at least max(i*2, 3) bytes.
 * i       : size of `unicode` in bytes.
 * flag    : UTF_7 or UTF_8; only selects how U+0000 is encoded
 *           (see make_utf).
 *
 * A high surrogate followed by a low surrogate is written as a single
 * 4-byte UTF-8 sequence.  An unpaired surrogate is passed to make_utf
 * like any other code unit.  A trailing odd byte is ignored.
 *
 * Returns the number of bytes written to `utf`, BOM included.
 */
int uni2utf(unsigned char *unicode, unsigned char *utf, int i, int flag)
{
    unsigned char *p = unicode;
    unsigned char *q = init_utf(utf, i);
    int order;
    int t;

    /* Not even a BOM to look at: the output is just the UTF-8 BOM. */
    if (i < 2) {
        return (int)(q - utf);
    }

    order = confirm_unicode(p);

    /* "t + 1 < i" keeps both bytes of the unit inside the input. */
    for (t = 2; t + 1 < i; t += 2) {
        unsigned short k = make_syllable(p + t, order);
        int j;

        if (k >= 0xD800 && k <= 0xDBFF && t + 3 < i) {
            unsigned short low = make_syllable(p + t + 2, order);

            if (low >= 0xDC00 && low <= 0xDFFF) {
                unsigned long cp = 0x10000UL
                                 + ((unsigned long)(k - 0xD800) << 10)
                                 + (unsigned long)(low - 0xDC00);

                q[0] = (unsigned char)(0xF0 | (cp >> 18));
                q[1] = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
                q[2] = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
                q[3] = (unsigned char)(0x80 | (cp & 0x3F));
                q += 4;
                t += 2;     /* the low surrogate has been consumed too */
                continue;
            }
        }

        j = make_utf(q, k, flag);
        if (j != CONVERT_FAIL) {
            q += j;
        }
    }

    return (int)(q - utf);
}

/****************** UTF-16 -> UTF-8 conversion: end ******************/

/****************** UTF-8 -> UTF-16 conversion: begin ******************/

/*
 * Prepare a UTF-16 output buffer: zero the first i*2 bytes, then store
 * the little-endian byte order mark FF FE at the front.  (FE FF would
 * announce big-endian output instead.)
 *
 * q : output buffer; must hold at least max(i*2, 2) bytes.
 * i : size in bytes of the input that is about to be converted.
 *
 * Returns a pointer to the first byte after the BOM.
 */
unsigned char * init_uni(unsigned char *q, int i)
{
    /* A non-positive size must not reach memset as a huge size_t. */
    if (i > 0) {
        memset(q, 0x00, (size_t)i * 2);
    }

    q[0] = 0xFF;
    q[1] = 0xFE;

    return q + 2;
}

/*
 * Skip a leading UTF-8 byte order mark (EF BB BF) when one is present.
 *
 * p : start of the UTF-8 data.
 * i : in/out, number of bytes available at p; reduced by 3 when the BOM
 *     is skipped.
 *
 * Returns the address of the first byte to convert.
 */
unsigned char * confirm_utf(unsigned char *p, int *i)
{
    /* Only look at three bytes when three bytes actually exist. */
    if (*i >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
        p += 3;
        *i -= 3;
    }
    return p;
}

/* Store one 16-bit code unit at q, low byte first; returns q + 2. */
static unsigned char * store_utf16le_unit(unsigned char *q, unsigned int unit)
{
    q[0] = (unsigned char)(unit & 0xFF);
    q[1] = (unsigned char)((unit >> 8) & 0xFF);
    return q + 2;
}

/*
 * int utf2uni(unsigned char *utf, unsigned char *uni, int i)
 *
 * utf : buffer holding UTF-8 text, with or without a leading BOM.
 * uni : output buffer; must hold at least i*2 + 2 bytes (two output
 *       bytes per input byte in the worst case, plus the BOM).
 * i   : size of `utf` in bytes.
 *
 * The output is little-endian UTF-16 starting with FF FE.  The lead byte
 * decides whether 1, 2, 3 or 4 input bytes form one character; 4-byte
 * sequences become a surrogate pair.  A byte that cannot start a
 * sequence, a sequence cut off by the end of the input, and a 4-byte
 * sequence outside U+10000..U+10FFFF each produce U+FFFD, so the loop
 * always advances.  Continuation bytes are masked, not validated.
 *
 * Returns the number of bytes written to `uni`, BOM included.
 */
int utf2uni(unsigned char *utf, unsigned char *uni, int i)
{
    unsigned char *p = utf;
    unsigned char *q = init_uni(uni, i);
    int t = 0;

    p = confirm_utf(p, &i);

    while (t < i) {
        unsigned int lead = *p;
        int remaining = i - t;
        int used = 1;
        unsigned long cp = 0xFFFDUL;

        if ((lead & 0x80) == 0x00) {
            /* 0xxxxxxx : one byte */
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0 && remaining >= 2) {
            /* 110xxxxx 10xxxxxx : two bytes */
            cp = ((unsigned long)(lead & 0x1F) << 6)
               | (unsigned long)(p[1] & 0x3F);
            used = 2;
        } else if ((lead & 0xF0) == 0xE0 && remaining >= 3) {
            /* 1110xxxx 10xxxxxx 10xxxxxx : three bytes */
            cp = ((unsigned long)(lead & 0x0F) << 12)
               | ((unsigned long)(p[1] & 0x3F) << 6)
               | (unsigned long)(p[2] & 0x3F);
            used = 3;
        } else if ((lead & 0xF8) == 0xF0 && remaining >= 4) {
            /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx : four bytes */
            cp = ((unsigned long)(lead & 0x07) << 18)
               | ((unsigned long)(p[1] & 0x3F) << 12)
               | ((unsigned long)(p[2] & 0x3F) << 6)
               | (unsigned long)(p[3] & 0x3F);
            used = 4;
            if (cp < 0x10000UL || cp > 0x10FFFFUL) {
                cp = 0xFFFDUL;
            }
        }
        /* Anything else keeps cp == U+FFFD and consumes a single byte. */

        if (cp >= 0x10000UL) {
            unsigned long v = cp - 0x10000UL;

            q = store_utf16le_unit(q, (unsigned int)(0xD800UL + (v >> 10)));
            q = store_utf16le_unit(q, (unsigned int)(0xDC00UL + (v & 0x3FFUL)));
        } else {
            q = store_utf16le_unit(q, (unsigned int)cp);
        }

        p += used;
        t += used;
    }

    return (int)(q - uni);
}

/****************** UTF-8 -> UTF-16 conversion: end ******************/

/*
 * Not related to the conversion itself: size of a file in bytes.
 *
 * Returns the size, or -1 when the file cannot be opened or measured.
 */
long fileSize(const char *file)
{
    long len = -1L;
    FILE *fp = fopen(file, "rb");

    if (fp == NULL) {
        return -1L;
    }

    if (fseek(fp, 0L, SEEK_END) == 0) {
        len = ftell(fp);    /* ftell itself reports failure as -1 */
    }
    fclose(fp);

    return len;
}

/*
 * Convert src_path into dst_path.
 *
 * to_utf : non-zero -> UTF-16 to UTF-8 (uni2utf with UTF_8),
 *          zero     -> UTF-8 to UTF-16 (utf2uni).
 *
 * Returns 0 on success and -1 on failure, after printing a message.
 */
static int convert_file(const char *src_path, const char *dst_path, int to_utf)
{
    int rc = -1;
    unsigned char *buffer = NULL;
    unsigned char *tmp_buffer = NULL;
    FILE *in;
    FILE *out;
    long size;
    int i;
    int t;

    size = fileSize(src_path);
    if (size < 0) {
        printf("file open fail\n");
        return -1;
    }
    if (size > (INT_MAX - 4) / 2) {
        printf("file too large\n");
        return -1;
    }
    i = (int)size;

    /*
     * +1 keeps the request non-zero for an empty file.  2*i + 4 covers
     * both directions: uni2utf needs max(2*i, 3), utf2uni needs 2*i + 2.
     */
    buffer = malloc((size_t)i + 1);
    tmp_buffer = malloc((size_t)i * 2 + 4);
    if (buffer == NULL || tmp_buffer == NULL) {
        printf("memory allocation fail\n");
        goto out;
    }

    /* Binary mode: text mode could alter 0x0D/0x0A bytes. */
    in = fopen(src_path, "rb");
    if (in == NULL) {
        printf("file open fail\n");
        goto out;
    }
    i = (int)fread(buffer, sizeof(unsigned char), (size_t)i, in);
    fclose(in);

    if (to_utf) {
        t = uni2utf(buffer, tmp_buffer, i, UTF_8);
    } else {
        t = utf2uni(buffer, tmp_buffer, i);
    }

    out = fopen(dst_path, "wb");
    if (out == NULL) {
        printf("file open fail\n");
        goto out;
    }
    if (fwrite(tmp_buffer, sizeof(unsigned char), (size_t)t, out) != (size_t)t) {
        printf("file write fail\n");
        fclose(out);
        goto out;
    }
    if (fclose(out) != 0) {
        printf("file write fail\n");
        goto out;
    }

    rc = 0;

out:
    free(tmp_buffer);
    free(buffer);
    return rc;
}

/*
 * Test driver.  The input files were saved with the "Unicode" and
 * "UTF-8" encodings of Windows 2000 Notepad and are read back here:
 *   UNICODE_FILE -> tmp.txt   (UTF-16 to UTF-8)
 *   UTF_8_FILE   -> tmp1.txt  (UTF-8 to UTF-16)
 * The second conversion is skipped when the first one fails.
 */
int main(void)
{
    if (convert_file(UNICODE_FILE, "tmp.txt", 1) != 0) {
        return EXIT_FAILURE;
    }

    if (convert_file(UTF_8_FILE, "tmp1.txt", 0) != 0) {
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}