Created
August 20, 2017 06:51
-
-
Save Arathi/faf796c6045294cf8a0615842afbd989 to your computer and use it in GitHub Desktop.
C++的URL转换
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <cstring> | |
using namespace std; | |
// Charset identifiers returned by get_charset().
const int CHARSET_NULL = 0;               // non-ASCII bytes present, but no known pattern matched
const int CHARSET_ANSI = 1252;            // ISO-8859-1
const int CHARSET_ASCII = CHARSET_ANSI;
const int CHARSET_GBK = 936;
const int CHARSET_UTF8 = 65001;
const int CHARSET_MIXED = 65536;          // reserved: both UTF-8 and GBK sequences present

// Expected high bits of a 3-byte UTF-8 lead byte (1110xxxx).
const int UTF8_3BYTES_PREFIX_MARK = 0xE0;
// Expected high bits of a UTF-8 continuation byte (10xxxxxx).
const int UTF8_OTHER_BYTE_MASK = 0x80;

// True when the three bytes match the bit pattern 1110xxxx 10xxxxxx 10xxxxxx.
// Every mask operation is parenthesised: `==` binds tighter than `&`.
static bool is_utf8_3byte_sequence(unsigned char lead, unsigned char second, unsigned char third)
{
    return (lead & 0xF0) == UTF8_3BYTES_PREFIX_MARK &&
           (second & 0xC0) == UTF8_OTHER_BYTE_MASK &&
           (third & 0xC0) == UTF8_OTHER_BYTE_MASK;
}

// True when (lead, trail) falls inside one of the GB2312/GBK double-byte ranges.
static bool is_gbk_double_byte(unsigned char lead, unsigned char trail)
{
    if (lead < 0x81 || lead > 0xFE)
    {
        return false;
    }
    const bool trailHigh = (trail >= 0xA1 && trail <= 0xFE);
    const bool trailLow  = (trail >= 0x40 && trail <= 0xA0 && trail != 0x7F);
    const bool trailAny  = (trail >= 0x40 && trail <= 0xFE && trail != 0x7F);

    return (lead >= 0xA1 && lead <= 0xA9 && trailHigh) ||
           (lead >= 0xB0 && lead <= 0xF7 && trailHigh) ||
           (lead >= 0x81 && lead <= 0xA0 && trailAny)  ||
           (lead >= 0xAA && lead <= 0xFE && trailLow)  ||
           (lead >= 0xA8 && lead <= 0xA9 && trailLow);
}

/**
 * Guesses the character set of a NUL-terminated byte string.
 *
 * The string is scanned from the start; the first non-ASCII byte that begins a
 * recognisable sequence decides the result. A 3-byte UTF-8 pattern is tried
 * before a GBK double-byte pattern.
 *
 * @param origin NUL-terminated input; must not be null.
 * @return CHARSET_ANSI when every byte is below 0x80 (including the empty
 *         string), CHARSET_UTF8 or CHARSET_GBK for the first pattern found,
 *         or CHARSET_NULL when non-ASCII bytes exist but none matched.
 *         CHARSET_MIXED is never produced, because scanning stops at the
 *         first match.
 */
int get_charset(const char *origin)
{
    const std::size_t length = std::strlen(origin);
    bool sawHighByte = false;

    for (std::size_t index = 0; index < length; index++)
    {
        const unsigned char lead = static_cast<unsigned char>(origin[index]);
        if (lead < 0x80)
        {
            continue;
        }
        sawHighByte = true;

        // Possibly UTF-8: inspect the next two bytes, but only if both exist.
        if (index + 2 < length &&
            is_utf8_3byte_sequence(lead,
                                   static_cast<unsigned char>(origin[index + 1]),
                                   static_cast<unsigned char>(origin[index + 2])))
        {
            return CHARSET_UTF8;
        }

        // Possibly GB2312 or GBK: inspect the second byte, if there is one.
        if (index + 1 < length &&
            is_gbk_double_byte(lead, static_cast<unsigned char>(origin[index + 1])))
        {
            return CHARSET_GBK;
        }
    }

    return sawHighByte ? CHARSET_NULL : CHARSET_ANSI;
}
/**
 * Placeholder for a GBK-specific URL encoder.
 *
 * No encoding is implemented yet, so the result is always an empty string.
 * The explicit return is required: flowing off the end of a function that
 * returns a value is undefined behaviour.
 *
 * @param origin NUL-terminated input (currently unused).
 * @return an empty string.
 */
std::string url_encoding_gbk(const char *origin)
{
    (void)origin; // not used until the encoder is implemented
    std::string encoded = "";
    return encoded;
}
/**
 * Formats one byte as exactly two upper-case hexadecimal digits.
 *
 * @param d the byte to format.
 * @return a two-character string, high nibble first (e.g. 0x3A -> "3A").
 */
std::string to_hex(unsigned char d)
{
    static const char kDigits[] = "0123456789ABCDEF";

    std::string text;
    text += kDigits[d / 16]; // high nibble
    text += kDigits[d % 16]; // low nibble
    return text;
}
int to_urlcode(unsigned char c, string& urlcode, bool skip, int charset) | |
{ | |
int codeLenght = 1; | |
if (skip) | |
{ | |
urlcode = "%" + to_hex(c); | |
} | |
else | |
{ | |
if (c >= 0x20 && c<= 0x2F) | |
{ | |
urlcode = "%" + to_hex(c); | |
} | |
else if (c == 0x3A) | |
{ | |
urlcode = "%3A"; | |
} | |
else if (c == 0x3B) | |
{ | |
urlcode = "%3B"; | |
} | |
else if (c == 0x3D) | |
{ | |
urlcode = "%3D"; | |
} | |
else if (c == 0x3F) | |
{ | |
urlcode = "%3F"; | |
} | |
else if (c == 0x40) | |
{ | |
urlcode = "%40"; | |
} | |
else if (c == 0x5B) | |
{ | |
urlcode = "%5B"; | |
} | |
else if (c == 0x5C) | |
{ | |
urlcode = "%5C"; | |
} | |
else if (c == 0x5D) | |
{ | |
urlcode = "%5D"; | |
} | |
else if (c >= 0x80) | |
{ | |
urlcode = "%" + to_hex(c); | |
if (charset == CHARSET_GBK) | |
{ | |
codeLenght = 2; | |
} | |
else if (charset == CHARSET_UTF8) | |
{ | |
codeLenght = 3; | |
} | |
if (skip) | |
{ | |
codeLenght = 0; | |
} | |
} | |
else | |
{ | |
urlcode = c; | |
} | |
} | |
return codeLenght; | |
} | |
string url_encoding(const char *origin) | |
{ | |
cout << "Charset: "; | |
string encoded = ""; | |
int length = strlen(origin); | |
int charset = get_charset(origin); | |
if ( charset == CHARSET_ASCII ) | |
{ | |
cout << "ASCII" << endl; | |
} | |
else if ( charset == CHARSET_UTF8 ) | |
{ | |
cout << "UTF-8" << endl; | |
} | |
else if ( charset == CHARSET_GBK ) | |
{ | |
cout << "GBK" << endl; | |
} | |
else if ( charset == CHARSET_MIXED ) | |
{ | |
cout << "混合编码" << endl; | |
//TODO 混合编码暂时按照算UTF-8算 | |
charset = CHARSET_UTF8; | |
} | |
else | |
{ | |
cout << "无效的编码!" << charset << endl; | |
} | |
int skipLeft = 0; | |
for (int index=0; index<length; index++) | |
{ | |
string encodedPart = ""; | |
int skipAddition = to_urlcode(origin[index], encodedPart, skipLeft>0, charset) - 1; | |
cout << "0x" << hex << (unsigned int)(origin[index] & 0xFF) << " : " << encodedPart << " " << skipAddition << endl; | |
encoded += encodedPart; | |
if (skipLeft>0) skipLeft--; | |
skipLeft += skipAddition; | |
} | |
return encoded; | |
} | |
int main() | |
{ | |
string url = "https://www.google.com/search?num=50&newwindow=1&safe=strict&q=urlencoding 特殊字符昇&oq=urlencoding 特殊字符昇&gs_l=serp.3...222.3060.0.3195.16.9.0.0.0.0.586.586.5-1.1.0....0...1c.1j4.64.serp..15.1.584.8KYDVu9BU3Q"; | |
url = "urlencoding 特殊字符昇&oq"; | |
cout << url_encoding(url.c_str()) << endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment