Skip to content

Instantly share code, notes, and snippets.

@Arathi
Created August 20, 2017 06:51
Show Gist options
  • Save Arathi/faf796c6045294cf8a0615842afbd989 to your computer and use it in GitHub Desktop.
Save Arathi/faf796c6045294cf8a0615842afbd989 to your computer and use it in GitHub Desktop.
C++的URL转换
#include <iostream>
#include <string>
#include <cstring>
using namespace std;
// Charset identifiers returned by get_charset(). The values 1252, 936 and 65001
// look like Windows code page numbers (TODO confirm); 0 and 65536 are sentinels
// that only have meaning inside this file.
const int CHARSET_NULL = 0;             // bytes >= 0x80 exist but no known pattern matched
const int CHARSET_ANSI = 1252;          // ISO-8859-1; reported when every byte is below 0x80
const int CHARSET_ASCII = CHARSET_ANSI; // alias of CHARSET_ANSI
const int CHARSET_GBK = 936;
const int CHARSET_UTF8 = 65001;
const int CHARSET_MIXED = 65536;        // reserved for "UTF-8 and GBK both seen"; get_charset() never produces it
// Expected bit patterns of the bytes in a 3-byte UTF-8 sequence.
const int UTF8_3BYTES_PREFIX_MARK = 0xE0; // lead byte is 1110xxxx: (b & 0xF0) == 0xE0
const int UTF8_OTHER_BYTE_MASK = 0x80;    // continuation byte is 10xxxxxx: (b & 0xC0) == 0x80

// Returns true when the `remaining` readable bytes starting at `p` begin with a
// 3-byte UTF-8 sequence (one 1110xxxx lead byte and two 10xxxxxx continuation bytes).
static bool is_utf8_3byte_sequence(const unsigned char *p, std::size_t remaining)
{
    if (remaining < 3)
    {
        return false; // the sequence would run past the end of the string
    }
    // `==` binds tighter than `&`, so every masked comparison needs its own parentheses.
    return ((p[0] & 0xF0) == UTF8_3BYTES_PREFIX_MARK) &&
           ((p[1] & 0xC0) == UTF8_OTHER_BYTE_MASK) &&
           ((p[2] & 0xC0) == UTF8_OTHER_BYTE_MASK);
}

// Returns true when (lead, trail) falls into one of the GB2312 / GBK double-byte ranges.
static bool is_gbk_pair(unsigned char lead, unsigned char trail)
{
    const bool trailHigh = (trail >= 0xA1 && trail <= 0xFE);
    const bool trailLow = (trail >= 0x40 && trail <= 0xA0 && trail != 0x7F);
    const bool trailAny = (trail >= 0x40 && trail <= 0xFE && trail != 0x7F);
    return (lead >= 0xA1 && lead <= 0xA9 && trailHigh) ||
           (lead >= 0xB0 && lead <= 0xF7 && trailHigh) ||
           (lead >= 0x81 && lead <= 0xA0 && trailAny) ||
           (lead >= 0xAA && lead <= 0xFE && trailLow) ||
           (lead >= 0xA8 && lead <= 0xA9 && trailLow);
}

// Guesses the charset of a NUL-terminated byte string.
//
// The string is scanned from the start and the first byte >= 0x80 that begins a
// recognisable sequence decides the result; the UTF-8 test runs before the GBK
// test because many UTF-8 sequences also look like GBK pairs.
//
// Returns:
//   CHARSET_ANSI  - no byte >= 0x80 (this includes the empty string)
//   CHARSET_UTF8  - a 3-byte UTF-8 sequence was found first
//   CHARSET_GBK   - a GB2312/GBK double-byte pair was found first
//   CHARSET_NULL  - `origin` is null, or bytes >= 0x80 exist but none matched
int get_charset(const char *origin)
{
    if (!origin)
    {
        return CHARSET_NULL;
    }
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(origin);
    const std::size_t length = std::strlen(origin);
    bool sawHighByte = false;
    for (std::size_t i = 0; i < length; ++i)
    {
        if (bytes[i] < 0x80)
        {
            continue;
        }
        sawHighByte = true;
        const std::size_t remaining = length - i;
        if (is_utf8_3byte_sequence(bytes + i, remaining))
        {
            return CHARSET_UTF8;
        }
        // A GBK character needs a trail byte, so the lead must not be the last byte.
        if (remaining >= 2 && is_gbk_pair(bytes[i], bytes[i + 1]))
        {
            return CHARSET_GBK;
        }
    }
    return sawHighByte ? CHARSET_NULL : CHARSET_ANSI;
}
// Placeholder for a GBK-specific URL encoder; the encoding itself is not
// implemented yet.
//
// origin: NUL-terminated input string (currently ignored).
// Returns: always an empty string. The explicit return matters: flowing off the
// end of a function that returns a value is undefined behaviour.
std::string url_encoding_gbk(const char *origin)
{
    (void)origin; // unused until the encoder is implemented
    std::string encoded = "";
    return encoded;
}
// Formats one byte as exactly two upper-case hexadecimal digits,
// e.g. 0x3A -> "3A", 0x05 -> "05".
string to_hex(unsigned char d)
{
    static const char kDigits[] = "0123456789ABCDEF";
    string text;
    text += kDigits[d >> 4];   // high nibble
    text += kDigits[d & 0x0F]; // low nibble
    return text;
}
// Converts one byte into its URL-encoded text.
//
// c       - the byte to encode.
// urlcode - receives either "%XX" (upper-case hex) or the byte itself.
// skip    - true when `c` is a trailing byte of a multi-byte character whose
//           lead byte was already handled; such bytes are always written as "%XX".
// charset - CHARSET_GBK or CHARSET_UTF8 selects how many bytes a lead byte
//           announces; any other value counts every byte as a single character.
//
// Returns the number of bytes occupied by the character that starts at `c`
// (1 for single bytes and for skipped trailing bytes). The caller is expected
// to pass skip = true for the following (return value - 1) bytes.
//
// Bytes written as "%XX": 0x20-0x2F, ':', ';', '=', '?', '@', '[', '\', ']'
// and everything >= 0x80. All other bytes are copied through unchanged.
int to_urlcode(unsigned char c, std::string& urlcode, bool skip, int charset)
{
    if (skip)
    {
        // Trailing bytes may fall in the ASCII range (e.g. GBK trail bytes
        // 0x40-0x7E), so they are escaped unconditionally.
        urlcode = "%" + to_hex(c);
        return 1;
    }

    if (c >= 0x80)
    {
        urlcode = "%" + to_hex(c);
        if (charset == CHARSET_GBK)
        {
            // Only 0x81-0xFE can start a GBK double-byte character.
            return (c >= 0x81 && c <= 0xFE) ? 2 : 1;
        }
        if (charset == CHARSET_UTF8)
        {
            // The lead byte announces the sequence length.
            if (c >= 0xC0 && c <= 0xDF) return 2;
            if (c >= 0xE0 && c <= 0xEF) return 3;
            if (c >= 0xF0 && c <= 0xF7) return 4;
            return 1; // stray continuation byte or invalid lead byte
        }
        return 1;
    }

    bool mustEscape = (c >= 0x20 && c <= 0x2F);
    switch (c)
    {
    case 0x3A: // ':'
    case 0x3B: // ';'
    case 0x3D: // '='
    case 0x3F: // '?'
    case 0x40: // '@'
    case 0x5B: // '['
    case 0x5C: // '\'
    case 0x5D: // ']'
        mustEscape = true;
        break;
    default:
        break;
    }

    if (mustEscape)
    {
        urlcode = "%" + to_hex(c);
    }
    else
    {
        urlcode.assign(1, static_cast<char>(c));
    }
    return 1;
}
string url_encoding(const char *origin)
{
cout << "Charset: ";
string encoded = "";
int length = strlen(origin);
int charset = get_charset(origin);
if ( charset == CHARSET_ASCII )
{
cout << "ASCII" << endl;
}
else if ( charset == CHARSET_UTF8 )
{
cout << "UTF-8" << endl;
}
else if ( charset == CHARSET_GBK )
{
cout << "GBK" << endl;
}
else if ( charset == CHARSET_MIXED )
{
cout << "混合编码" << endl;
//TODO 混合编码暂时按照算UTF-8算
charset = CHARSET_UTF8;
}
else
{
cout << "无效的编码!" << charset << endl;
}
int skipLeft = 0;
for (int index=0; index<length; index++)
{
string encodedPart = "";
int skipAddition = to_urlcode(origin[index], encodedPart, skipLeft>0, charset) - 1;
cout << "0x" << hex << (unsigned int)(origin[index] & 0xFF) << " : " << encodedPart << " " << skipAddition << endl;
encoded += encodedPart;
if (skipLeft>0) skipLeft--;
skipLeft += skipAddition;
}
return encoded;
}
// Demo driver: URL-encodes a short sample containing CJK characters and prints
// the result (url_encoding() also prints its own diagnostics).
int main()
{
    // Complete search URL kept as a reference sample; it is not encoded.
    const string fullQuery = "https://www.google.com/search?num=50&newwindow=1&safe=strict&q=urlencoding 特殊字符昇&oq=urlencoding 特殊字符昇&gs_l=serp.3...222.3060.0.3195.16.9.0.0.0.0.586.586.5-1.1.0....0...1c.1j4.64.serp..15.1.584.8KYDVu9BU3Q";
    // Only this short excerpt is actually encoded.
    const string excerpt = "urlencoding 特殊字符昇&oq";
    const string result = url_encoding(excerpt.c_str());
    cout << result << endl;
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment