Skip to content

Instantly share code, notes, and snippets.

@Answeror
Created July 3, 2012 00:26
Show Gist options
  • Save Answeror/3036611 to your computer and use it in GitHub Desktop.
Save Answeror/3036611 to your computer and use it in GitHub Desktop.
// Extract URLs from text using standard-library algorithms (用库算法提取URL)
#include <algorithm>
#include <cctype>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
using namespace std;
typedef string::const_iterator iter;
// Reports whether `ch` may appear inside a URL.
//
// A character qualifies when it is alphanumeric (per the current C locale)
// or is one of the punctuation characters listed in `urlcs`.
//
// @param ch  character to classify
// @return    true if `ch` is treated as part of a URL, false otherwise
bool isurlch(char ch)
{
    // Punctuation accepted in a URL in addition to letters and digits.
    static const std::string urlcs = "~;/?:@=&$-_.+!*'{},";

    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF). A plain `char` holding a non-ASCII byte can be negative, which
    // would be undefined behaviour, so convert before classifying.
    const unsigned char uch = static_cast<unsigned char>(ch);

    return std::isalnum(uch) != 0 || urlcs.find(ch) != std::string::npos;
}
// Finds where the URL that starts at `begin` stops.
//
// @param begin  first character of the candidate URL
// @param end    end of the text being scanned
// @return       iterator to the first character in [begin, end) for which
//               isurlch() is false, or `end` if every character qualifies
iter url_end(iter begin, iter end)
{
    iter cursor = begin;
    // Advance while we are still inside the text and on a URL character.
    while (cursor != end && isurlch(*cursor))
    {
        ++cursor;
    }
    return cursor;
}
// Locates the start of the first URL in [begin, end).
//
// A URL is recognised as: one or more alphabetic characters (the scheme),
// followed by "://", followed by at least one character accepted by
// isurlch().
//
// @param begin  start of the text to scan
// @param end    end of the text to scan
// @return       iterator to the first letter of the scheme of the first URL
//               found, or `end` if the range contains no URL
iter url_begin(iter begin, iter end)
{
    static const std::string sep = "://";
    typedef std::string::const_reverse_iterator riter;

    // Wrap isalpha in a lambda for two reasons:
    //  * the <cctype> classifiers need an unsigned-char value; a negative
    //    plain `char` (non-ASCII byte) would be undefined behaviour;
    //  * the bare name `isalpha` is an overload set once the <locale> template
    //    is visible, which can make predicate deduction fail.
    const auto is_alpha = [](char ch) {
        return std::isalpha(static_cast<unsigned char>(ch)) != 0;
    };

    iter i = begin;
    while ((i = std::search(i, end, sep.begin(), sep.end())) != end)
    {
        // `search` succeeded, so the whole separator lies inside [i, end).
        const iter after = i + sep.size();

        // Need at least one character before the separator (the scheme) and
        // at least one URL character after it.
        if (i != begin && after != end && isurlch(*after))
        {
            // Walk backwards from the separator over the run of letters that
            // forms the scheme; base() converts back to a forward iterator
            // pointing at the first letter of that run.
            const iter scheme = std::find_if_not(riter(i), riter(begin), is_alpha).base();
            if (scheme != i) return scheme;
        }

        // Not a URL here: resume the search just past this separator.
        i = after;
    }
    return end;
}
vector<string> extract_url(const string &text)
{
vector<string> urls;
iter begin = text.begin();
while (begin != text.end())
{
begin = url_begin(begin, text.end());
if (begin != text.end())
{
iter end = url_end(begin, text.end());
urls.push_back(string(begin, end));
begin = end;
}
}
return urls;
}
// Reads the file "in.txt" from the current directory, extracts every URL it
// contains and prints them to standard output, one per line.
//
// @return 0 on success, 1 if "in.txt" cannot be opened
int main()
{
    std::ifstream ifs("in.txt");
    if (!ifs)
    {
        // Previously a missing or unreadable file silently produced no output.
        std::cerr << "error: cannot open in.txt\n";
        return 1;
    }

    // Rebuild the file contents with a '\n' after every line (including the
    // last one), so URLs on adjacent lines stay separated.
    std::string text;
    std::string line;
    while (std::getline(ifs, line))
    {
        text += line;
        text += '\n';
    }

    const auto urls = extract_url(text);
    std::copy(urls.begin(), urls.end(), std::ostream_iterator<std::string>(std::cout, "\n"));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment