Last active
March 24, 2017 16:59
-
-
Save ebongzzang/cb3ede2c925f48750de537e4d93451f5 to your computer and use it in GitHub Desktop.
ebong2's endic crawler — scrapes word definitions from Naver's mobile English dictionary (endic)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef HTMLCRAWLER_H | |
#define HTMLCRAWLER_H | |
#include <iostream> | |
#include <fstream> | |
#include <curl/curl.h> | |
#include <fstream> | |
#include <stdio.h> | |
#include <libxml2/libxml/HTMLparser.h> | |
#include <libxml2/libxml/tree.h> | |
#include <libxml/parser.h> | |
#include <libxml/xpath.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <vector> | |
class HTMLCrawler | |
{ | |
public: | |
HTMLCrawler(const std::string _sourceURL); | |
std::string getHTML(const std::string encode = " "); //return contents of html, default encode = UTF-8 | |
std::string write(); //return filename | |
std::vector<unsigned char *> parse_all(bool isFile, bool printText, const std::string sourceHTML, const std::string Parsetag); | |
//memory = true .html = false | |
unsigned char * parse(bool isFile, bool printText, const std::string sourceHTML, const std::string Parsetag); | |
//memory = true .html = false | |
~HTMLCrawler(); | |
private: | |
CURL *curl; | |
CURLcode res; | |
std::string readBuffer; | |
std::ofstream outputFile; | |
std::string sourceURL; | |
htmlDocPtr doc; | |
}; | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "HTMLCrawler.h" | |
#include <vector> | |
#include <string.h> | |
using CURLOPT_WRITEFUNCTION_PTR = size_t(*)(void * ,size_t, size_t, void*); | |
HTMLCrawler::HTMLCrawler(const std::string _sourceURL) : sourceURL(_sourceURL) | |
{ | |
} | |
std::string HTMLCrawler::getHTML(const std::string encode) | |
{ | |
auto curl_callback = [](void *contents , size_t size, size_t nmemb, void *stream)->size_t | |
{ | |
(reinterpret_cast<std::string *>(stream))->append(reinterpret_cast<char *>(contents),size * nmemb); | |
return size * nmemb; | |
}; | |
curl = curl_easy_init(); | |
if (curl) | |
{ | |
curl_easy_setopt(curl, CURLOPT_URL,sourceURL.c_str()); | |
//curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); | |
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(curl_callback)); | |
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer); | |
curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); //set user-agent | |
// curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING,encode.c_str()); | |
res = curl_easy_perform(curl); | |
if(res == CURLE_OK) | |
{ | |
//something else | |
} | |
else | |
{ | |
std::cout << "curl_easy_perform() failed:" << curl_easy_strerror(res) << std::endl; | |
readBuffer.empty(); | |
} | |
return readBuffer; | |
} | |
return " "; | |
} | |
std::vector<unsigned char *> HTMLCrawler::parse_all(bool isFile,bool printText, const std::string sourceHTML, const std::string Parsetag) | |
{ | |
std::vector<unsigned char *> resultVector; | |
if(isFile) | |
{ | |
doc = htmlReadFile(sourceHTML.c_str(), NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
else | |
{ | |
doc = htmlReadMemory(sourceHTML.c_str(),sourceHTML.size(),NULL,NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
if (doc == NULL) | |
{ | |
fprintf(stderr, "Document not parsed successfully.\n"); | |
resultVector.clear(); | |
return resultVector; | |
} | |
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); | |
//html -> xml DOM | |
xmlChar *xpath = (xmlChar*)Parsetag.c_str(); | |
//xmlChar *xpath = (xmlChar*)"//div[@class='section_card']//ul[@class='desc_lst']//li//p"; | |
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, xpathCtx); | |
xmlNodeSetPtr nodeset; | |
if(xmlXPathNodeSetIsEmpty(result->nodesetval)) | |
{ | |
resultVector.clear(); | |
return resultVector; | |
} | |
nodeset = result->nodesetval; | |
resultVector.reserve(nodeset->nodeNr); | |
for (int i=0; i < nodeset->nodeNr; i++) | |
{ | |
xmlBufferPtr nodeBuffer = xmlBufferCreate(); | |
xmlNodeDump(nodeBuffer,doc,nodeset->nodeTab[i],0,0); | |
if(printText) | |
{ | |
resultVector.push_back(xmlNodeGetContent(nodeset->nodeTab[i])); | |
} | |
else | |
{ | |
resultVector.push_back(nodeBuffer->content); | |
} | |
} | |
return resultVector; | |
} | |
unsigned char * HTMLCrawler::parse(bool isFile, bool printText, const std::string sourceHTML, const std::string Parsetag) | |
{ | |
if(isFile) | |
{ | |
doc = htmlReadFile(sourceHTML.c_str(), NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
else | |
{ | |
doc = htmlReadMemory(sourceHTML.c_str(),sourceHTML.size(),NULL,NULL,HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
if (doc == NULL) | |
{ | |
fprintf(stderr, "Document not parsed successfully.\n"); | |
return (xmlChar *)""; | |
} | |
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); | |
//html -> xml DOM | |
xmlChar *xpath = (xmlChar*)Parsetag.c_str(); | |
//xmlChar *xpath = (xmlChar*)"//div[@class='section_card']//ul[@class='desc_lst']//li//p"; | |
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, xpathCtx); | |
xmlNodeSetPtr nodeset; | |
if(xmlXPathNodeSetIsEmpty(result->nodesetval)) | |
{ | |
std::cout << "no result! " << std::endl; | |
return (xmlChar *)""; | |
} | |
nodeset = result->nodesetval; | |
for (int i=0; i < 1; i++)// Return only one element | |
{ | |
xmlBufferPtr nodeBuffer = xmlBufferCreate(); | |
xmlNodeDump(nodeBuffer,doc,nodeset->nodeTab[i],0,0); | |
if(printText) | |
{ | |
return xmlNodeGetContent(nodeset->nodeTab[i]); | |
} | |
else | |
{ | |
return nodeBuffer->content; | |
} | |
} | |
std::cout << "unexpected error" << std::endl; | |
return (xmlChar *)""; | |
} | |
std::string HTMLCrawler::write() | |
{ | |
std::string filename = sourceURL.substr(9,5)+".xml"; | |
//TODO:: hmm.. | |
outputFile.open(filename,std::ios::out); | |
outputFile << readBuffer; | |
return filename; | |
} | |
// Releases the curl easy handle and closes the output file if it is still
// open.
HTMLCrawler::~HTMLCrawler()
{
    // curl_easy_cleanup() accepts a NULL handle, but `curl` is only safe to
    // pass here once it has been initialized (nullptr or a live handle) —
    // see the constructor.
    curl_easy_cleanup(curl);
    // Guard the close: calling close() on a never-opened stream merely sets
    // failbit, but skipping it is clearer.
    if (outputFile.is_open())
    {
        outputFile.close();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "HTMLCrawler.h"

#include <algorithm>
#include <cctype>
#include <cstring>
#include <iostream>
#include <iterator>
#include <string>
// Trim leading whitespace from s, in place.
// BUG FIX: the original used std::not1/std::ptr_fun, which were removed in
// C++17, and fed raw (possibly negative) char values to std::isspace, which
// is undefined behavior; the lambda with an unsigned char parameter fixes
// both.
static inline void ltrim(std::string &s)
{
    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
                                    [](unsigned char ch) { return !std::isspace(ch); }));
}
// Trim trailing whitespace from s, in place.
// BUG FIX: same as ltrim — std::not1/std::ptr_fun were removed in C++17,
// and std::isspace on a plain char is UB for negative values; use a lambda
// taking unsigned char instead.
static inline void rtrim(std::string &s)
{
    s.erase(std::find_if(s.rbegin(), s.rend(),
                         [](unsigned char ch) { return !std::isspace(ch); }).base(),
            s.end());
}
// trim from both ends (in place) | |
static inline void trim(std::string &s) | |
{ | |
ltrim(s); | |
rtrim(s); | |
} | |
int main(int argc,char ** argv) | |
{ | |
std::string argu(argv[1]); | |
//std::string test = "http://m.endic.naver.com/search.nhn?searchOption=all&query=" + argu.replace(argu.find(" "),1, ""); | |
std::string test = "http://m.endic.naver.com/search.nhn?searchOption=all&query=" + argu; | |
std::string htmlBuffer; | |
std::vector<unsigned char *>::iterator it; | |
HTMLCrawler * crawler = new HTMLCrawler(test); | |
htmlBuffer = crawler->getHTML(); | |
std::string htmlfilename = crawler->write(); | |
auto entryVector = crawler->parse_all(false,false,htmlBuffer,"//div[@class='entry_search_word top']"); | |
for(std::vector<unsigned char *>::iterator it = entryVector.begin(); it != entryVector.end(); ++it) | |
{ | |
auto word = crawler->parse(false,true,std::string(reinterpret_cast<char *>(*it)),"//strong[@class='target']"); | |
std::string wordStr(reinterpret_cast<char *>(word)); | |
trim(wordStr); | |
std::cout << wordStr << std::endl; | |
std::cout << '\n'; | |
auto wordMean = crawler->parse_all(false,true,std::string(reinterpret_cast<char *>(*it)),"//li"); | |
for(std::vector<unsigned char *>::iterator it2 = wordMean.begin(); it2 != wordMean.end(); ++it2) | |
{ | |
std::string temp(reinterpret_cast<char *>(*it2)); | |
trim(temp); | |
std::cout << temp << std::endl; | |
} | |
std::cout << '\n'; | |
auto wordExamStc = crawler->parse(false,false,std::string(reinterpret_cast<char *>(*it)),"//p[@class='example_stc']"); | |
auto wordExamStcVec = crawler->parse_all(false,true,std::string(reinterpret_cast<char *>(wordExamStc)),"//a"); | |
for(std::vector<unsigned char *>::iterator it2 = wordExamStcVec.begin(); it2 != wordExamStcVec.end(); ++it2) | |
{ | |
if(std::string(reinterpret_cast<char *>(*it2)).find("발음듣기") != std::string::npos) | |
break; | |
std::cout << *it2 << " "; | |
} | |
std::cout << '\n'; | |
auto wordExamMean = crawler->parse(false,true,std::string(reinterpret_cast<char *>(*it)),"//p[@class='example_mean']"); | |
std::string temp(reinterpret_cast<char *>(wordExamMean)); | |
trim(temp); | |
std::cout << temp <<std::endl; | |
std::cout <<'\n'; | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment