ebong2's endic Crawler — scrapes headwords, meanings, and example sentences from Naver's mobile English dictionary (m.endic.naver.com).
// HTMLCrawler.h
#ifndef HTMLCRAWLER_H
#define HTMLCRAWLER_H

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cstdio>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/xpath.h>

class HTMLCrawler
{
public:
    explicit HTMLCrawler(const std::string &_sourceURL);
    ~HTMLCrawler();

    // Fetch sourceURL and return the response body as a string.
    // 'encode' is reserved for CURLOPT_ACCEPT_ENCODING, which is currently
    // commented out in the implementation.
    std::string getHTML(const std::string &encode = " ");

    // Dump the fetched page to a file named after a slice of the URL; returns the filename.
    std::string write();

    // Evaluate the XPath expression Parsetag and return all matches.
    // isFile = true: sourceHTML is a path to an .html file on disk;
    // isFile = false: sourceHTML is an in-memory HTML string.
    // printText = true returns each node's text content; false returns its serialized HTML.
    // Every returned pointer is heap-allocated by libxml2; release it with xmlFree().
    std::vector<unsigned char *> parse_all(bool isFile, bool printText,
                                           const std::string &sourceHTML, const std::string &Parsetag);

    // Like parse_all, but returns only the first match (or an empty string on failure).
    unsigned char *parse(bool isFile, bool printText,
                         const std::string &sourceHTML, const std::string &Parsetag);

private:
    CURL *curl;
    CURLcode res;
    std::string readBuffer;
    std::ofstream outputFile;
    std::string sourceURL;
};

#endif
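The gist consists of HTMLCrawler.h (above), plus HTMLCrawler.cpp and main.cpp (below). A plausible build line, assuming the libcurl and libxml2 development packages are installed (xml2-config ships with libxml2; adjust for your system):

g++ -std=c++11 HTMLCrawler.cpp main.cpp -o endic -lcurl $(xml2-config --cflags --libs)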
#include "HTMLCrawler.h"
#include <vector>
#include <string.h>
using CURLOPT_WRITEFUNCTION_PTR = size_t(*)(void * ,size_t, size_t, void*);
HTMLCrawler::HTMLCrawler(const std::string &_sourceURL) : curl(nullptr), sourceURL(_sourceURL)
{
}
std::string HTMLCrawler::getHTML(const std::string &encode)
{
    // libcurl invokes this repeatedly with response chunks; returning anything
    // other than size * nmemb would abort the transfer.
    auto curl_callback = [](void *contents, size_t size, size_t nmemb, void *stream) -> size_t
    {
        reinterpret_cast<std::string *>(stream)->append(reinterpret_cast<char *>(contents), size * nmemb);
        return size * nmemb;
    };
    curl = curl_easy_init();
    if (curl)
    {
        curl_easy_setopt(curl, CURLOPT_URL, sourceURL.c_str());
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(curl_callback));
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);
        curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); // set user-agent
        //curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, encode.c_str());
        res = curl_easy_perform(curl);
        if (res != CURLE_OK)
        {
            std::cout << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;
            readBuffer.clear(); // empty() only tests for emptiness; clear() discards the partial response
        }
        curl_easy_cleanup(curl);
        curl = nullptr;
        return readBuffer;
    }
    return "";
}
std::vector<unsigned char *> HTMLCrawler::parse_all(bool isFile, bool printText,
                                                    const std::string &sourceHTML, const std::string &Parsetag)
{
    std::vector<unsigned char *> resultVector;
    htmlDocPtr doc;
    if (isFile)
    {
        doc = htmlReadFile(sourceHTML.c_str(), NULL,
                           HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
    }
    else
    {
        doc = htmlReadMemory(sourceHTML.c_str(), static_cast<int>(sourceHTML.size()), NULL, NULL,
                             HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
    }
    if (doc == NULL)
    {
        fprintf(stderr, "Document not parsed successfully.\n");
        return resultVector;
    }
    // html -> xml DOM, queried via XPath,
    // e.g. "//div[@class='section_card']//ul[@class='desc_lst']//li//p"
    xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
    xmlXPathObjectPtr result = xmlXPathEvalExpression(BAD_CAST Parsetag.c_str(), xpathCtx);
    if (result == NULL || xmlXPathNodeSetIsEmpty(result->nodesetval))
    {
        if (result)
            xmlXPathFreeObject(result);
        xmlXPathFreeContext(xpathCtx);
        xmlFreeDoc(doc);
        return resultVector;
    }
    xmlNodeSetPtr nodeset = result->nodesetval;
    resultVector.reserve(nodeset->nodeNr);
    for (int i = 0; i < nodeset->nodeNr; i++)
    {
        if (printText)
        {
            // xmlNodeGetContent allocates a fresh string; the caller owns it.
            resultVector.push_back(xmlNodeGetContent(nodeset->nodeTab[i]));
        }
        else
        {
            // Serialize the node, then detach the string so the buffer itself can be freed.
            xmlBufferPtr nodeBuffer = xmlBufferCreate();
            xmlNodeDump(nodeBuffer, doc, nodeset->nodeTab[i], 0, 0);
            resultVector.push_back(xmlBufferDetach(nodeBuffer));
            xmlBufferFree(nodeBuffer);
        }
    }
    xmlXPathFreeObject(result);
    xmlXPathFreeContext(xpathCtx);
    xmlFreeDoc(doc);
    return resultVector;
}
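// Ownership sketch (an assumption about intended use, not part of the original
// gist): every pointer parse()/parse_all() hand back is xmlMalloc'd, so a
// caller would typically release each one:
//
//   auto nodes = crawler.parse_all(false, true, html, "//p");
//   for (auto p : nodes) { std::cout << p << '\n'; xmlFree(p); }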
unsigned char *HTMLCrawler::parse(bool isFile, bool printText,
                                  const std::string &sourceHTML, const std::string &Parsetag)
{
    htmlDocPtr doc;
    if (isFile)
    {
        doc = htmlReadFile(sourceHTML.c_str(), NULL,
                           HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
    }
    else
    {
        doc = htmlReadMemory(sourceHTML.c_str(), static_cast<int>(sourceHTML.size()), NULL, NULL,
                             HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
    }
    if (doc == NULL)
    {
        fprintf(stderr, "Document not parsed successfully.\n");
        return xmlStrdup(BAD_CAST ""); // heap-allocated so the caller can always xmlFree the result
    }
    // html -> xml DOM, queried via XPath
    xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
    xmlXPathObjectPtr result = xmlXPathEvalExpression(BAD_CAST Parsetag.c_str(), xpathCtx);
    if (result == NULL || xmlXPathNodeSetIsEmpty(result->nodesetval))
    {
        std::cout << "no result!" << std::endl;
        if (result)
            xmlXPathFreeObject(result);
        xmlXPathFreeContext(xpathCtx);
        xmlFreeDoc(doc);
        return xmlStrdup(BAD_CAST "");
    }
    // Return only the first matching element.
    xmlNodePtr node = result->nodesetval->nodeTab[0];
    unsigned char *output;
    if (printText)
    {
        output = xmlNodeGetContent(node);
    }
    else
    {
        xmlBufferPtr nodeBuffer = xmlBufferCreate();
        xmlNodeDump(nodeBuffer, doc, node, 0, 0);
        output = xmlBufferDetach(nodeBuffer);
        xmlBufferFree(nodeBuffer);
    }
    xmlXPathFreeObject(result);
    xmlXPathFreeContext(xpathCtx);
    xmlFreeDoc(doc);
    return output;
}
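// Note: parse() and parse_all() re-parse sourceHTML from scratch on every call;
// main.cpp below invokes them several times per dictionary entry, so each
// entry's HTML gets parsed repeatedly.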
std::string HTMLCrawler::write()
{
    // TODO: the filename is a fixed slice of the URL ("endic" for this host); derive it properly.
    std::string filename = sourceURL.substr(9, 5) + ".xml";
    outputFile.open(filename, std::ios::out);
    outputFile << readBuffer;
    return filename;
}
HTMLCrawler::~HTMLCrawler()
{
    if (curl)
        curl_easy_cleanup(curl);
    outputFile.close();
}
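// main.cpp — command-line client. Fetches the mobile Naver English dictionary
// (m.endic.naver.com) result page for the word given as argv[1], then prints,
// for each entry: the headword, its meanings, one example sentence, and the
// sentence's translation.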
#include "HTMLCrawler.h"
#include <iostream>
#include <string>
#include <iterator>
#include <algorithm>
#include <cstring>
static inline void ltrim(std::string &s)
{
s.erase(s.begin(), std::find_if(s.begin(), s.end(),std::not1(std::ptr_fun<int, int>(std::isspace))));
}
static inline void rtrim(std::string &s)
{
s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<int, int>(std::isspace))).base(), s.end());
}
// trim from both ends (in place)
static inline void trim(std::string &s)
{
ltrim(s);
rtrim(s);
}
int main(int argc, char **argv)
{
    if (argc < 2)
    {
        std::cout << "usage: " << argv[0] << " <word>" << std::endl;
        return 1;
    }
    std::string argu(argv[1]);
    //std::string url = "http://m.endic.naver.com/search.nhn?searchOption=all&query=" + argu.replace(argu.find(" "), 1, "");
    std::string url = "http://m.endic.naver.com/search.nhn?searchOption=all&query=" + argu;
    HTMLCrawler crawler(url); // stack allocation; the destructor cleans up at scope exit
    std::string htmlBuffer = crawler.getHTML();
    std::string htmlfilename = crawler.write();
    // One node per dictionary entry on the result page.
    auto entryVector = crawler.parse_all(false, false, htmlBuffer, "//div[@class='entry_search_word top']");
    for (auto entry : entryVector)
    {
        std::string entryHTML(reinterpret_cast<char *>(entry));
        xmlFree(entry);
        // The headword itself.
        auto word = crawler.parse(false, true, entryHTML, "//strong[@class='target']");
        std::string wordStr(reinterpret_cast<char *>(word));
        xmlFree(word);
        trim(wordStr);
        std::cout << wordStr << "\n\n";
        // The meanings, one per list item.
        auto wordMean = crawler.parse_all(false, true, entryHTML, "//li");
        for (auto mean : wordMean)
        {
            std::string temp(reinterpret_cast<char *>(mean));
            xmlFree(mean);
            trim(temp);
            std::cout << temp << std::endl;
        }
        std::cout << '\n';
        // The example sentence: each of its words sits in its own <a> tag.
        auto wordExamStc = crawler.parse(false, false, entryHTML, "//p[@class='example_stc']");
        auto wordExamStcVec = crawler.parse_all(false, true, std::string(reinterpret_cast<char *>(wordExamStc)), "//a");
        xmlFree(wordExamStc);
        bool done = false;
        for (auto token : wordExamStcVec)
        {
            // Stop printing at the "발음듣기" ("listen to pronunciation") link that trails the sentence.
            if (!done && std::string(reinterpret_cast<char *>(token)).find("발음듣기") != std::string::npos)
                done = true;
            if (!done)
                std::cout << token << " ";
            xmlFree(token);
        }
        std::cout << '\n';
        // The translation of the example sentence.
        auto wordExamMean = crawler.parse(false, true, entryHTML, "//p[@class='example_mean']");
        std::string temp(reinterpret_cast<char *>(wordExamMean));
        xmlFree(wordExamMean);
        trim(temp);
        std::cout << temp << "\n\n";
    }
}
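Hypothetical invocation (the query word is an example; actual output depends on Naver's live markup at crawl time, so none is reproduced here):

./endic apple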