Last active
March 24, 2017 16:59
-
-
Save ebongzzang/cb3ede2c925f48750de537e4d93451f5 to your computer and use it in GitHub Desktop.
ebong2's endic crawler — scrapes word definitions from Naver's mobile English dictionary (endic)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef HTMLCRAWLER_H | |
#define HTMLCRAWLER_H | |
#include <iostream> | |
#include <fstream> | |
#include <curl/curl.h> | |
#include <fstream> | |
#include <stdio.h> | |
#include <libxml2/libxml/HTMLparser.h> | |
#include <libxml2/libxml/tree.h> | |
#include <libxml/parser.h> | |
#include <libxml/xpath.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <vector> | |
class HTMLCrawler | |
{ | |
public: | |
HTMLCrawler(const std::string _sourceURL); | |
std::string getHTML(const std::string encode = " "); //return contents of html, default encode = UTF-8 | |
std::string write(); //return filename | |
std::vector<unsigned char *> parse_all(bool isFile, bool printText, const std::string sourceHTML, const std::string Parsetag); | |
//memory = true .html = false | |
unsigned char * parse(bool isFile, bool printText, const std::string sourceHTML, const std::string Parsetag); | |
//memory = true .html = false | |
~HTMLCrawler(); | |
private: | |
CURL *curl; | |
CURLcode res; | |
std::string readBuffer; | |
std::ofstream outputFile; | |
std::string sourceURL; | |
htmlDocPtr doc; | |
}; | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "HTMLCrawler.h" | |
#include <vector> | |
#include <string.h> | |
using CURLOPT_WRITEFUNCTION_PTR = size_t(*)(void * ,size_t, size_t, void*); | |
HTMLCrawler::HTMLCrawler(const std::string _sourceURL) : sourceURL(_sourceURL) | |
{ | |
} | |
std::string HTMLCrawler::getHTML(const std::string encode) | |
{ | |
auto curl_callback = [](void *contents , size_t size, size_t nmemb, void *stream)->size_t | |
{ | |
(reinterpret_cast<std::string *>(stream))->append(reinterpret_cast<char *>(contents),size * nmemb); | |
return size * nmemb; | |
}; | |
curl = curl_easy_init(); | |
if (curl) | |
{ | |
curl_easy_setopt(curl, CURLOPT_URL,sourceURL.c_str()); | |
//curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); | |
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(curl_callback)); | |
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer); | |
curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); //set user-agent | |
// curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING,encode.c_str()); | |
res = curl_easy_perform(curl); | |
if(res == CURLE_OK) | |
{ | |
//something else | |
} | |
else | |
{ | |
std::cout << "curl_easy_perform() failed:" << curl_easy_strerror(res) << std::endl; | |
readBuffer.empty(); | |
} | |
return readBuffer; | |
} | |
return " "; | |
} | |
std::vector<unsigned char *> HTMLCrawler::parse_all(bool isFile,bool printText, const std::string sourceHTML, const std::string Parsetag) | |
{ | |
std::vector<unsigned char *> resultVector; | |
if(isFile) | |
{ | |
doc = htmlReadFile(sourceHTML.c_str(), NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
else | |
{ | |
doc = htmlReadMemory(sourceHTML.c_str(),sourceHTML.size(),NULL,NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
if (doc == NULL) | |
{ | |
fprintf(stderr, "Document not parsed successfully.\n"); | |
resultVector.clear(); | |
return resultVector; | |
} | |
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); | |
//html -> xml DOM | |
xmlChar *xpath = (xmlChar*)Parsetag.c_str(); | |
//xmlChar *xpath = (xmlChar*)"//div[@class='section_card']//ul[@class='desc_lst']//li//p"; | |
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, xpathCtx); | |
xmlNodeSetPtr nodeset; | |
if(xmlXPathNodeSetIsEmpty(result->nodesetval)) | |
{ | |
resultVector.clear(); | |
return resultVector; | |
} | |
nodeset = result->nodesetval; | |
resultVector.reserve(nodeset->nodeNr); | |
for (int i=0; i < nodeset->nodeNr; i++) | |
{ | |
xmlBufferPtr nodeBuffer = xmlBufferCreate(); | |
xmlNodeDump(nodeBuffer,doc,nodeset->nodeTab[i],0,0); | |
if(printText) | |
{ | |
resultVector.push_back(xmlNodeGetContent(nodeset->nodeTab[i])); | |
} | |
else | |
{ | |
resultVector.push_back(nodeBuffer->content); | |
} | |
} | |
return resultVector; | |
} | |
unsigned char * HTMLCrawler::parse(bool isFile, bool printText, const std::string sourceHTML, const std::string Parsetag) | |
{ | |
if(isFile) | |
{ | |
doc = htmlReadFile(sourceHTML.c_str(), NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
else | |
{ | |
doc = htmlReadMemory(sourceHTML.c_str(),sourceHTML.size(),NULL,NULL,HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
} | |
if (doc == NULL) | |
{ | |
fprintf(stderr, "Document not parsed successfully.\n"); | |
return (xmlChar *)""; | |
} | |
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); | |
//html -> xml DOM | |
xmlChar *xpath = (xmlChar*)Parsetag.c_str(); | |
//xmlChar *xpath = (xmlChar*)"//div[@class='section_card']//ul[@class='desc_lst']//li//p"; | |
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, xpathCtx); | |
xmlNodeSetPtr nodeset; | |
if(xmlXPathNodeSetIsEmpty(result->nodesetval)) | |
{ | |
std::cout << "no result! " << std::endl; | |
return (xmlChar *)""; | |
} | |
nodeset = result->nodesetval; | |
for (int i=0; i < 1; i++)// Return only one element | |
{ | |
xmlBufferPtr nodeBuffer = xmlBufferCreate(); | |
xmlNodeDump(nodeBuffer,doc,nodeset->nodeTab[i],0,0); | |
if(printText) | |
{ | |
return xmlNodeGetContent(nodeset->nodeTab[i]); | |
} | |
else | |
{ | |
return nodeBuffer->content; | |
} | |
} | |
std::cout << "unexpected error" << std::endl; | |
return (xmlChar *)""; | |
} | |
std::string HTMLCrawler::write() | |
{ | |
std::string filename = sourceURL.substr(9,5)+".xml"; | |
//TODO:: hmm.. | |
outputFile.open(filename,std::ios::out); | |
outputFile << readBuffer; | |
return filename; | |
} | |
// Releases the curl easy handle and closes the output file if it is still
// open.
HTMLCrawler::~HTMLCrawler()
{
    // curl_easy_cleanup() accepts a NULL handle, but `curl` is only safe to
    // pass here once it has been initialized (nullptr or a live handle) —
    // see the constructor.
    curl_easy_cleanup(curl);
    // Guard the close: calling close() on a never-opened stream merely sets
    // failbit, but skipping it is clearer.
    if (outputFile.is_open())
    {
        outputFile.close();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "HTMLCrawler.h"

#include <algorithm>
#include <cctype>
#include <cstring>
#include <iostream>
#include <iterator>
#include <string>
// Trim leading whitespace from s, in place.
// BUG FIX: the original used std::not1/std::ptr_fun, which were removed in
// C++17, and fed raw (possibly negative) char values to std::isspace, which
// is undefined behavior; the lambda with an unsigned char parameter fixes
// both.
static inline void ltrim(std::string &s)
{
    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
                                    [](unsigned char ch) { return !std::isspace(ch); }));
}
// Trim trailing whitespace from s, in place.
// BUG FIX: same as ltrim — std::not1/std::ptr_fun were removed in C++17,
// and std::isspace on a plain char is UB for negative values; use a lambda
// taking unsigned char instead.
static inline void rtrim(std::string &s)
{
    s.erase(std::find_if(s.rbegin(), s.rend(),
                         [](unsigned char ch) { return !std::isspace(ch); }).base(),
            s.end());
}
// trim from both ends (in place) | |
static inline void trim(std::string &s) | |
{ | |
ltrim(s); | |
rtrim(s); | |
} | |
int main(int argc,char ** argv) | |
{ | |
std::string argu(argv[1]); | |
//std::string test = "http://m.endic.naver.com/search.nhn?searchOption=all&query=" + argu.replace(argu.find(" "),1, ""); | |
std::string test = "http://m.endic.naver.com/search.nhn?searchOption=all&query=" + argu; | |
std::string htmlBuffer; | |
std::vector<unsigned char *>::iterator it; | |
HTMLCrawler * crawler = new HTMLCrawler(test); | |
htmlBuffer = crawler->getHTML(); | |
std::string htmlfilename = crawler->write(); | |
auto entryVector = crawler->parse_all(false,false,htmlBuffer,"//div[@class='entry_search_word top']"); | |
for(std::vector<unsigned char *>::iterator it = entryVector.begin(); it != entryVector.end(); ++it) | |
{ | |
auto word = crawler->parse(false,true,std::string(reinterpret_cast<char *>(*it)),"//strong[@class='target']"); | |
std::string wordStr(reinterpret_cast<char *>(word)); | |
trim(wordStr); | |
std::cout << wordStr << std::endl; | |
std::cout << '\n'; | |
auto wordMean = crawler->parse_all(false,true,std::string(reinterpret_cast<char *>(*it)),"//li"); | |
for(std::vector<unsigned char *>::iterator it2 = wordMean.begin(); it2 != wordMean.end(); ++it2) | |
{ | |
std::string temp(reinterpret_cast<char *>(*it2)); | |
trim(temp); | |
std::cout << temp << std::endl; | |
} | |
std::cout << '\n'; | |
auto wordExamStc = crawler->parse(false,false,std::string(reinterpret_cast<char *>(*it)),"//p[@class='example_stc']"); | |
auto wordExamStcVec = crawler->parse_all(false,true,std::string(reinterpret_cast<char *>(wordExamStc)),"//a"); | |
for(std::vector<unsigned char *>::iterator it2 = wordExamStcVec.begin(); it2 != wordExamStcVec.end(); ++it2) | |
{ | |
if(std::string(reinterpret_cast<char *>(*it2)).find("발음듣기") != std::string::npos) | |
break; | |
std::cout << *it2 << " "; | |
} | |
std::cout << '\n'; | |
auto wordExamMean = crawler->parse(false,true,std::string(reinterpret_cast<char *>(*it)),"//p[@class='example_mean']"); | |
std::string temp(reinterpret_cast<char *>(wordExamMean)); | |
trim(temp); | |
std::cout << temp <<std::endl; | |
std::cout <<'\n'; | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment