Created
May 10, 2021 23:38
-
-
Save EmperorPenguin18/7469d91a11015ba4f5e7b56dfdefbd9a to your computer and use it in GitHub Desktop.
Get only the text from a webpage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Code that scrapes webpages: downloads a page and keeps only its text.
#include <stdio.h>
#include <stdlib.h>

#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;

// ---- File-level state used by main() ----

// Address of the page to fetch (taken from the command line).
string url;
// Input stream for reading the intermediate scrape.xml file.
ifstream file;
// Holds the line most recently read from `file`.
string line;
// Count of lines currently stored in array_in.
int n;
// Capacity, in lines, of each of the two line buffers below.
const int array_size = 99999;
// Lines of the intermediate file exactly as read.
string array_in[array_size];
// The same lines with markup removed, index for index.
string array_out[array_size];
// Scratch loop indices (line index and character index).
int i;
int j;
// Output stream for writing the final scrape.txt file.
ofstream text;
// Runs `input` through the host command processor via system().
//
// input: the complete command line. It is handed over verbatim, so callers
//        must quote any untrusted pieces themselves.
// Returns whatever system() returned: 0 when the command reported success,
// non-zero otherwise (-1 if the command processor could not be started).
// The exact encoding of a non-zero status is implementation-defined.
int command(const std::string& input)
{
    // Propagate the status instead of discarding it so callers can detect failure.
    return system(input.c_str());
}
int main(int argc, char **argv) | |
{ | |
if (argc < 2) { cout << "Too few arguments" << '\n'; exit(1); } | |
if (argc > 2) { cout << "Too many arguments" << '\n'; exit(1); } | |
url = argv[1]; | |
command("curl -k " + url + " -o scrape.html"); | |
command("tidy -q -asxml --numeric-entities yes scrape.html >scrape.xml 2>/dev/null"); | |
command("rm scrape.html"); | |
file.open("scrape.xml"); | |
n = 0; | |
while ( getline(file, line) ) | |
{ | |
array_in[n] = line; | |
n++; | |
} | |
file.close(); | |
command("rm scrape.xml"); | |
bool toggle = false; | |
for ( i=0; i < n; i++ ) | |
{ | |
array_out[i] = ""; | |
for ( j=0; j < array_in[i].length(); j++ ) | |
{ | |
if (array_in[i].at(j) == '<') | |
{ | |
toggle = false; | |
} | |
if (toggle) | |
{ | |
array_out[i] += array_in[i].at(j); | |
} | |
if (array_in[i].at(j) == '>') | |
{ | |
toggle = true; | |
} | |
} | |
if (i == array_size-1) | |
{ | |
cout << "Array ran out of space" << '\n'; | |
} | |
} | |
text.open("scrape.txt"); | |
for ( i=0; i < sizeof(array_out)/sizeof(*array_out); i++) | |
{ | |
if (array_out[i] != "") | |
{ | |
text << array_out[i] << '\n'; | |
} | |
} | |
text.close(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment