Skip to content

Instantly share code, notes, and snippets.

@EmperorPenguin18
Created May 10, 2021 23:38
Show Gist options
  • Save EmperorPenguin18/7469d91a11015ba4f5e7b56dfdefbd9a to your computer and use it in GitHub Desktop.
Save EmperorPenguin18/7469d91a11015ba4f5e7b56dfdefbd9a to your computer and use it in GitHub Desktop.
Get only the text from a webpage
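// Scrapes a web page: fetches it with curl, converts it to XML with tidy,
// and writes the text found outside the markup tags to scrape.txt.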
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
// Makes every name in namespace std visible unqualified for the rest of the file.
using namespace std;
// ---- File-scope working state ----
// Everything below has static storage duration, so it is default/zero
// initialized before main starts and is visible to every function in the file.

// Address of the page to fetch.
string url;
// Input stream for reading the tidied XML document.
ifstream file;
// Holds one line of input at a time.
string line;
// Count of lines read so far.
int n;
// Capacity (in lines) of each of the two buffers below.
const int array_size = 99999;
// Raw lines as read from the input. Statically allocated: array_size strings.
string array_in[array_size];
// The corresponding lines with markup removed. Also array_size strings.
string array_out[array_size];
// Outer (line) loop index.
int i;
// Inner (character) loop index.
int j;
// Output stream for the extracted text.
ofstream text;
// Runs `input` through the host command processor via system().
//
// Parameters:
//   input - the complete command line to execute. It is handed to the shell
//           verbatim, so the caller is responsible for quoting any untrusted
//           pieces it splices in.
//
// Returns:
//   The value reported by system(), so callers can tell whether the command
//   worked (on POSIX systems 0 means the command exited successfully; any
//   other value means it failed or could not be started).
int command(const std::string& input)
{
    return system(input.c_str());
}
// Wraps `text` in single quotes so a POSIX shell passes it on as one literal word.
static std::string shell_quote(const std::string& text)
{
    std::string quoted = "'";
    for (std::string::size_type pos = 0; pos < text.length(); ++pos)
    {
        if (text[pos] == '\'')
        {
            // A single quote cannot appear inside single quotes:
            // close the quoting, emit an escaped quote, then reopen it.
            quoted += "'\\''";
        }
        else
        {
            quoted += text[pos];
        }
    }
    quoted += '\'';
    return quoted;
}

// Returns the characters of `line` that lie outside <...> markup.
// `in_text` is the "currently outside a tag" state; it is read and updated so
// that a tag left open on one line keeps suppressing output on the next.
static std::string strip_tags(const std::string& line, bool& in_text)
{
    std::string kept;
    for (std::string::size_type pos = 0; pos < line.length(); ++pos)
    {
        const char c = line[pos];
        if (c == '<')
        {
            in_text = false;    // a tag starts here; drop the '<' itself
        }
        if (in_text)
        {
            kept += c;
        }
        if (c == '>')
        {
            in_text = true;     // tag closed; following characters are text
        }
    }
    return kept;
}

// Entry point. Usage: <program> <url>
//
// Downloads the page at <url> with curl, converts it to XML with tidy, and
// writes every non-empty run of text found outside the markup to scrape.txt
// (one output line per input line). The temporary files scrape.html and
// scrape.xml are created in the current directory and deleted again.
//
// Returns 0 on success, 1 on a wrong argument count or when one of the
// working files cannot be opened.
int main(int argc, char **argv)
{
    if (argc < 2) { std::cout << "Too few arguments" << '\n'; return 1; }
    if (argc > 2) { std::cout << "Too many arguments" << '\n'; return 1; }

    const std::string page_url = argv[1];

    // The URL is quoted because URLs routinely contain shell metacharacters
    // ('&', '?', ';'); unquoted they would be interpreted by the shell.
    // NOTE(review): -k turns off TLS certificate verification in curl; it is
    // kept from the original command line - confirm this is really wanted.
    command("curl -k " + shell_quote(page_url) + " -o scrape.html");
    command("tidy -q -asxml --numeric-entities yes scrape.html >scrape.xml 2>/dev/null");
    remove("scrape.html");

    std::ifstream xml("scrape.xml");
    if (!xml)
    {
        std::cerr << "Could not open scrape.xml" << '\n';
        return 1;
    }

    std::ofstream out("scrape.txt");
    if (!out)
    {
        std::cerr << "Could not open scrape.txt" << '\n';
        xml.close();
        remove("scrape.xml");
        return 1;
    }

    // Process the document one line at a time so its length is not limited
    // by any fixed-size buffer.
    bool in_text = false;   // nothing is emitted until the first '>' is seen
    std::string row;
    while (std::getline(xml, row))
    {
        const std::string kept = strip_tags(row, in_text);
        if (!kept.empty())
        {
            out << kept << '\n';
        }
    }

    xml.close();
    remove("scrape.xml");
    out.close();
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment