Skip to content

Instantly share code, notes, and snippets.

@EmperorPenguin18
Created May 10, 2021 23:38
Show Gist options
  • Save EmperorPenguin18/05b00e6772fc9cb959bbe85f6eed6908 to your computer and use it in GitHub Desktop.
Save EmperorPenguin18/05b00e6772fc9cb959bbe85f6eed6908 to your computer and use it in GitHub Desktop.
Get only the text from a webpage
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
/*
 * Capacity constants.
 * SIZE is intended as an upper limit on the number of text lines handled;
 * WORD is the byte capacity of a single text buffer (a line chunk or a
 * shell command), including the terminating NUL.
 */
#define SIZE 999999
#define WORD 255
/*
 * General-purpose counters and loop indices kept at file scope.
 * They have external linkage and static storage duration (so they start
 * out as zero).
 * NOTE(review): nothing visible in this file needs them to be global;
 * they would be safer as locals of whichever function uses them.
 */
int n;
int m;
int i;
int j;
/* NOTE(review): fsize is not referenced by any code visible in this file. */
long fsize;
/*
 * Run a command line through the host shell using system().
 *
 * input: NUL-terminated command line. The string is only read, so string
 *        literals may be passed. NULL is rejected rather than forwarded,
 *        because system(NULL) merely probes for a shell.
 *
 * Returns 0 when the command was run and reported success (system()
 * returned 0), and -1 when input is NULL, the shell could not be started,
 * or the command finished with a non-zero status.
 */
int command(const char *input)
{
    if (input == NULL) {
        return -1;
    }

    int status = system(input);

    /* Collapse every kind of failure into one value so callers can simply
     * test for non-zero without decoding a wait status. */
    return status == 0 ? 0 : -1;
}
/*
 * Build the shell command line that downloads `url` into scrape.html.
 *
 * The URL is wrapped in single quotes, and every embedded single quote is
 * written as '\'' so the shell always sees the URL as one literal word
 * (characters such as '&', ';', '?' or spaces can no longer split the
 * command or inject another one).
 *
 * Returns a malloc()ed string the caller must free(), or NULL when memory
 * is exhausted or the length computation would overflow.
 */
static char *build_fetch_command(const char *url)
{
    /* NOTE(review): -k turns off TLS certificate verification; it is kept
     * only because the original command line used it. */
    static const char head[] = "curl -k ";
    static const char tail[] = " -o scrape.html";
    size_t url_len = strlen(url);

    /* Worst case: every URL byte is a quote that expands to four bytes. */
    if (url_len > ((size_t)-1 - sizeof head - sizeof tail - 3) / 4) {
        return NULL;
    }

    char *cmd = malloc(sizeof head + sizeof tail + 4 * url_len + 3);
    if (cmd == NULL) {
        return NULL;
    }

    char *p = cmd;
    memcpy(p, head, sizeof head - 1);
    p += sizeof head - 1;

    *p++ = '\'';
    for (const char *s = url; *s != '\0'; s++) {
        if (*s == '\'') {
            /* Close the quote, emit an escaped quote, reopen the quote. */
            memcpy(p, "'\\''", 4);
            p += 4;
        } else {
            *p++ = *s;
        }
    }
    *p++ = '\'';

    memcpy(p, tail, sizeof tail); /* copies the terminating NUL as well */
    return cmd;
}

/*
 * Copy the characters of `line` that lie outside markup tags into `out`
 * and NUL-terminate the result.
 *
 * line:    NUL-terminated input chunk.
 * out:     destination with room for at least strlen(line) + 1 bytes.
 * in_text: tag state carried from one call to the next so a tag that
 *          spans several chunks is still skipped; it becomes true after a
 *          '>' and false at a '<'.
 */
static void strip_tags(const char *line, char *out, bool *in_text)
{
    size_t len = 0;

    for (const char *c = line; *c != '\0'; c++) {
        if (*c == '<') {
            *in_text = false;
        }
        if (*in_text) {
            out[len++] = *c;
        }
        if (*c == '>') {
            *in_text = true;
        }
    }
    out[len] = '\0';
}

/*
 * Download the page named by argv[1], convert it to XML with tidy, and
 * write the text found between tags to scrape.txt in the current
 * directory.
 *
 * The intermediate files scrape.html and scrape.xml are created in the
 * current directory and deleted again. Input is processed one fgets()
 * chunk at a time, so memory use is constant and the page length is not
 * limited.
 *
 * Returns 0 on success and 1 on a usage error or any I/O failure.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "Missing URL argument\n");
        return 1;
    }
    if (argc > 2) {
        fprintf(stderr, "Too many arguments\n");
        return 1;
    }

    char *fetch = build_fetch_command(argv[1]);
    if (fetch == NULL) {
        fprintf(stderr, "Out of memory\n");
        return 1;
    }
    int fetch_rc = command(fetch);
    free(fetch);
    if (fetch_rc != 0) {
        fprintf(stderr, "Failed to download page\n");
        remove("scrape.html");
        return 1;
    }

    /* tidy's exit status is deliberately not treated as fatal: it reports
     * non-zero for mere warnings, which real-world HTML almost always
     * triggers. A real failure shows up as an empty or missing XML file. */
    (void)command("tidy -q -asxml --numeric-entities yes scrape.html >scrape.xml");
    remove("scrape.html");

    FILE *xml = fopen("scrape.xml", "r");
    if (xml == NULL) {
        perror("scrape.xml");
        return 1;
    }

    FILE *text = fopen("scrape.txt", "w");
    if (text == NULL) {
        perror("scrape.txt");
        fclose(xml);
        remove("scrape.xml");
        return 1;
    }

    char line[WORD];
    char stripped[WORD];
    bool in_text = false; /* nothing is emitted before the first '>' */
    int status = 0;

    while (fgets(line, sizeof line, xml) != NULL) {
        strip_tags(line, stripped, &in_text);

        /* Skip chunks that contain no text or only a line break. */
        if (stripped[0] != '\n' && stripped[0] != '\0') {
            if (fputs(stripped, text) == EOF) {
                status = 1;
                break;
            }
        }
    }
    if (ferror(xml)) {
        status = 1;
    }

    fclose(xml);
    remove("scrape.xml");

    /* fclose() flushes buffered output, so its result matters here. */
    if (fclose(text) != 0) {
        status = 1;
    }
    if (status != 0) {
        fprintf(stderr, "Error while writing scrape.txt\n");
    }
    return status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment