Skip to content

Instantly share code, notes, and snippets.

@skylander86
Created March 25, 2016 22:32
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save skylander86/46abf91a244fb1b9ff43 to your computer and use it in GitHub Desktop.
Efficient C code for extracting instances (organization names) from a Freebase gzip dump
#define _POSIX_C_SOURCE 200809L /* expose POSIX getline() and ssize_t from <stdio.h> */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Boolean values stored in the char flags that main() keeps per subject. */
#define FALSE 0
#define TRUE 1
/* Capacity in bytes (including the terminating NUL) of the fixed-size string buffers in main(). */
#define BUFSIZE 1024
/*
 * Write the name collected for a finished subject to stdout, one per line.
 *
 * A name is printed only when the subject was typed as an organization, an
 * English name was seen for it, and that name contains at least one space
 * (i.e. it consists of two or more words).
 */
static void emit_subject(char is_org, char have_name, const char *name) {
    if (is_org && have_name && strchr(name, ' ') != NULL)
        fprintf(stdout, "%s\n", name);
}

/*
 * Read tab-separated "subject<TAB>predicate<TAB>object<TAB>." lines from
 * stdin and print, for every subject that is typed as
 * <http://rdf.freebase.com/ns/organization.organization>, its English
 * type.object.name when that name has at least two words.
 *
 * Lines belonging to one subject are expected to be contiguous: a subject's
 * result is emitted when the subject column changes, and once more at end of
 * input for the final subject. Progress is reported on stderr every million
 * lines.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if reading stdin or writing stdout
 * failed.
 */
int main(void) {
    /* Request full buffering on both streams; a NULL buffer lets the library allocate it. */
    setvbuf(stdin, NULL, _IOFBF, 0);
    setvbuf(stdout, NULL, _IOFBF, 0);

    size_t bufsize = 0;
    char *linebuf = NULL; /* owned by getline(); released with free() below */
    char cur_subj[BUFSIZE] = {0}, subj_name[BUFSIZE] = {0};
    char subj_is_org = FALSE, subj_have_name = FALSE;
    ssize_t nread;

    for (size_t line_processed = 1;
         (nread = getline(&linebuf, &bufsize, stdin)) > 0;
         ++line_processed) {
        /* there are 3,130,753,066 lines in freebase */
        if (line_processed % 1000000 == 0)
            fprintf(stderr, "%zu million lines processed.\n", line_processed / 1000000);

        /*
         * Byte 28 is the first character after "<http://rdf.freebase.com/ns/".
         * Only subjects whose identifier starts with 'm' are of interest
         * (presumably Freebase machine ids of the form "m.xxxx").
         * Lines too short to have a byte 28 are skipped instead of reading
         * stale buffer contents.
         */
        if (nread <= 28 || linebuf[28] != 'm')
            continue;

        /* The tokens point into linebuf and stay valid until the next getline(). */
        char *subj = strtok(linebuf, "\t");
        char *pred = strtok(NULL, "\t");
        char *obj = strtok(NULL, "\t");
        if (subj == NULL || pred == NULL || obj == NULL)
            continue; /* malformed line: fewer than three columns */

        if (strcmp(cur_subj, subj) != 0) {
            size_t subj_len = strlen(subj);
            if (subj_len >= sizeof cur_subj)
                continue; /* subject id does not fit the tracking buffer; ignore the line */

            /* Subject changed: flush the previous one and start afresh. */
            emit_subject(subj_is_org, subj_have_name, subj_name);
            memcpy(cur_subj, subj, subj_len + 1);
            subj_is_org = FALSE;
            subj_name[0] = '\0';
            subj_have_name = FALSE;
        }

        if (strcmp(pred, "<http://rdf.freebase.com/ns/type.object.type>") == 0) {
            if (strcmp(obj, "<http://rdf.freebase.com/ns/organization.organization>") == 0)
                subj_is_org = TRUE;
        } else if (strcmp(pred, "<http://rdf.freebase.com/ns/type.object.name>") == 0) {
            size_t len = strlen(obj);
            if (len > 5 && len < BUFSIZE && strcmp(&obj[len - 3], "@en") == 0) {
                /*
                 * The object looks like "Some Name"@en: drop the opening
                 * quote and the trailing "@en (4 bytes), leaving len - 5
                 * bytes of name. A later English name for the same subject
                 * replaces an earlier one.
                 */
                memcpy(subj_name, &obj[1], len - 5);
                subj_name[len - 5] = '\0';
                subj_have_name = TRUE;
            }
        }
    }

    /* The final subject has no following subject change to trigger its output. */
    emit_subject(subj_is_org, subj_have_name, subj_name);

    int status = EXIT_SUCCESS;
    if (ferror(stdin)) {
        perror("getline");
        status = EXIT_FAILURE;
    }
    if (fflush(stdout) != 0) {
        perror("stdout");
        status = EXIT_FAILURE;
    }

    free(linebuf);
    return status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment