lojic/parser.c Secret

## parser.c
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/time.h>
#include <assert.h>

// parse_records reads each input line into a buffer of MAX_LINE_LENGTH + 1 bytes.
#define MAX_LINE_LENGTH 511
// Size in bytes, terminating NUL included, of every buffer in struct name.
#define MAX_NAME_LENGTH 30

// The components of one personal name as filled in by parse_name, together
// with the Soundex codes parse_name computes for the first and last names.
// Every member is a NUL-terminated string; an absent component is "".
struct name {
  char first         [MAX_NAME_LENGTH];
  char first_soundex [MAX_NAME_LENGTH];
  char middle        [MAX_NAME_LENGTH];
  char last          [MAX_NAME_LENGTH];
  char last_soundex  [MAX_NAME_LENGTH];
  char suffix        [MAX_NAME_LENGTH];
};

// The two-character text \N, which opt_field returns in place of an empty
// field (presumably the NULL marker of the COPY input this program writes).
const char * sql_null_field = "\\N";

// Compute the four-character Soundex code (a letter followed by three digits)
// of `original` and store it in `result` as a NUL-terminated string.
//
// Non-alphabetic characters are ignored and letters are folded to upper case.
// The first letter is kept verbatim; later letters are coded as
//    b, f, p, v             -> 1
//    c, g, j, k, q, s, x, z -> 2
//    d, t                   -> 3
//    l                      -> 4
//    m, n                   -> 5
//    r                      -> 6
// A letter whose digit equals that of the letter before it (the first letter
// included) is dropped.  'h' and 'w' are skipped without breaking such a run;
// a, e, i, o, u and y are skipped too but do break it.  The code is truncated
// or padded with '0' to exactly four characters.
//
// If `original` contains no letters, `result` becomes the empty string.
// At most 5 bytes are written to `result`, whatever the length of `original`.
void soundex(const char * original, char * result) {
  int  len  = 0;     // number of characters stored in result so far
  char prev = '\0';  // digit of the previous letter, '\0' when it had none

  for (const char * p = original; *p != '\0'; p++) {
    // <ctype.h> functions are only defined for unsigned char values (and EOF)
    const unsigned char raw = (unsigned char) *p;
    if (!isalpha(raw)) {
      continue;
    }
    const char c = (char) toupper(raw);

    char digit;
    switch (c) {
    case 'B': case 'F': case 'P': case 'V':
      digit = '1';
      break;
    case 'C': case 'G': case 'J': case 'K':
    case 'Q': case 'S': case 'X': case 'Z':
      digit = '2';
      break;
    case 'D': case 'T':
      digit = '3';
      break;
    case 'L':
      digit = '4';
      break;
    case 'M': case 'N':
      digit = '5';
      break;
    case 'R':
      digit = '6';
      break;
    default:
      digit = '\0';
      break;
    }

    if (len == 0) {
      // First letter: stored as-is, yet its digit still suppresses a
      // directly following letter with the same code.
      result[len++] = c;
      prev = digit;
      continue;
    }

    if (c == 'H' || c == 'W') {
      // Transparent: equal digits on both sides still count as adjacent.
      continue;
    }

    if (digit == '\0') {
      // Vowel or Y: dropped, but it separates equal digits around it.
      prev = '\0';
      continue;
    }

    if (digit != prev && len < 4) {
      result[len++] = digit;
    }
    prev = digit;
  }

  if (len == 0) {
    // Must have at least one letter
    result[0] = '\0';
    return;
  }

  // Pad short codes with zeros so the result is always four characters
  while (len < 4) {
    result[len++] = '0';
  }
  result[len] = '\0';
}

// Copy the field from[begin, end) into `to` as a NUL-terminated string.
// begin is inclusive, end is exclusive.
//
// Leading whitespace is skipped, trailing whitespace and backslashes are
// trimmed, and every remaining backslash inside the field is dropped.
// If `from` ends before `end`, the field is cut at the terminating NUL.
// `to` must have room for (end - begin) + 1 bytes.
void trim_field(const char * from, char * to, const int begin, const int end) {
  // Clamp the field to the terminated part of `from` so that a short line
  // never exposes whatever bytes lie beyond its NUL.
  int limit = end;
  if (limit > 0) {
    const char * nul = memchr(from, '\0', (size_t) limit);
    if (nul != NULL) {
      limit = (int) (nul - from);
    }
  }

  // Find index of first non-space char.  The casts matter: <ctype.h>
  // functions are undefined for negative char values.
  int i = begin;
  while (i < limit && isspace((unsigned char) from[i])) {
    i++;
  }

  // Find index of last char that is neither space nor backslash
  int j = limit - 1;
  while (j > i && (isspace((unsigned char) from[j]) || from[j] == '\\')) {
    j--;
  }

  // Copy while eliminating '\\' chars
  int dst = 0;
  for (; i <= j; i++) {
    if (from[i] != '\\') {
      to[dst++] = from[i];
    }
  }
  to[dst] = '\0';
}

// Split the comma-separated name held in line[start, end) into its
// "last,first,middle,suffix" components and store them in *name.
//
// Backslashes are removed, each component is whitespace-trimmed with
// trim_field, and anything after a fourth comma is ignored.  Components
// that are absent become empty strings.  The Soundex codes of the first and
// last names are computed as well (empty when the name itself is empty).
// Components longer than MAX_NAME_LENGTH - 1 characters are truncated, and
// the field is cut at the NUL if `line` ends before `end`.
void parse_name(const char * line, const int start, const int end, struct name * name) {
  char name_str[(MAX_NAME_LENGTH + 1) * 4];  // 4 name sub-parts plus delimiter

  // Stop at the end of a short line instead of reading past its NUL
  int limit = end;
  if (limit > 0) {
    const char * nul = memchr(line, '\0', (size_t) limit);
    if (nul != NULL) {
      limit = (int) (nul - line);
    }
  }

  // Eliminate \ chars, never overrunning name_str
  size_t len = 0;
  for (int src = start; src < limit && len < sizeof name_str - 1; src++) {
    if (line[src] != '\\') {
      name_str[len++] = line[src];
    }
  }
  name_str[len] = '\0';

  // Split in place on commas; at most four parts are kept
  char * parts[4] = { NULL, NULL, NULL, NULL };
  char * cursor = name_str;
  for (int i = 0; i < 4 && cursor != NULL; i++) {
    parts[i] = cursor;
    char * comma = strchr(cursor, ',');
    if (comma != NULL) {
      *comma = '\0';
      cursor = comma + 1;
    }
    else {
      cursor = NULL;
    }
  }

  // The input order is last, first, middle, suffix
  char * const fields[4] = { name->last, name->first, name->middle, name->suffix };
  for (int i = 0; i < 4; i++) {
    if (parts[i] != NULL) {
      // trim_field never writes more characters than it is given, so
      // bounding its input keeps the destination within MAX_NAME_LENGTH.
      size_t part_len = strlen(parts[i]);
      if (part_len > (size_t) (MAX_NAME_LENGTH - 1)) {
        part_len = (size_t) (MAX_NAME_LENGTH - 1);
      }
      trim_field(parts[i], fields[i], 0, (int) part_len);
    }
    else {
      fields[i][0] = '\0';
    }
  }

  if (name->first[0] != '\0') {
    soundex(name->first, name->first_soundex);
  }
  else {
    name->first_soundex[0] = '\0';
  }

  if (name->last[0] != '\0') {
    soundex(name->last, name->last_soundex);
  }
  else {
    name->last_soundex[0] = '\0';
  }
}

// Return `str` unchanged when it holds at least one character, otherwise
// return sql_null_field so that an empty value is written as \N.
const char * opt_field(const char * str) {
  return (str[0] != '\0') ? str : sql_null_field;
}

// Extract the trimmed case key, stored at columns [2, 18) of `line`, into
// `key`.  `key` needs room for 17 bytes (16 characters plus the NUL).
void parse_case_key(const char * line, char * key) {
  enum { KEY_BEGIN = 2, KEY_END = 18 };

  trim_field(line, key, KEY_BEGIN, KEY_END);
}

void parse_alias(FILE * afile, int line_no, const char * line) {
  struct name name;
  parse_name(line, 20, 47, &name);

  char case_key[17];
  parse_case_key(line, case_key);

  char alias_line[3];
  trim_field(line, alias_line, 18, 20);

  fprintf(afile, "%s\t%ld\t%s\t%s\t%s\t%s\t%s\t%s\n",
          case_key,
          strtol(alias_line, NULL, 10),
          opt_field(name.last),
          opt_field(name.last_soundex),
          opt_field(name.first),
          opt_field(name.first_soundex),
          opt_field(name.middle),
          opt_field(name.suffix));
}

void parse_case(FILE * efile, int line_no, const char * line) {
  struct name name;
  parse_name(line, 30, 58, &name);

  char case_key             [17];  parse_case_key(line, case_key);
  char county               [13];  trim_field(line, county, 18, 30);
  char addr_street1         [21];  trim_field(line, addr_street1, 58,78);
  char addr_street2         [16];  trim_field(line, addr_street2, 78, 93);
  char city                 [16];  trim_field(line, city, 93, 108);
  char state                 [3];  trim_field(line, state, 108, 110);
  char zip5                  [6];  trim_field(line, zip5, 110, 115);
  char zip4                  [5];  trim_field(line, zip4, 115, 119);
  char race                  [2];  trim_field(line, race, 119, 120);
  char sex                   [2];  trim_field(line, sex, 120, 121);
  char birth_year            [5];  trim_field(line, birth_year, 121, 125);
  char birth_month           [3];  trim_field(line, birth_month, 125, 127);
  char birth_day             [3];  trim_field(line, birth_day, 127, 129);
  char last_4_ssn            [5];  trim_field(line, last_4_ssn, 129, 133);
  char driver_license       [21];  trim_field(line, driver_license, 133, 153);
  char driver_license_state  [3];  trim_field(line, driver_license_state, 153, 155);
  char citation_number       [9];  trim_field(line, citation_number, 155, 163);
  char added_to_acis_date    [9];  trim_field(line, added_to_acis_date, 163, 171);
  char extract_date          [9];  trim_field(line, extract_date, 171, 179);
  char personal_identifier  [11];  trim_field(line, personal_identifier, 179, 189);
  char case_disposed         [2];  trim_field(line, case_disposed, 189, 190);
  char court_type            [4];  trim_field(line, court_type, 190, 193);

  fprintf(efile, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
          case_key,
          opt_field(county),
          opt_field(name.last),
          opt_field(name.last_soundex),
          opt_field(name.first),
          opt_field(name.first_soundex),
          opt_field(name.middle),
          opt_field(name.suffix),
          opt_field(addr_street1),
          opt_field(addr_street2),
          opt_field(city),
          opt_field(state),
          opt_field(zip5),
          opt_field(zip4),
          opt_field(race),
          opt_field(sex),
          opt_field(birth_year),
          opt_field(birth_month),
          opt_field(birth_day),
          opt_field(last_4_ssn),
          opt_field(driver_license),
          opt_field(driver_license_state),
          opt_field(citation_number),
          opt_field(added_to_acis_date),
          opt_field(extract_date),
          opt_field(personal_identifier),
          opt_field(case_disposed),
          opt_field(court_type));
}

// Handler for the end-of-file record.  It intentionally does nothing: both
// arguments are ignored and no output is produced.
void parse_eof(int line_no, const char * line) {
  (void) line_no;
  (void) line;
}

// Read fixed-width records from `ifile` and write two COPY scripts: case
// records ("01") go to `efile`, alias records ("02") go to `afile`.
//
// Each output starts with a "copy ... from stdin;" header and ends with the
// "\." terminator line.  Records of type "00" and "99" produce no output.
// An unrecognised record type is reported on stdout and stops processing
// immediately, without writing the terminators or the summary.
// A summary of line, case and alias counts is printed to stdout at the end.
// If any of the three streams is NULL, an error is printed and nothing is done.
void parse_records(FILE * ifile, FILE * efile, FILE * afile) {
  if (ifile == NULL || efile == NULL || afile == NULL) {
    fprintf(stderr, "parse_records: missing input or output stream\n");
    return;
  }

  int line_no     = 0;
  int num_cases   = 0;
  int num_aliases = 0;

  // The column list must name exactly the 28 values parse_case writes per row.
  fprintf(efile, "copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix,addr_street1,addr_street2,addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,extract_date,personal_identifier,case_disposed,court_type) from stdin;\n");

  fprintf(afile, "copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix) from stdin;\n");

  char buffer[MAX_LINE_LENGTH + 1];

  while (fgets(buffer, sizeof buffer, ifile) != NULL) {
    line_no += 1;

    // The record parsers read fixed column ranges.  Blank out everything
    // after the text just read so that a short line yields empty fields
    // rather than bytes left over from the previous record.
    const size_t len = strlen(buffer);
    memset(buffer + len, ' ', sizeof buffer - 1 - len);
    buffer[sizeof buffer - 1] = '\0';

    if (strncmp("00", buffer, 2) == 0) {
      // Header record: nothing to emit
    }
    else if (strncmp("01", buffer, 2) == 0) {
      parse_case(efile, line_no, buffer);
      num_cases += 1;
    }
    else if (strncmp("02", buffer, 2) == 0) {
      parse_alias(afile, line_no, buffer);
      num_aliases += 1;
    }
    else if (strncmp("99", buffer, 2) == 0) {
      parse_eof(line_no, buffer);
    }
    else {
      printf("Unknown record type on line %d!\n", line_no);
      return;
    }
  }

  fprintf(efile, "\\.\n");
  fprintf(afile, "\\.\n");

  printf("--------------\n");
  printf("num lines   %d\n", line_no - 1);
  printf("num cases   %d\n", num_cases);
  printf("num aliases %d\n", num_aliases);
}

// Open `ipath` for reading and `epath` / `apath` for writing, run
// parse_records over them, and close all three streams.
//
// A file that cannot be opened is reported with perror and parsing is
// skipped; the output files are not created when the input is missing.
// A failure while closing an output file (for example a failed flush) is
// reported as well.
void parse_file(char * ipath, char * epath, char * apath) {
  FILE * ifile = NULL;
  FILE * efile = NULL;
  FILE * afile = NULL;

  ifile = fopen(ipath, "r");
  if (ifile == NULL) {
    perror(ipath);
    goto done;
  }

  efile = fopen(epath, "w");
  if (efile == NULL) {
    perror(epath);
    goto done;
  }

  afile = fopen(apath, "w");
  if (afile == NULL) {
    perror(apath);
    goto done;
  }

  parse_records(ifile, efile, afile);

done:
  if (afile != NULL && fclose(afile) != 0) {
    perror(apath);
  }
  if (efile != NULL && fclose(efile) != 0) {
    perror(epath);
  }
  if (ifile != NULL) {
    fclose(ifile);
  }
}

// Print "Time: <seconds>" on stdout, where <seconds> is the interval from
// *t1 to *t2 expressed as a floating-point number of seconds.
void print_elapsed(const struct timeval * t1, const struct timeval * t2) {
  const double usec_per_sec = 1000000.0;

  const double start  = (double) t1->tv_sec + (double) t1->tv_usec / usec_per_sec;
  const double finish = (double) t2->tv_sec + (double) t2->tv_usec / usec_per_sec;

  printf("Time: %f\n", finish - start);
}

// Program entry point: convert ../cr-200 into ../e2-file and ../a2-file via
// parse_file and print the wall-clock time the conversion took.
int main(void)
{
  char * const input_path   = "../cr-200";
  char * const extract_path = "../e2-file";
  char * const alias_path   = "../a2-file";

  struct timeval started;
  struct timeval finished;

  gettimeofday(&started, NULL);
  parse_file(input_path, extract_path, alias_path);
  gettimeofday(&finished, NULL);

  print_elapsed(&started, &finished);
  return 0;
}
	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>
	#include <ctype.h>
	#include <sys/time.h>
	#include <assert.h>

	// parse_records reads each input line into a buffer of MAX_LINE_LENGTH + 1 bytes.
	#define MAX_LINE_LENGTH 511
	// Size in bytes, terminating NUL included, of every buffer in struct name.
	#define MAX_NAME_LENGTH 30

	// The components of one personal name as filled in by parse_name, together
	// with the Soundex codes parse_name computes for the first and last names.
	// Every member is a NUL-terminated string; an absent component is "".
	struct name {
	char first [MAX_NAME_LENGTH];
	char first_soundex [MAX_NAME_LENGTH];
	char middle [MAX_NAME_LENGTH];
	char last [MAX_NAME_LENGTH];
	char last_soundex [MAX_NAME_LENGTH];
	char suffix [MAX_NAME_LENGTH];
	};

	// The two-character text \N, which opt_field returns in place of an empty
	// field (presumably the NULL marker of the COPY input this program writes).
	const char * sql_null_field = "\\N";

	// Compute the four-character Soundex code (a letter followed by three digits)
	// of `original` and store it in `result` as a NUL-terminated string.
	//
	// Non-alphabetic characters are ignored and letters are folded to upper case.
	// The first letter is kept verbatim; later letters are coded as
	//    b, f, p, v             -> 1
	//    c, g, j, k, q, s, x, z -> 2
	//    d, t                   -> 3
	//    l                      -> 4
	//    m, n                   -> 5
	//    r                      -> 6
	// A letter whose digit equals that of the letter before it (the first letter
	// included) is dropped.  'h' and 'w' are skipped without breaking such a run;
	// a, e, i, o, u and y are skipped too but do break it.  The code is truncated
	// or padded with '0' to exactly four characters.
	//
	// If `original` contains no letters, `result` becomes the empty string.
	// At most 5 bytes are written to `result`, whatever the length of `original`.
	void soundex(const char * original, char * result) {
	  int  len  = 0;     // number of characters stored in result so far
	  char prev = '\0';  // digit of the previous letter, '\0' when it had none

	  for (const char * p = original; *p != '\0'; p++) {
	    // <ctype.h> functions are only defined for unsigned char values (and EOF)
	    const unsigned char raw = (unsigned char) *p;
	    if (!isalpha(raw)) {
	      continue;
	    }
	    const char c = (char) toupper(raw);

	    char digit;
	    switch (c) {
	    case 'B': case 'F': case 'P': case 'V':
	      digit = '1';
	      break;
	    case 'C': case 'G': case 'J': case 'K':
	    case 'Q': case 'S': case 'X': case 'Z':
	      digit = '2';
	      break;
	    case 'D': case 'T':
	      digit = '3';
	      break;
	    case 'L':
	      digit = '4';
	      break;
	    case 'M': case 'N':
	      digit = '5';
	      break;
	    case 'R':
	      digit = '6';
	      break;
	    default:
	      digit = '\0';
	      break;
	    }

	    if (len == 0) {
	      // First letter: stored as-is, yet its digit still suppresses a
	      // directly following letter with the same code.
	      result[len++] = c;
	      prev = digit;
	      continue;
	    }

	    if (c == 'H' || c == 'W') {
	      // Transparent: equal digits on both sides still count as adjacent.
	      continue;
	    }

	    if (digit == '\0') {
	      // Vowel or Y: dropped, but it separates equal digits around it.
	      prev = '\0';
	      continue;
	    }

	    if (digit != prev && len < 4) {
	      result[len++] = digit;
	    }
	    prev = digit;
	  }

	  if (len == 0) {
	    // Must have at least one letter
	    result[0] = '\0';
	    return;
	  }

	  // Pad short codes with zeros so the result is always four characters
	  while (len < 4) {
	    result[len++] = '0';
	  }
	  result[len] = '\0';
	}

	// Copy the field from[begin, end) into `to` as a NUL-terminated string.
	// begin is inclusive, end is exclusive.
	//
	// Leading whitespace is skipped, trailing whitespace and backslashes are
	// trimmed, and every remaining backslash inside the field is dropped.
	// If `from` ends before `end`, the field is cut at the terminating NUL.
	// `to` must have room for (end - begin) + 1 bytes.
	void trim_field(const char * from, char * to, const int begin, const int end) {
	  // Clamp the field to the terminated part of `from` so that a short line
	  // never exposes whatever bytes lie beyond its NUL.
	  int limit = end;
	  if (limit > 0) {
	    const char * nul = memchr(from, '\0', (size_t) limit);
	    if (nul != NULL) {
	      limit = (int) (nul - from);
	    }
	  }

	  // Find index of first non-space char.  The casts matter: <ctype.h>
	  // functions are undefined for negative char values.
	  int i = begin;
	  while (i < limit && isspace((unsigned char) from[i])) {
	    i++;
	  }

	  // Find index of last char that is neither space nor backslash
	  int j = limit - 1;
	  while (j > i && (isspace((unsigned char) from[j]) || from[j] == '\\')) {
	    j--;
	  }

	  // Copy while eliminating '\\' chars
	  int dst = 0;
	  for (; i <= j; i++) {
	    if (from[i] != '\\') {
	      to[dst++] = from[i];
	    }
	  }
	  to[dst] = '\0';
	}

	// Split the comma-separated name held in line[start, end) into its
	// "last,first,middle,suffix" components and store them in *name.
	//
	// Backslashes are removed, each component is whitespace-trimmed with
	// trim_field, and anything after a fourth comma is ignored.  Components
	// that are absent become empty strings.  The Soundex codes of the first and
	// last names are computed as well (empty when the name itself is empty).
	// Components longer than MAX_NAME_LENGTH - 1 characters are truncated, and
	// the field is cut at the NUL if `line` ends before `end`.
	void parse_name(const char * line, const int start, const int end, struct name * name) {
	  char name_str[(MAX_NAME_LENGTH + 1) * 4]; // 4 name sub-parts plus delimiter

	  // Stop at the end of a short line instead of reading past its NUL
	  int limit = end;
	  if (limit > 0) {
	    const char * nul = memchr(line, '\0', (size_t) limit);
	    if (nul != NULL) {
	      limit = (int) (nul - line);
	    }
	  }

	  // Eliminate \ chars, never overrunning name_str
	  size_t len = 0;
	  for (int src = start; src < limit && len < sizeof name_str - 1; src++) {
	    if (line[src] != '\\') {
	      name_str[len++] = line[src];
	    }
	  }
	  name_str[len] = '\0';

	  // Split in place on commas; at most four parts are kept
	  char * parts[4] = { NULL, NULL, NULL, NULL };
	  char * cursor = name_str;
	  for (int i = 0; i < 4 && cursor != NULL; i++) {
	    parts[i] = cursor;
	    char * comma = strchr(cursor, ',');
	    if (comma != NULL) {
	      *comma = '\0';
	      cursor = comma + 1;
	    }
	    else {
	      cursor = NULL;
	    }
	  }

	  // The input order is last, first, middle, suffix
	  char * const fields[4] = { name->last, name->first, name->middle, name->suffix };
	  for (int i = 0; i < 4; i++) {
	    if (parts[i] != NULL) {
	      // trim_field never writes more characters than it is given, so
	      // bounding its input keeps the destination within MAX_NAME_LENGTH.
	      size_t part_len = strlen(parts[i]);
	      if (part_len > (size_t) (MAX_NAME_LENGTH - 1)) {
	        part_len = (size_t) (MAX_NAME_LENGTH - 1);
	      }
	      trim_field(parts[i], fields[i], 0, (int) part_len);
	    }
	    else {
	      fields[i][0] = '\0';
	    }
	  }

	  if (name->first[0] != '\0') {
	    soundex(name->first, name->first_soundex);
	  }
	  else {
	    name->first_soundex[0] = '\0';
	  }

	  if (name->last[0] != '\0') {
	    soundex(name->last, name->last_soundex);
	  }
	  else {
	    name->last_soundex[0] = '\0';
	  }
	}

	// Return `str` unchanged when it holds at least one character, otherwise
	// return sql_null_field so that an empty value is written as \N.
	const char * opt_field(const char * str) {
	  return (str[0] != '\0') ? str : sql_null_field;
	}

	// Extract the trimmed case key, stored at columns [2, 18) of `line`, into
	// `key`.  `key` needs room for 17 bytes (16 characters plus the NUL).
	void parse_case_key(const char * line, char * key) {
	  enum { KEY_BEGIN = 2, KEY_END = 18 };

	  trim_field(line, key, KEY_BEGIN, KEY_END);
	}

	void parse_alias(FILE * afile, int line_no, const char * line) {
	struct name name;
	parse_name(line, 20, 47, &name);

	char case_key[17];
	parse_case_key(line, case_key);

	char alias_line[3];
	trim_field(line, alias_line, 18, 20);

	fprintf(afile, "%s\t%ld\t%s\t%s\t%s\t%s\t%s\t%s\n",
	case_key,
	strtol(alias_line, NULL, 10),
	opt_field(name.last),
	opt_field(name.last_soundex),
	opt_field(name.first),
	opt_field(name.first_soundex),
	opt_field(name.middle),
	opt_field(name.suffix));
	}

	void parse_case(FILE * efile, int line_no, const char * line) {
	struct name name;
	parse_name(line, 30, 58, &name);

	char case_key [17]; parse_case_key(line, case_key);
	char county [13]; trim_field(line, county, 18, 30);
	char addr_street1 [21]; trim_field(line, addr_street1, 58,78);
	char addr_street2 [16]; trim_field(line, addr_street2, 78, 93);
	char city [16]; trim_field(line, city, 93, 108);
	char state [3]; trim_field(line, state, 108, 110);
	char zip5 [6]; trim_field(line, zip5, 110, 115);
	char zip4 [5]; trim_field(line, zip4, 115, 119);
	char race [2]; trim_field(line, race, 119, 120);
	char sex [2]; trim_field(line, sex, 120, 121);
	char birth_year [5]; trim_field(line, birth_year, 121, 125);
	char birth_month [3]; trim_field(line, birth_month, 125, 127);
	char birth_day [3]; trim_field(line, birth_day, 127, 129);
	char last_4_ssn [5]; trim_field(line, last_4_ssn, 129, 133);
	char driver_license [21]; trim_field(line, driver_license, 133, 153);
	char driver_license_state [3]; trim_field(line, driver_license_state, 153, 155);
	char citation_number [9]; trim_field(line, citation_number, 155, 163);
	char added_to_acis_date [9]; trim_field(line, added_to_acis_date, 163, 171);
	char extract_date [9]; trim_field(line, extract_date, 171, 179);
	char personal_identifier [11]; trim_field(line, personal_identifier, 179, 189);
	char case_disposed [2]; trim_field(line, case_disposed, 189, 190);
	char court_type [4]; trim_field(line, court_type, 190, 193);

	fprintf(efile, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
	case_key,
	opt_field(county),
	opt_field(name.last),
	opt_field(name.last_soundex),
	opt_field(name.first),
	opt_field(name.first_soundex),
	opt_field(name.middle),
	opt_field(name.suffix),
	opt_field(addr_street1),
	opt_field(addr_street2),
	opt_field(city),
	opt_field(state),
	opt_field(zip5),
	opt_field(zip4),
	opt_field(race),
	opt_field(sex),
	opt_field(birth_year),
	opt_field(birth_month),
	opt_field(birth_day),
	opt_field(last_4_ssn),
	opt_field(driver_license),
	opt_field(driver_license_state),
	opt_field(citation_number),
	opt_field(added_to_acis_date),
	opt_field(extract_date),
	opt_field(personal_identifier),
	opt_field(case_disposed),
	opt_field(court_type));
	}

	// Handler for the end-of-file record.  It intentionally does nothing: both
	// arguments are ignored and no output is produced.
	void parse_eof(int line_no, const char * line) {
	  (void) line_no;
	  (void) line;
	}

	// Read fixed-width records from `ifile` and write two COPY scripts: case
	// records ("01") go to `efile`, alias records ("02") go to `afile`.
	//
	// Each output starts with a "copy ... from stdin;" header and ends with the
	// "\." terminator line.  Records of type "00" and "99" produce no output.
	// An unrecognised record type is reported on stdout and stops processing
	// immediately, without writing the terminators or the summary.
	// A summary of line, case and alias counts is printed to stdout at the end.
	// If any of the three streams is NULL, an error is printed and nothing is done.
	void parse_records(FILE * ifile, FILE * efile, FILE * afile) {
	  if (ifile == NULL || efile == NULL || afile == NULL) {
	    fprintf(stderr, "parse_records: missing input or output stream\n");
	    return;
	  }

	  int line_no = 0;
	  int num_cases = 0;
	  int num_aliases = 0;

	  // The column list must name exactly the 28 values parse_case writes per row.
	  fprintf(efile, "copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix,addr_street1,addr_street2,addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,extract_date,personal_identifier,case_disposed,court_type) from stdin;\n");

	  fprintf(afile, "copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix) from stdin;\n");

	  char buffer[MAX_LINE_LENGTH + 1];

	  while (fgets(buffer, sizeof buffer, ifile) != NULL) {
	    line_no += 1;

	    // The record parsers read fixed column ranges.  Blank out everything
	    // after the text just read so that a short line yields empty fields
	    // rather than bytes left over from the previous record.
	    const size_t len = strlen(buffer);
	    memset(buffer + len, ' ', sizeof buffer - 1 - len);
	    buffer[sizeof buffer - 1] = '\0';

	    if (strncmp("00", buffer, 2) == 0) {
	      // Header record: nothing to emit
	    }
	    else if (strncmp("01", buffer, 2) == 0) {
	      parse_case(efile, line_no, buffer);
	      num_cases += 1;
	    }
	    else if (strncmp("02", buffer, 2) == 0) {
	      parse_alias(afile, line_no, buffer);
	      num_aliases += 1;
	    }
	    else if (strncmp("99", buffer, 2) == 0) {
	      parse_eof(line_no, buffer);
	    }
	    else {
	      printf("Unknown record type on line %d!\n", line_no);
	      return;
	    }
	  }

	  fprintf(efile, "\\.\n");
	  fprintf(afile, "\\.\n");

	  printf("--------------\n");
	  printf("num lines %d\n", line_no - 1);
	  printf("num cases %d\n", num_cases);
	  printf("num aliases %d\n", num_aliases);
	}

	// Open `ipath` for reading and `epath` / `apath` for writing, run
	// parse_records over them, and close all three streams.
	//
	// A file that cannot be opened is reported with perror and parsing is
	// skipped; the output files are not created when the input is missing.
	// A failure while closing an output file (for example a failed flush) is
	// reported as well.
	void parse_file(char * ipath, char * epath, char * apath) {
	  FILE * ifile = NULL;
	  FILE * efile = NULL;
	  FILE * afile = NULL;

	  ifile = fopen(ipath, "r");
	  if (ifile == NULL) {
	    perror(ipath);
	    goto done;
	  }

	  efile = fopen(epath, "w");
	  if (efile == NULL) {
	    perror(epath);
	    goto done;
	  }

	  afile = fopen(apath, "w");
	  if (afile == NULL) {
	    perror(apath);
	    goto done;
	  }

	  parse_records(ifile, efile, afile);

	done:
	  if (afile != NULL && fclose(afile) != 0) {
	    perror(apath);
	  }
	  if (efile != NULL && fclose(efile) != 0) {
	    perror(epath);
	  }
	  if (ifile != NULL) {
	    fclose(ifile);
	  }
	}

	// Print "Time: <seconds>" on stdout, where <seconds> is the interval from
	// *t1 to *t2 expressed as a floating-point number of seconds.
	void print_elapsed(const struct timeval * t1, const struct timeval * t2) {
	  const double usec_per_sec = 1000000.0;

	  const double start  = (double) t1->tv_sec + (double) t1->tv_usec / usec_per_sec;
	  const double finish = (double) t2->tv_sec + (double) t2->tv_usec / usec_per_sec;

	  printf("Time: %f\n", finish - start);
	}

	// Program entry point: convert ../cr-200 into ../e2-file and ../a2-file via
	// parse_file and print the wall-clock time the conversion took.
	int main(void)
	{
	  char * const input_path   = "../cr-200";
	  char * const extract_path = "../e2-file";
	  char * const alias_path   = "../a2-file";

	  struct timeval started;
	  struct timeval finished;

	  gettimeofday(&started, NULL);
	  parse_file(input_path, extract_path, alias_path);
	  gettimeofday(&finished, NULL);

	  print_elapsed(&started, &finished);
	  return 0;
	}