Skip to content

Instantly share code, notes, and snippets.

@lojic

lojic/parser.c Secret

Created January 17, 2016 02:47
Show Gist options
  • Save lojic/4369d9d57eb775296c92 to your computer and use it in GitHub Desktop.
Save lojic/4369d9d57eb775296c92 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/time.h>
#include <assert.h>
// Sizes the line buffer used by parse_records(), which declares it as
// MAX_LINE_LENGTH + 1 bytes.
#define MAX_LINE_LENGTH 511
// Size in bytes (terminating NUL included) of every member of struct name;
// parse_name() also derives the size of its scratch buffer from it.
#define MAX_NAME_LENGTH 30
// Components of a personal name as filled in by parse_name().
// Every member is a NUL-terminated string stored in a fixed array of
// MAX_NAME_LENGTH bytes; parse_name() stores an empty string in a member
// when the corresponding component is absent.
struct name {
char first [MAX_NAME_LENGTH]; // second comma-separated component of the raw name
char first_soundex [MAX_NAME_LENGTH]; // soundex() of `first`, or empty when `first` is empty
char middle [MAX_NAME_LENGTH]; // third comma-separated component
char last [MAX_NAME_LENGTH]; // first comma-separated component
char last_soundex [MAX_NAME_LENGTH]; // soundex() of `last`, or empty when `last` is empty
char suffix [MAX_NAME_LENGTH]; // fourth comma-separated component
};
// Text written in place of an empty field (see opt_field). The C literal
// "\\N" is the two characters \N.
// NOTE(review): presumably the NULL marker of the COPY text format the
// output files target - confirm against the consuming database.
const char * sql_null_field = "\\N";
// Map an upper-case ASCII consonant to its soundex digit. Every other
// character (vowels, H, W, Y, ...) is returned unchanged.
static char soundex_code(const char c) {
    switch (c) {
    case 'B': case 'F': case 'P': case 'V':
        return '1';
    case 'C': case 'G': case 'J': case 'K':
    case 'Q': case 'S': case 'X': case 'Z':
        return '2';
    case 'D': case 'T':
        return '3';
    case 'L':
        return '4';
    case 'M': case 'N':
        return '5';
    case 'R':
        return '6';
    default:
        return c;
    }
}

// Compute the soundex code of `original` into `result`.
//
// original: NUL-terminated input; characters that are not letters are ignored.
// result:   output buffer, used as scratch space while the code is built. It
//           must hold at least strlen(original) + 1 bytes and never fewer
//           than 5 bytes.
//
// On return `result` is a letter followed by three digits (e.g. "R163"), or
// the empty string when `original` contains no letters.
void soundex(const char * original, char * result) {
    char c;
    int src, dst;

    // Keep letters only, upper-cased. <ctype.h> functions require a value
    // representable as unsigned char, so convert before calling them; a
    // plain (possibly negative) char is undefined behaviour there.
    for (src = 0, dst = 0; (c = original[src]) != '\0'; src++) {
        const unsigned char uc = (unsigned char) c;
        if (isalpha(uc)) {
            result[dst++] = (char) toupper(uc);
        }
    }
    result[dst] = '\0';
    if (result[0] == '\0') {
        // Must have at least one character
        return;
    }

    // 1. Save the first letter.
    const char first_letter = result[0];

    // 2. Remove all occurrences of 'h' and 'w' EXCEPT the first letter, and
    // 3. replace all consonants (including the first letter) with digits.
    result[0] = soundex_code(result[0]);
    for (src = 1, dst = 1; (c = result[src]) != '\0'; src++) {
        if (c != 'H' && c != 'W') {
            result[dst++] = soundex_code(c);
        }
    }
    result[dst] = '\0';

    // 4. Replace all adjacent same digits with one digit, and
    // 5. remove all occurrences of a, e, i, o, u, y EXCEPT the first letter.
    // A vowel still separates two equal digits: it resets last_digit even
    // though the vowel itself is dropped.
    char last_digit = '\0';
    for (src = 0, dst = 0; (c = result[src]) != '\0'; src++) {
        if (isdigit((unsigned char) c)) {
            if (c != last_digit) {
                last_digit = c;
                result[dst++] = c;
            }
        }
        else {
            const int is_vowel = (c == 'A' || c == 'E' || c == 'I' ||
                                  c == 'O' || c == 'U' || c == 'Y');
            last_digit = '\0';
            if (src == 0 || !is_vowel) {
                result[dst++] = c;
            }
        }
    }
    result[dst] = '\0';

    // 6. If first symbol is a digit replace it with letter saved on step 1.
    if (isdigit((unsigned char) result[0])) {
        result[0] = first_letter;
    }

    // 7. Append zeros or truncate so that the result is a letter followed by 3 digits.
    size_t len = strlen(result);
    while (len < 4) {
        result[len++] = '0';
    }
    result[4] = '\0';
}
// Copy the slice from[begin, end) into `to`, trimmed and NUL-terminated.
//
// from:  NUL-terminated source string.
// to:    destination; must have room for (end - begin) + 1 bytes.
// begin: index of the first character of the slice (inclusive, >= 0).
// end:   index one past the last character of the slice (exclusive).
//
// Leading whitespace is skipped, trailing whitespace and trailing
// backslashes are dropped, and every backslash inside the slice is removed.
// If `from` ends before `end`, the slice stops at the terminating NUL, so
// bytes that lie beyond the end of the string are never read.
void trim_field(const char * from, char * to, const int begin, const int end) {
    // Clamp the slice to the actual string length. memchr stops at the
    // first NUL it finds, so it does not touch memory past the terminator.
    int limit = end > 0 ? end : 0;
    if (limit > 0) {
        const char * nul = memchr(from, '\0', (size_t) limit);
        if (nul != NULL) {
            limit = (int) (nul - from);
        }
    }

    // Find index of first non-space char. The cast keeps the <ctype.h>
    // argument in the unsigned char range the functions require.
    int first = begin;
    while (first < limit && isspace((unsigned char) from[first])) {
        first++;
    }

    // Find index of last char that is neither space nor backslash
    int last = limit - 1;
    while (last > first &&
           (isspace((unsigned char) from[last]) || from[last] == '\\')) {
        last--;
    }

    // Copy while eliminating '\\' chars
    int dst = 0;
    for (; first <= last; first++) {
        if (from[first] != '\\') {
            to[dst++] = from[first];
        }
    }
    to[dst] = '\0';
}
// Store one comma-separated name component into `field`, a buffer of
// MAX_NAME_LENGTH bytes: trimmed via trim_field(), or empty when `part` is
// NULL. At most MAX_NAME_LENGTH - 1 characters of `part` are considered so
// the result always fits in the destination.
static void store_name_part(const char * part, char * field) {
    if (part == NULL) {
        field[0] = '\0';
        return;
    }
    size_t len = strlen(part);
    if (len > MAX_NAME_LENGTH - 1) {
        len = MAX_NAME_LENGTH - 1;
    }
    trim_field(part, field, 0, (int) len);
}

// Parse the raw name stored in line[start, end) into `name`.
//
// The raw text has the form "last,first,middle,suffix"; trailing components
// may be missing. Backslashes are removed, each component is trimmed, and
// soundex codes are computed for the first and last names. Components that
// are missing (and the soundex of an empty name) become empty strings.
//
// line:  NUL-terminated record text.
// start: index of the first character of the name (inclusive, >= 0).
// end:   index one past the last character of the name (exclusive).
// name:  output structure; every member is written.
void parse_name(const char * line, const int start, const int end, struct name * name) {
    char name_str[(MAX_NAME_LENGTH + 1) * 4]; // 4 name sub-parts plus delimiter

    // Stop at the terminating NUL when the record is shorter than `end`, so
    // no bytes beyond the end of the string are read.
    int limit = end > 0 ? end : 0;
    if (limit > 0) {
        const char * nul = memchr(line, '\0', (size_t) limit);
        if (nul != NULL) {
            limit = (int) (nul - line);
        }
    }

    // Eliminate \ chars, never writing past the end of the scratch buffer.
    size_t dst = 0;
    for (int src = start; src < limit && dst < sizeof name_str - 1; src++) {
        if (line[src] != '\\') {
            name_str[dst++] = line[src];
        }
    }
    name_str[dst] = '\0';

    // Split on commas; anything after the fourth component is ignored.
    char * parts[4] = { NULL, NULL, NULL, NULL };
    char * rest = name_str;
    char * token;
    for (int i = 0; i < 4 && (token = strsep(&rest, ",")) != NULL; i++) {
        parts[i] = token;
    }

    store_name_part(parts[0], name->last);
    store_name_part(parts[1], name->first);
    store_name_part(parts[2], name->middle);
    store_name_part(parts[3], name->suffix);

    if (name->first[0] != '\0') {
        soundex(name->first, name->first_soundex);
    }
    else {
        name->first_soundex[0] = '\0';
    }

    if (name->last[0] != '\0') {
        soundex(name->last, name->last_soundex);
    }
    else {
        name->last_soundex[0] = '\0';
    }
}
// Choose the text to emit for an optional column.
// Returns `str` itself when it contains at least one character, and
// sql_null_field when it is the empty string.
const char * opt_field(const char * str) {
    const int is_empty = (str[0] == '\0');
    return is_empty ? sql_null_field : str;
}
// Extract the trimmed case key, which occupies columns [2, 18) of a record
// line, into `key`. The slice is 16 characters wide, so `key` needs room
// for 17 bytes.
void parse_case_key(const char * line, char * key) {
    enum { CASE_KEY_BEGIN = 2, CASE_KEY_END = 18 };

    trim_field(line, key, CASE_KEY_BEGIN, CASE_KEY_END);
}
// Write one alias record as a tab-separated row to `afile`.
//
// Record layout used here: case key in columns [2, 18), alias line number
// in [18, 20), raw name in [20, 47). Empty name parts are written as
// sql_null_field. `line_no` is accepted but not used.
void parse_alias(FILE * afile, int line_no, const char * line) {
    (void) line_no;

    char case_key[17];
    char alias_line[3];
    struct name name;

    parse_case_key(line, case_key);
    trim_field(line, alias_line, 18, 20);
    parse_name(line, 20, 47, &name);

    const long alias_number = strtol(alias_line, NULL, 10);

    fprintf(afile, "%s\t%ld\t%s\t%s\t%s\t%s\t%s\t%s\n",
            case_key,
            alias_number,
            opt_field(name.last),
            opt_field(name.last_soundex),
            opt_field(name.first),
            opt_field(name.first_soundex),
            opt_field(name.middle),
            opt_field(name.suffix));
}
// Write one case record as a tab-separated row of 28 fields to `efile`.
//
// Each field is cut from a fixed column range of `line` and trimmed; every
// field except the case key is written as sql_null_field when empty.
// `line_no` is accepted but not used.
void parse_case(FILE * efile, int line_no, const char * line) {
    (void) line_no;

    // One buffer per fixed-width column, each one byte larger than the
    // slice it receives so the terminating NUL always fits.
    char case_key[17];
    char county[13];
    char addr_street1[21];
    char addr_street2[16];
    char city[16];
    char state[3];
    char zip5[6];
    char zip4[5];
    char race[2];
    char sex[2];
    char birth_year[5];
    char birth_month[3];
    char birth_day[3];
    char last_4_ssn[5];
    char driver_license[21];
    char driver_license_state[3];
    char citation_number[9];
    char added_to_acis_date[9];
    char extract_date[9];
    char personal_identifier[11];
    char case_disposed[2];
    char court_type[4];
    struct name name;

    // Cut the columns out of the record, in record order.
    parse_case_key(line, case_key);
    trim_field(line, county, 18, 30);
    parse_name(line, 30, 58, &name);
    trim_field(line, addr_street1, 58, 78);
    trim_field(line, addr_street2, 78, 93);
    trim_field(line, city, 93, 108);
    trim_field(line, state, 108, 110);
    trim_field(line, zip5, 110, 115);
    trim_field(line, zip4, 115, 119);
    trim_field(line, race, 119, 120);
    trim_field(line, sex, 120, 121);
    trim_field(line, birth_year, 121, 125);
    trim_field(line, birth_month, 125, 127);
    trim_field(line, birth_day, 127, 129);
    trim_field(line, last_4_ssn, 129, 133);
    trim_field(line, driver_license, 133, 153);
    trim_field(line, driver_license_state, 153, 155);
    trim_field(line, citation_number, 155, 163);
    trim_field(line, added_to_acis_date, 163, 171);
    trim_field(line, extract_date, 171, 179);
    trim_field(line, personal_identifier, 179, 189);
    trim_field(line, case_disposed, 189, 190);
    trim_field(line, court_type, 190, 193);

    // The adjacent literals concatenate into a single format string with
    // one %s per argument below.
    fprintf(efile,
            "%s\t%s\t"                  /* case key, county */
            "%s\t%s\t%s\t%s\t%s\t%s\t"  /* name parts */
            "%s\t%s\t%s\t%s\t%s\t%s\t"  /* address */
            "%s\t%s\t%s\t%s\t%s\t%s\t"  /* race, sex, birth date, ssn */
            "%s\t%s\t%s\t"              /* license, citation */
            "%s\t%s\t%s\t%s\t%s\n",     /* dates, identifier, disposition, court */
            case_key,
            opt_field(county),
            opt_field(name.last),
            opt_field(name.last_soundex),
            opt_field(name.first),
            opt_field(name.first_soundex),
            opt_field(name.middle),
            opt_field(name.suffix),
            opt_field(addr_street1),
            opt_field(addr_street2),
            opt_field(city),
            opt_field(state),
            opt_field(zip5),
            opt_field(zip4),
            opt_field(race),
            opt_field(sex),
            opt_field(birth_year),
            opt_field(birth_month),
            opt_field(birth_day),
            opt_field(last_4_ssn),
            opt_field(driver_license),
            opt_field(driver_license_state),
            opt_field(citation_number),
            opt_field(added_to_acis_date),
            opt_field(extract_date),
            opt_field(personal_identifier),
            opt_field(case_disposed),
            opt_field(court_type));
}
// Handler for the "99" record type. It intentionally does nothing: both
// arguments are accepted and ignored.
void parse_eof(int line_no, const char * line) {
    (void) line_no;
    (void) line;
}
// Read every record line from `ifile` and dispatch it on its two-character
// record type:
//   "00"  ignored
//   "01"  case record   -> parse_case(), written to `efile`
//   "02"  alias record  -> parse_alias(), written to `afile`
//   "99"  end-of-file   -> parse_eof()
// Any other prefix prints "Unknown record type!" and stops processing
// immediately (the output terminators and the summary are then not written).
//
// Each output file starts with a "copy ... from stdin;" header and ends
// with a "\." terminator line. A summary of the line, case and alias counts
// is printed to stdout. The streams are not closed here.
void parse_records(FILE * ifile, FILE * efile, FILE * afile) {
    if (ifile == NULL || efile == NULL || afile == NULL) {
        // A failed fopen() upstream would otherwise crash in fgets/fprintf.
        fprintf(stderr, "parse_records: missing input or output stream\n");
        return;
    }

    int line_no = 0;
    int num_cases = 0;
    int num_aliases = 0;

    // The column list must name exactly the 28 fields parse_case() writes
    // per row, in the same order. No middle-name soundex is ever produced,
    // so that column is not listed.
    fputs("copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix,addr_street1,addr_street2,addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,extract_date,personal_identifier,case_disposed,court_type) from stdin;\n", efile);
    fputs("copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix) from stdin;\n", afile);

    char buffer[MAX_LINE_LENGTH + 1];
    while (fgets(buffer, sizeof buffer, ifile) != NULL) {
        line_no += 1;
        if (strncmp("00", buffer, 2) == 0) {
            // Record type 00 carries nothing that is exported.
        }
        else if (strncmp("01", buffer, 2) == 0) {
            parse_case(efile, line_no, buffer);
            num_cases += 1;
        }
        else if (strncmp("02", buffer, 2) == 0) {
            parse_alias(afile, line_no, buffer);
            num_aliases += 1;
        }
        else if (strncmp("99", buffer, 2) == 0) {
            parse_eof(line_no, buffer);
        }
        else {
            printf("Unknown record type!");
            return;
        }
    }

    fputs("\\.\n", efile);
    fputs("\\.\n", afile);

    printf("--------------\n");
    printf("num lines %d\n", line_no - 1);
    printf("num cases %d\n", num_cases);
    printf("num aliases %d\n", num_aliases);
}
// Open the input file and both output files, run parse_records() over
// them, and close all three streams.
//
// ipath: path of the record file to read.
// epath: path of the case ("extracts") output file; created or truncated.
// apath: path of the alias output file; created or truncated.
//
// If any file cannot be opened, the reason is reported on stderr, the
// streams already opened are closed, and nothing is parsed.
void parse_file(char * ipath, char * epath, char * apath) {
    FILE * ifile = fopen(ipath, "r");
    if (ifile == NULL) {
        perror(ipath);
        return;
    }

    FILE * efile = fopen(epath, "w");
    if (efile == NULL) {
        perror(epath); // report before fclose() can overwrite errno
        fclose(ifile);
        return;
    }

    FILE * afile = fopen(apath, "w");
    if (afile == NULL) {
        perror(apath);
        fclose(efile);
        fclose(ifile);
        return;
    }

    parse_records(ifile, efile, afile);

    // fclose() flushes buffered output, so a failure here means data loss.
    if (fclose(afile) != 0) {
        perror(apath);
    }
    if (fclose(efile) != 0) {
        perror(epath);
    }
    fclose(ifile);
}
// Express a timeval as seconds, microseconds included as the fraction.
static double timeval_seconds(const struct timeval * tv) {
    return ((double) tv->tv_sec) + ((double) tv->tv_usec) / 1000000.0;
}

// Print the time elapsed from *t1 to *t2, in seconds, as "Time: <seconds>"
// on stdout.
void print_elapsed(const struct timeval * t1, const struct timeval * t2) {
    const double began = timeval_seconds(t1);
    const double ended = timeval_seconds(t2);

    printf("Time: %f\n", ended - began);
}
// Entry point: convert the fixed input file into the two output files and
// report how long the conversion took.
int main(void)
{
    struct timeval started;
    struct timeval finished;

    gettimeofday(&started, NULL);
    parse_file("../cr-200", "../e2-file", "../a2-file");
    gettimeofday(&finished, NULL);

    print_elapsed(&started, &finished);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment