-
-
Save lojic/4369d9d57eb775296c92 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <ctype.h> | |
#include <sys/time.h> | |
#include <assert.h> | |
// Upper bound used when sizing the input line buffer.
#define MAX_LINE_LENGTH 511
// Capacity in bytes, terminator included, of each member of struct name.
#define MAX_NAME_LENGTH 30

// One personal name broken into its parts, together with the codes that
// soundex() produces for the first and last names. Every member holds a
// NUL-terminated string; an empty string means "not present".
struct name {
    char first[MAX_NAME_LENGTH];
    char first_soundex[MAX_NAME_LENGTH];
    char middle[MAX_NAME_LENGTH];
    char last[MAX_NAME_LENGTH];
    char last_soundex[MAX_NAME_LENGTH];
    char suffix[MAX_NAME_LENGTH];
};
// Text emitted in place of an empty field: a backslash followed by 'N'.
// NOTE(review): this looks like the NULL marker of the `copy ... from stdin`
// text format the program writes -- confirm against the target database.
const char *sql_null_field = "\\N";
// Compute the Soundex code of `original` and store it in `result`.
//
// Non-letters are ignored and case does not matter. When `original` contains
// at least one letter the code is exactly four characters: the first letter
// followed by three digits (zero padded). When it contains no letter at all,
// `result` is set to the empty string.
//
// `result` is used as scratch space, so it must have room for
// strlen(original) + 1 bytes and never less than 5 bytes.
void soundex(const char * original, char * result) {
    char c;
    int src, dst;

    // Keep letters only, upper-cased. The <ctype.h> functions are only
    // defined for values representable as unsigned char (or EOF), so every
    // byte is converted before it is classified.
    for (src = 0, dst = 0; (c = original[src]) != '\0'; src++) {
        const unsigned char uc = (unsigned char) c;
        if (isalpha(uc)) {
            result[dst++] = (char) toupper(uc);
        }
    }
    result[dst] = '\0';

    if (result[0] == '\0') {
        // Must have at least one character
        return;
    }

    // 1. Save the first letter.
    const char first_letter = result[0];

    // 2. Remove all occurrences of 'h' and 'w' EXCEPT first letter.
    for (src = 1, dst = 1; (c = result[src]) != '\0'; src++) {
        if (c != 'H' && c != 'W') {
            result[dst++] = c;
        }
    }
    result[dst] = '\0';

    // 3. Replace all consonants (including the first letter) with digits:
    //      b, f, p, v             -> 1
    //      c, g, j, k, q, s, x, z -> 2
    //      d, t                   -> 3
    //      l                      -> 4
    //      m, n                   -> 5
    //      r                      -> 6
    for (src = 0; (c = result[src]) != '\0'; src++) {
        switch (c) {
        case 'B': case 'F': case 'P': case 'V':
            result[src] = '1';
            break;
        case 'C': case 'G': case 'J': case 'K':
        case 'Q': case 'S': case 'X': case 'Z':
            result[src] = '2';
            break;
        case 'D': case 'T':
            result[src] = '3';
            break;
        case 'L':
            result[src] = '4';
            break;
        case 'M': case 'N':
            result[src] = '5';
            break;
        case 'R':
            result[src] = '6';
            break;
        default:
            // Vowels, Y and a leading H/W are left as letters for now.
            break;
        }
    }

    // 4. Replace all adjacent same digits with one digit. A letter between
    //    two equal digits resets the run, so both digits survive.
    char last_digit = '\0';
    for (src = 0, dst = 0; (c = result[src]) != '\0'; src++) {
        if (isdigit((unsigned char) c)) {
            if (c != last_digit) {
                last_digit = c;
                result[dst++] = c;
            }
        }
        else {
            last_digit = '\0';
            result[dst++] = c;
        }
    }
    result[dst] = '\0';

    // 5. Remove all occurrences of a, e, i, o, u, y EXCEPT first letter.
    for (src = 1, dst = 1; (c = result[src]) != '\0'; src++) {
        if (c != 'A' && c != 'E' && c != 'I' && c != 'O' && c != 'U' && c != 'Y') {
            result[dst++] = c;
        }
    }
    result[dst] = '\0';

    // 6. If first symbol is a digit replace it with letter saved on step 1.
    if (isdigit((unsigned char) result[0])) {
        result[0] = first_letter;
    }

    // 7. Append zeros or truncate so that the result is a letter followed by 3 digits.
    const int len = (int) strlen(result);
    if (len > 4) {
        result[4] = '\0';
    }
    else if (len < 4) {
        for (src = len; src < 4; src++) {
            result[src] = '0';
        }
        result[src] = '\0';
    }
}
// Copy the fixed-width field from[begin, end) into `to`, trimmed.
//
// begin is inclusive, end is exclusive. Leading whitespace is dropped,
// trailing whitespace and trailing backslashes are dropped, and every
// backslash inside the field is removed. `to` always ends up NUL-terminated
// and needs room for at most (end - begin) + 1 bytes.
//
// If the string ends inside the field, the field is cut at the terminator so
// no byte after it is read or copied. A `begin` that already lies beyond the
// terminator cannot be detected here; callers must not pass one.
void trim_field(const char * from, char * to, const int begin, const int end) {
    // Clamp the field to the part of the string that really exists
    int stop = begin;
    while (stop < end && from[stop] != '\0') {
        stop++;
    }

    // Find index of first non-space char. <ctype.h> needs unsigned char values.
    int i = begin;
    while (i < stop && isspace((unsigned char) from[i])) {
        i++;
    }

    // Find index of last char that is neither space nor backslash
    int j = stop - 1;
    while (j > i && (isspace((unsigned char) from[j]) || from[j] == '\\')) {
        j--;
    }

    // Copy while eliminating '\\' chars
    int dst = 0;
    for (; i <= j; i++) {
        if (from[i] != '\\') {
            to[dst++] = from[i];
        }
    }
    to[dst] = '\0';
}
void parse_name(const char * line, const int start, const int end, struct name * name) { | |
char * token; | |
char * string; | |
char name_str[(MAX_NAME_LENGTH + 1) * 4]; // 4 name sub-parts plus delimiter | |
// Eliminate \ chars | |
int src, dst; | |
for (src = start, dst = 0; src < end; src++) { | |
if (line[src] != '\\') { | |
name_str[dst++] = line[src]; | |
} | |
} | |
name_str[dst] = '\0'; | |
string = name_str; | |
assert(string != NULL); | |
int i = 0; | |
char *argv[4] = { 0, 0, 0, 0 }; | |
while (i < 4 && (token = strsep(&string, ",")) != NULL) { | |
argv[i++] = token; | |
} | |
if (argv[0]) { | |
trim_field(argv[0], name->last, 0, strlen(argv[0])); | |
} | |
else { | |
(name->last)[0] = '\0'; | |
} | |
if (argv[1]) { | |
trim_field(argv[1], name->first, 0, strlen(argv[1])); | |
} | |
else { | |
(name->first)[0] = '\0'; | |
} | |
if (argv[2]) { | |
trim_field(argv[2], name->middle, 0, strlen(argv[2])); | |
} | |
else { | |
(name->middle)[0] = '\0'; | |
} | |
if (argv[3]) { | |
trim_field(argv[3], name->suffix, 0, strlen(argv[3])); | |
} | |
else { | |
(name->suffix)[0] = '\0'; | |
} | |
if (strlen(name->first) > 0) { | |
soundex(name->first, name->first_soundex); | |
} | |
else { | |
(name->first_soundex)[0] = '\0'; | |
} | |
if (strlen(name->last) > 0) { | |
soundex(name->last, name->last_soundex); | |
} | |
else { | |
(name->last_soundex)[0] = '\0'; | |
} | |
return; | |
} | |
const char * opt_field(const char * str) { | |
if (strlen(str) > 0) { | |
return str; | |
} | |
else { | |
return sql_null_field; | |
} | |
} | |
// Extract the case key, which occupies columns [2, 18) of a record line,
// into `key` (trimmed by trim_field). `key` needs room for 17 bytes.
void parse_case_key(const char * line, char * key) {
    enum { KEY_BEGIN = 2, KEY_END = 18 };
    trim_field(line, key, KEY_BEGIN, KEY_END);
}
void parse_alias(FILE * afile, int line_no, const char * line) { | |
struct name name; | |
parse_name(line, 20, 47, &name); | |
char case_key[17]; | |
parse_case_key(line, case_key); | |
char alias_line[3]; | |
trim_field(line, alias_line, 18, 20); | |
fprintf(afile, "%s\t%ld\t%s\t%s\t%s\t%s\t%s\t%s\n", | |
case_key, | |
strtol(alias_line, NULL, 10), | |
opt_field(name.last), | |
opt_field(name.last_soundex), | |
opt_field(name.first), | |
opt_field(name.first_soundex), | |
opt_field(name.middle), | |
opt_field(name.suffix)); | |
} | |
void parse_case(FILE * efile, int line_no, const char * line) { | |
struct name name; | |
parse_name(line, 30, 58, &name); | |
char case_key [17]; parse_case_key(line, case_key); | |
char county [13]; trim_field(line, county, 18, 30); | |
char addr_street1 [21]; trim_field(line, addr_street1, 58,78); | |
char addr_street2 [16]; trim_field(line, addr_street2, 78, 93); | |
char city [16]; trim_field(line, city, 93, 108); | |
char state [3]; trim_field(line, state, 108, 110); | |
char zip5 [6]; trim_field(line, zip5, 110, 115); | |
char zip4 [5]; trim_field(line, zip4, 115, 119); | |
char race [2]; trim_field(line, race, 119, 120); | |
char sex [2]; trim_field(line, sex, 120, 121); | |
char birth_year [5]; trim_field(line, birth_year, 121, 125); | |
char birth_month [3]; trim_field(line, birth_month, 125, 127); | |
char birth_day [3]; trim_field(line, birth_day, 127, 129); | |
char last_4_ssn [5]; trim_field(line, last_4_ssn, 129, 133); | |
char driver_license [21]; trim_field(line, driver_license, 133, 153); | |
char driver_license_state [3]; trim_field(line, driver_license_state, 153, 155); | |
char citation_number [9]; trim_field(line, citation_number, 155, 163); | |
char added_to_acis_date [9]; trim_field(line, added_to_acis_date, 163, 171); | |
char extract_date [9]; trim_field(line, extract_date, 171, 179); | |
char personal_identifier [11]; trim_field(line, personal_identifier, 179, 189); | |
char case_disposed [2]; trim_field(line, case_disposed, 189, 190); | |
char court_type [4]; trim_field(line, court_type, 190, 193); | |
fprintf(efile, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", | |
case_key, | |
opt_field(county), | |
opt_field(name.last), | |
opt_field(name.last_soundex), | |
opt_field(name.first), | |
opt_field(name.first_soundex), | |
opt_field(name.middle), | |
opt_field(name.suffix), | |
opt_field(addr_street1), | |
opt_field(addr_street2), | |
opt_field(city), | |
opt_field(state), | |
opt_field(zip5), | |
opt_field(zip4), | |
opt_field(race), | |
opt_field(sex), | |
opt_field(birth_year), | |
opt_field(birth_month), | |
opt_field(birth_day), | |
opt_field(last_4_ssn), | |
opt_field(driver_license), | |
opt_field(driver_license_state), | |
opt_field(citation_number), | |
opt_field(added_to_acis_date), | |
opt_field(extract_date), | |
opt_field(personal_identifier), | |
opt_field(case_disposed), | |
opt_field(court_type)); | |
} | |
// Handler for "99" records. It currently does nothing; both parameters are
// accepted only so the call matches the other record handlers.
void parse_eof(int line_no, const char * line) {
    (void) line_no;
    (void) line;
}
void parse_records(FILE * ifile, FILE * efile, FILE * afile) { | |
int line_no = 0; | |
int num_cases = 0; | |
int num_aliases = 0; | |
fprintf(efile, "copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,extract_date,personal_identifier,case_disposed,court_type) from stdin;\n"); | |
fprintf(afile, "copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix) from stdin;\n"); | |
char buffer[MAX_LINE_LENGTH + 1]; | |
char * val = fgets(buffer, MAX_LINE_LENGTH, ifile); | |
while (val) { | |
line_no += 1; | |
if (strncmp("00", buffer, 2) == 0) { | |
} | |
else if (strncmp("01", buffer, 2) == 0) { | |
parse_case(efile, line_no, buffer); | |
num_cases += 1; | |
} | |
else if (strncmp("02", buffer, 2) == 0) { | |
parse_alias(afile, line_no, buffer); | |
num_aliases += 1; | |
} | |
else if (strncmp("99", buffer, 2) == 0) { | |
parse_eof(line_no, buffer); | |
} | |
else { | |
printf("Unknown record type!"); | |
return; | |
} | |
val = fgets(buffer, MAX_LINE_LENGTH, ifile); | |
} | |
fprintf(efile, "\\.\n"); | |
fprintf(afile, "\\.\n"); | |
printf("--------------\n"); | |
printf("num lines %d\n", line_no - 1); | |
printf("num cases %d\n", num_cases); | |
printf("num aliases %d\n", num_aliases); | |
} | |
// Open `ipath` for reading and `epath` / `apath` for writing, run
// parse_records() over them, and close all three streams.
//
// If a file cannot be opened, or an output file cannot be closed cleanly,
// the reason is reported on stderr with perror(). The input is opened first
// so existing output files are not truncated when the input is missing.
void parse_file(char * ipath, char * epath, char * apath) {
    FILE * ifile = fopen(ipath, "r");
    if (ifile == NULL) {
        perror(ipath);
        return;
    }

    FILE * efile = fopen(epath, "w");
    if (efile == NULL) {
        perror(epath);
        fclose(ifile);
        return;
    }

    FILE * afile = fopen(apath, "w");
    if (afile == NULL) {
        perror(apath);
        fclose(efile);
        fclose(ifile);
        return;
    }

    parse_records(ifile, efile, afile);

    // fclose flushes buffered output, so a failure here means lost data
    if (fclose(afile) != 0) {
        perror(apath);
    }
    if (fclose(efile) != 0) {
        perror(epath);
    }
    fclose(ifile);
}
// Convert a timeval to seconds as a double (whole seconds plus microseconds).
static double timeval_to_seconds(const struct timeval * tv) {
    return ((double) tv->tv_sec) + ((double) tv->tv_usec) / 1000000.0;
}

// Print the time from `t1` to `t2`, in seconds, as "Time: <seconds>".
void print_elapsed(const struct timeval * t1, const struct timeval * t2) {
    const double began = timeval_to_seconds(t1);
    const double ended = timeval_to_seconds(t2);
    printf("Time: %f\n", ended - began);
}
// Convert ../cr-200 into ../e2-file and ../a2-file and report how long the
// conversion took (wall-clock time from gettimeofday).
int main(void)
{
    struct timeval started;
    struct timeval finished;

    gettimeofday(&started, NULL);
    parse_file("../cr-200", "../e2-file", "../a2-file");
    gettimeofday(&finished, NULL);

    print_elapsed(&started, &finished);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment