Last active
November 4, 2015 19:04
-
-
Save mattvchandler/2c438bdd3c0cf9bfc425 to your computer and use it in GitHub Desktop.
Title case library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// title_case.c | |
// Attempt at proper title casing algorithm | |
// The basic method is to 1st load a dictionary of exceptions | |
// when given a string, we will look up each word in the dictionary. | |
// If found, we will use the capitalization from the dictionary file | |
// otherwise, all words start with a capitals, and the rest are lowercase | |
// the 1st letter of the 1st and last words in the string are always capitalized | |
// only works for ascii text | |
// Copyright 2015 Matthew Chandler | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
// SOFTWARE. | |
#include "title_case.h" | |
#include <errno.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#ifndef TITLE_CASE_EXCEPTION_FILE | |
#define TITLE_CASE_EXCEPTION_FILE "title_case_exceptions" | |
#endif | |
struct _Except_node | |
{ | |
char value; | |
Except_node * lo; | |
Except_node * hi; | |
Except_node * eq; | |
char * capitalized; | |
}; | |
// return a new empty node - does not load the dictionary | |
Except_node * new_except_node() | |
{ | |
Except_node * node = (Except_node *)malloc(sizeof(Except_node)); | |
memset(node, 0, sizeof(Except_node)); | |
return node; | |
} | |
// build an exception dictionary from the above list | |
Except_node * build_except_dict() | |
{ | |
// open the file | |
FILE * exception_file = fopen(TITLE_CASE_EXCEPTION_FILE, "r"); | |
if(!exception_file) | |
{ | |
fprintf(stderr, "[WARNING]: could not open title case exception file: " TITLE_CASE_EXCEPTION_FILE ": %s\n", | |
strerror(errno)); | |
errno = 0; | |
return NULL; | |
} | |
// allocate storage for word list | |
int words_capacity = 64; | |
char ** words = (char **)malloc(sizeof(char **) * words_capacity); | |
enum {BUFFER_SIZE = 1024}; | |
char buffer[BUFFER_SIZE] = {0}; | |
// read each line | |
int num_words = 0; | |
while(fgets(buffer, BUFFER_SIZE, exception_file)) | |
{ | |
// skip commnets | |
if(*buffer == '#') | |
continue; | |
// strip newlines | |
while(buffer[strlen(buffer) - 1] == '\n' || buffer[strlen(buffer) - 1] == '\r') | |
buffer[strlen(buffer) - 1] = '\0'; | |
// skip empty lines | |
if(!*buffer) | |
continue; | |
// allocate space as needed | |
if(num_words == words_capacity) | |
{ | |
words_capacity *= 2; | |
words = (char **)realloc(words, sizeof(char **) * words_capacity); | |
} | |
// allocate and copy string to list | |
words[num_words] = (char *)malloc(sizeof(char *) * (strlen(buffer) + 1)); | |
strcpy(words[num_words], buffer); | |
++num_words; | |
} | |
// sorted input results in worst-case performance for this algorithm, so randomly shuffle word list (fisher-yates) | |
int i; | |
for(i = num_words - 1; i >= 1; --i) | |
{ | |
int j = rand() % (i + 1); | |
char * tmp = words[i]; | |
words[i] = words[j]; | |
words[j] = tmp; | |
} | |
// start a new exception dictionary | |
Except_node * root = new_except_node(); | |
// add each word, and then free its storage | |
for(i = 0; i < num_words; ++i) | |
{ | |
add_exception(root, words[i]); | |
free(words[i]); | |
} | |
free(words); | |
fclose(exception_file); | |
return root; | |
} | |
// recursive free of dictionary | |
void free_except_node(Except_node * root) | |
{ | |
if(!root) | |
return; | |
free_except_node(root->lo); | |
free_except_node(root->hi); | |
free_except_node(root->eq); | |
free(root->capitalized); | |
free(root); | |
} | |
// add a new word to the exception dictionary | |
void add_exception(Except_node * root, const char * exception_word) | |
{ | |
const char * next_char = exception_word; | |
Except_node * node = root; | |
// lowercase copy of char for lookup | |
char char_val; | |
if(*next_char >= 'A' && *next_char <= 'Z') | |
char_val = *next_char - ('A' - 'a'); | |
else | |
char_val = *next_char; | |
while(1) | |
{ | |
// current node is empty, so store our char here and move to next char | |
if(!node->value) | |
{ | |
node->value = char_val; | |
// done? | |
if(!*(++next_char)) | |
break; | |
// lowercase copy of char for lookup | |
if(*next_char >= 'A' && *next_char <= 'Z') | |
char_val = *next_char - ('A' - 'a'); | |
else | |
char_val = *next_char; | |
if(!node->eq) | |
node->eq = new_except_node(); | |
node = node->eq; | |
} | |
else | |
{ | |
// determine which child node to move to | |
if(char_val < node->value) | |
{ | |
if(!node->lo) | |
node->lo = new_except_node(); | |
node = node->lo; | |
} | |
else if(char_val > node->value) | |
{ | |
if(!node->hi) | |
node->hi = new_except_node(); | |
node = node->hi; | |
} | |
else | |
{ | |
// matches this node. move to next char and eq child node | |
// done? | |
if(!*(++next_char)) | |
break; | |
// lowercase copy of char for lookup | |
if(*next_char >= 'A' && *next_char <= 'Z') | |
char_val = *next_char - ('A' - 'a'); | |
else | |
char_val = *next_char; | |
if(!node->eq) | |
node->eq = new_except_node(); | |
node = node->eq; | |
} | |
} | |
} | |
// at end of word. store proper capitalization at this node | |
if(!node->capitalized) | |
{ | |
if(node->capitalized) | |
free(node->capitalized); | |
node->capitalized = (char *)malloc(sizeof(char *) * (strlen(exception_word) + 1)); | |
strcpy(node->capitalized, exception_word); | |
} | |
} | |
// look up a capitalization exception. | |
// key should be all lowercase | |
// return pointer is NULL if not found, otherwise points into exception_list | |
static const char * get_exception(const Except_node * root, const char * key) | |
{ | |
const char * next_char = key; | |
const Except_node * node = root; | |
// scan tree for key | |
while(1) | |
{ | |
// dead-end. key not in tree | |
if(!node) | |
return NULL; | |
// compare next char to node and move to corresponding child | |
if(*next_char < node->value) | |
{ | |
node = node->lo; | |
} | |
else if(*next_char > node->value) | |
{ | |
node = node->hi; | |
} | |
else | |
{ | |
// done? | |
if(!*(++next_char)) | |
break; | |
node = node->eq; | |
} | |
} | |
// return correct capitalization from node | |
return node->capitalized; | |
} | |
// convert a string to title_case (in place) | |
void title_case_inplace(const Except_node * except, char * str) | |
{ | |
char * word_start = str; | |
char * word_end = str; | |
int word_num = 0; | |
while(1) | |
{ | |
// find start of a word (A-Z or a-z) | |
while(!(*word_start >= 'a' && *word_start <= 'z') && | |
!(*word_start >= 'A' && *word_start <= 'Z')) | |
{ | |
// done? | |
if(!*word_start) | |
break; | |
// reset word_num so we capitalize after period or colon | |
if(*word_start == '.' || *word_start == ':') | |
word_num = 0; | |
++word_start; | |
} | |
word_end = word_start; | |
// done? | |
if(!*word_end) | |
break; | |
// find the end of the word +1 (A-Z, a-z, and ') | |
while(*word_end == '\'' || | |
(*word_end >= 'a' && *word_end <= 'z') || | |
(*word_end >= 'A' && *word_end <= 'Z')) | |
{ | |
// convert to lowercase as we go | |
if(*word_end >= 'A' && *word_end <= 'Z') | |
*word_end -= 'A' - 'a'; | |
++word_end; | |
} | |
// temporarily null-terminate word, and look it up in the exception dictionary | |
char save_char = *word_end; | |
*word_end = '\0'; | |
const char * lookup = get_exception(except, word_start); | |
*word_end = save_char; | |
// if an exception was found, use that capitalization | |
if(lookup) | |
{ | |
// copy word from dictionary back into str | |
strncpy(word_start, lookup, strlen(lookup)); | |
// always capitalize 1st letter of 1st and last word | |
if((word_num == 0 || !*word_end) && *word_start >= 'a' && *word_start <= 'z') | |
*word_start += 'A' - 'a'; | |
} | |
else | |
// otherwise just capitalize the 1st letter | |
*word_start += 'A' - 'a'; | |
++word_num; | |
if(!*word_end) | |
break; | |
word_start = word_end; | |
} | |
} | |
// convert a string to title_case | |
void title_case(const Except_node * except, const char * str, char * out) | |
{ | |
strcpy(out, str); | |
title_case_inplace(except, out); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// title_case.h | |
// Attempt at proper title casing algorithm | |
// The basic method is to 1st load a dictionary of exceptions | |
// when given a string, we will look up each word in the dictionary. | |
// If found, we will use the capitalization from the dictionary file | |
// otherwise, all words start with a capitals, and the rest are lowercase | |
// the 1st letter of the 1st and last words in the string are always capitalized | |
// only works for ascii text | |
// Copyright 2015 Matthew Chandler | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
// SOFTWARE. | |
#ifndef TITLE_CASE_H | |
#define TITLE_CASE_H | |
#ifdef __cplusplus | |
extern "C" | |
{ | |
#endif | |
// ternary search tree node type for exception dictionary | |
typedef struct _Except_node Except_node; | |
// return a new empty node - does not load the dictionary | |
Except_node * new_except_node(); | |
// build an exception dictionary | |
Except_node * build_except_dict(); | |
// recursive free of dictionary | |
void free_except_node(Except_node * root); | |
// add a new word to the exception dictionary | |
void add_exception(Except_node * root, const char * exception_word); | |
// convert a string to title_case (in place) | |
void title_case_inplace(const Except_node * except, char * str); | |
// convert a string to title_case | |
// buffer pointed to by out should be at least large enough to hold str | |
void title_case(const Except_node * except, const char * str, char * out); | |
#ifdef __cplusplus | |
} | |
#endif | |
#endif // TITLE_CASE_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// title_case_example.c | |
// simple usage example for title_case library | |
// Copyright 2015 Matthew Chandler | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
// SOFTWARE. | |
#include <stdio.h> | |
#include <string.h> | |
#include "title_case.h" | |
int main() | |
{ | |
Except_node * except = build_except_dict(); | |
const char * in = "test the title case algorithm on: a string"; | |
char out[1204]; | |
title_case(except, in, out); | |
printf("%s -> %s\n", in, out); | |
free_except_node(except); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# words that should not be capitalized in title-case | |
a | |
an | |
and | |
as | |
at | |
but | |
by | |
for | |
in | |
is | |
it | |
nor | |
or | |
the | |
to | |
up | |
# roman numerals 2-9 (no need to include single char numerals) | |
II | |
III | |
IV | |
VI | |
VII | |
VIII | |
IX | |
# general acronyms | |
AB | |
AC | |
ACT | |
AED | |
AIDS | |
ASL | |
ATC | |
BA | |
BS | |
CAD | |
CNC | |
CPR | |
CSI | |
CSS | |
DC | |
DNA | |
EKG | |
EMT | |
ER | |
GIS | |
GPS | |
GYN | |
HIV | |
HTML | |
HVAC | |
IP | |
LAN | |
LDS | |
MA | |
MIDI | |
MS | |
NE | |
NW | |
OB | |
PC | |
PE | |
RN | |
SAT | |
SCUBA | |
SE | |
SW | |
TCP | |
TV | |
UNIX | |
USA | |
USAF | |
WAN | |
XHTML | |
XML | |
# other |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment