Skip to content

Instantly share code, notes, and snippets.

@mattvchandler
Last active November 4, 2015 19:04
Show Gist options
  • Save mattvchandler/2c438bdd3c0cf9bfc425 to your computer and use it in GitHub Desktop.
Save mattvchandler/2c438bdd3c0cf9bfc425 to your computer and use it in GitHub Desktop.
Title case library
// title_case.c
// Attempt at proper title casing algorithm
// The basic method is to 1st load a dictionary of exceptions
// when given a string, we will look up each word in the dictionary.
// If found, we will use the capitalization from the dictionary file
// otherwise, all words start with a capitals, and the rest are lowercase
// the 1st letter of the 1st and last words in the string are always capitalized
// only works for ascii text
// Copyright 2015 Matthew Chandler
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "title_case.h"
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef TITLE_CASE_EXCEPTION_FILE
#define TITLE_CASE_EXCEPTION_FILE "title_case_exceptions"
#endif
struct _Except_node
{
char value;
Except_node * lo;
Except_node * hi;
Except_node * eq;
char * capitalized;
};
// return a new empty node - does not load the dictionary
Except_node * new_except_node()
{
Except_node * node = (Except_node *)malloc(sizeof(Except_node));
memset(node, 0, sizeof(Except_node));
return node;
}
// build an exception dictionary from the above list
Except_node * build_except_dict()
{
// open the file
FILE * exception_file = fopen(TITLE_CASE_EXCEPTION_FILE, "r");
if(!exception_file)
{
fprintf(stderr, "[WARNING]: could not open title case exception file: " TITLE_CASE_EXCEPTION_FILE ": %s\n",
strerror(errno));
errno = 0;
return NULL;
}
// allocate storage for word list
int words_capacity = 64;
char ** words = (char **)malloc(sizeof(char **) * words_capacity);
enum {BUFFER_SIZE = 1024};
char buffer[BUFFER_SIZE] = {0};
// read each line
int num_words = 0;
while(fgets(buffer, BUFFER_SIZE, exception_file))
{
// skip commnets
if(*buffer == '#')
continue;
// strip newlines
while(buffer[strlen(buffer) - 1] == '\n' || buffer[strlen(buffer) - 1] == '\r')
buffer[strlen(buffer) - 1] = '\0';
// skip empty lines
if(!*buffer)
continue;
// allocate space as needed
if(num_words == words_capacity)
{
words_capacity *= 2;
words = (char **)realloc(words, sizeof(char **) * words_capacity);
}
// allocate and copy string to list
words[num_words] = (char *)malloc(sizeof(char *) * (strlen(buffer) + 1));
strcpy(words[num_words], buffer);
++num_words;
}
// sorted input results in worst-case performance for this algorithm, so randomly shuffle word list (fisher-yates)
int i;
for(i = num_words - 1; i >= 1; --i)
{
int j = rand() % (i + 1);
char * tmp = words[i];
words[i] = words[j];
words[j] = tmp;
}
// start a new exception dictionary
Except_node * root = new_except_node();
// add each word, and then free its storage
for(i = 0; i < num_words; ++i)
{
add_exception(root, words[i]);
free(words[i]);
}
free(words);
fclose(exception_file);
return root;
}
// recursive free of dictionary
void free_except_node(Except_node * root)
{
if(!root)
return;
free_except_node(root->lo);
free_except_node(root->hi);
free_except_node(root->eq);
free(root->capitalized);
free(root);
}
// add a new word to the exception dictionary
void add_exception(Except_node * root, const char * exception_word)
{
const char * next_char = exception_word;
Except_node * node = root;
// lowercase copy of char for lookup
char char_val;
if(*next_char >= 'A' && *next_char <= 'Z')
char_val = *next_char - ('A' - 'a');
else
char_val = *next_char;
while(1)
{
// current node is empty, so store our char here and move to next char
if(!node->value)
{
node->value = char_val;
// done?
if(!*(++next_char))
break;
// lowercase copy of char for lookup
if(*next_char >= 'A' && *next_char <= 'Z')
char_val = *next_char - ('A' - 'a');
else
char_val = *next_char;
if(!node->eq)
node->eq = new_except_node();
node = node->eq;
}
else
{
// determine which child node to move to
if(char_val < node->value)
{
if(!node->lo)
node->lo = new_except_node();
node = node->lo;
}
else if(char_val > node->value)
{
if(!node->hi)
node->hi = new_except_node();
node = node->hi;
}
else
{
// matches this node. move to next char and eq child node
// done?
if(!*(++next_char))
break;
// lowercase copy of char for lookup
if(*next_char >= 'A' && *next_char <= 'Z')
char_val = *next_char - ('A' - 'a');
else
char_val = *next_char;
if(!node->eq)
node->eq = new_except_node();
node = node->eq;
}
}
}
// at end of word. store proper capitalization at this node
if(!node->capitalized)
{
if(node->capitalized)
free(node->capitalized);
node->capitalized = (char *)malloc(sizeof(char *) * (strlen(exception_word) + 1));
strcpy(node->capitalized, exception_word);
}
}
// look up a capitalization exception.
// key should be all lowercase
// return pointer is NULL if not found, otherwise points into exception_list
static const char * get_exception(const Except_node * root, const char * key)
{
const char * next_char = key;
const Except_node * node = root;
// scan tree for key
while(1)
{
// dead-end. key not in tree
if(!node)
return NULL;
// compare next char to node and move to corresponding child
if(*next_char < node->value)
{
node = node->lo;
}
else if(*next_char > node->value)
{
node = node->hi;
}
else
{
// done?
if(!*(++next_char))
break;
node = node->eq;
}
}
// return correct capitalization from node
return node->capitalized;
}
// convert a string to title_case (in place)
void title_case_inplace(const Except_node * except, char * str)
{
char * word_start = str;
char * word_end = str;
int word_num = 0;
while(1)
{
// find start of a word (A-Z or a-z)
while(!(*word_start >= 'a' && *word_start <= 'z') &&
!(*word_start >= 'A' && *word_start <= 'Z'))
{
// done?
if(!*word_start)
break;
// reset word_num so we capitalize after period or colon
if(*word_start == '.' || *word_start == ':')
word_num = 0;
++word_start;
}
word_end = word_start;
// done?
if(!*word_end)
break;
// find the end of the word +1 (A-Z, a-z, and ')
while(*word_end == '\'' ||
(*word_end >= 'a' && *word_end <= 'z') ||
(*word_end >= 'A' && *word_end <= 'Z'))
{
// convert to lowercase as we go
if(*word_end >= 'A' && *word_end <= 'Z')
*word_end -= 'A' - 'a';
++word_end;
}
// temporarily null-terminate word, and look it up in the exception dictionary
char save_char = *word_end;
*word_end = '\0';
const char * lookup = get_exception(except, word_start);
*word_end = save_char;
// if an exception was found, use that capitalization
if(lookup)
{
// copy word from dictionary back into str
strncpy(word_start, lookup, strlen(lookup));
// always capitalize 1st letter of 1st and last word
if((word_num == 0 || !*word_end) && *word_start >= 'a' && *word_start <= 'z')
*word_start += 'A' - 'a';
}
else
// otherwise just capitalize the 1st letter
*word_start += 'A' - 'a';
++word_num;
if(!*word_end)
break;
word_start = word_end;
}
}
// convert a string to title_case
void title_case(const Except_node * except, const char * str, char * out)
{
strcpy(out, str);
title_case_inplace(except, out);
}
// title_case.h
// Attempt at proper title casing algorithm
// The basic method is to 1st load a dictionary of exceptions
// when given a string, we will look up each word in the dictionary.
// If found, we will use the capitalization from the dictionary file
// otherwise, all words start with a capitals, and the rest are lowercase
// the 1st letter of the 1st and last words in the string are always capitalized
// only works for ascii text
// Copyright 2015 Matthew Chandler
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#ifndef TITLE_CASE_H
#define TITLE_CASE_H
#ifdef __cplusplus
extern "C"
{
#endif
// ternary search tree node type for exception dictionary
typedef struct _Except_node Except_node;
// return a new empty node - does not load the dictionary
Except_node * new_except_node();
// build an exception dictionary
Except_node * build_except_dict();
// recursive free of dictionary
void free_except_node(Except_node * root);
// add a new word to the exception dictionary
void add_exception(Except_node * root, const char * exception_word);
// convert a string to title_case (in place)
void title_case_inplace(const Except_node * except, char * str);
// convert a string to title_case
// buffer pointed to by out should be at least large enough to hold str
void title_case(const Except_node * except, const char * str, char * out);
#ifdef __cplusplus
}
#endif
#endif // TITLE_CASE_H
// title_case_example.c
// simple usage example for title_case library
// Copyright 2015 Matthew Chandler
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <stdio.h>
#include <string.h>
#include "title_case.h"
int main()
{
Except_node * except = build_except_dict();
const char * in = "test the title case algorithm on: a string";
char out[1204];
title_case(except, in, out);
printf("%s -> %s\n", in, out);
free_except_node(except);
return 0;
}
# words that should not be capitalized in title-case
a
an
and
as
at
but
by
for
in
is
it
nor
or
the
to
up
# roman numerals 2-9 (no need to include single char numerals)
II
III
IV
VI
VII
VIII
IX
# general acronyms
AB
AC
ACT
AED
AIDS
ASL
ATC
BA
BS
CAD
CNC
CPR
CSI
CSS
DC
DNA
EKG
EMT
ER
GIS
GPS
GYN
HIV
HTML
HVAC
IP
LAN
LDS
MA
MIDI
MS
NE
NW
OB
PC
PE
RN
SAT
SCUBA
SE
SW
TCP
TV
UNIX
USA
USAF
WAN
XHTML
XML
# other
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment