# joswr1ght/mlatitlecase.py

## mlatitlecase.py
#!/usr/bin/env python3
# MLA Style: Capitalization
#
# ref: https://libguides.pvcc.edu/citationstyles/mla9-capitalization
#
# In a title or a subtitle, capitalize the first word, the last word, and all
# principal words, including those that follow hyphens in compound terms.
# Therefore, capitalize the following parts of speech:
#
# Nouns (e.g., flowers, as in The Flowers of Europe)
# Pronouns (e.g., our, as in Save Our Children; that, as in The Mouse That Roared)
# Verbs (e.g., watches, as in America Watches Television; is, as in What Is Literature?)
# Adjectives (e.g., ugly, as in The Ugly Duckling; that, as in Who Said That Phrase?)
# Adverbs (e.g., slightly, as in Only Slightly Corrupt; down, as in Go Down, Moses)
# Subordinating conjunctions (e.g., after, although, as if, as soon as,
#     because, before, if, that, unless, until, when, where, while, as in One If by
#     Land and Anywhere That Chance Leads)
#
# Do not capitalize the following parts of speech when they fall in the middle of a title:
#
# Articles (a, an, the, as in Under the Bamboo Tree)
# Prepositions (e.g., against, as, between, in, of, to, as in The Merchant of
#     Venice and “A Dialogue between the Soul and Body”)
# Coordinating conjunctions (and, but, for, nor, or, so, yet, as in Romeo and Juliet)
# The to in infinitives (as in How to Play Chess)

import sys
import os
import re
import fileinput


def title(s):
    """Return *s* with its first character upper-cased.

    Only the first character is touched; the remainder of the string is
    returned exactly as given, so interior capitals are preserved. An empty
    string is returned unchanged.
    """
    # Slicing instead of indexing s[0] keeps this safe for the empty string.
    return s[:1].upper() + s[1:]


def lower(s):
    """Return *s* with its first character lower-cased.

    Only the first character is touched; the remainder of the string is
    returned exactly as given. An empty string is returned unchanged.
    """
    # Slicing instead of indexing s[0] keeps this safe for the empty string.
    return s[:1].lower() + s[1:]


# Words that keep a lowercase initial letter when they fall in the middle of a
# title. Stored as a set so each membership test is a single hash lookup.
_MLA_LOWERCASE_WORDS = frozenset([
    # Articles
    "a", "an", "the",
    # Prepositions
    "abroad", "about", "above", "across", "after",
    "against", "ago", "along", "amidst", "among", "amongst", "apart",
    "around", "as", "aside", "at", "away", "barring", "before",
    "behind", "below", "beneath", "beside", "besides", "between",
    "beyond", "but", "by", "circa", "concerning", "despite", "down",
    "during", "in", "inside", "instead", "into", "except",
    "excluding", "for", "following", "from", "hence", "like",
    "minus", "near", "next", "past", "per", "round", "of", "off", "on",
    "onto", "opposite", "out", "outside", "over", "than", "through",
    "throughout", "till", "times", "to", "toward", "towards",
    "under", "underneath", "unlike", "until", "unto", "up", "upon",
    "via", "with", "within", "without", "worth",
    # Coordinating conjunctions
    "and", "but", "for", "nor", "or", "so", "yet",
])

# Product names whose lowercase initial must be preserved.
_MLA_VERBATIM_WORDS = frozenset(["iPhone", "iPad"])

# A token made only of "#" or only of "=" (heading marker in markup languages).
_MLA_MARKUP_RE = re.compile(r"^#+$|^=+$")

# regex adapted from Django URL validator, https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
_MLA_URL_RE = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# Lowercase hex digits, optionally separated by ":" or "-". Mixed-case tokens
# deliberately do not match; all-uppercase ones need no change anyway.
_MLA_HEX_RE = re.compile(r'^[0-9a-f:-]+$')


def _mla_is_hex(word):
    """Return True if *word* looks like a lowercase hex string."""
    # At least one digit is required: without it, ordinary words spelled only
    # with the letters a-f ("a", "be", "add", "face") would be mistaken for hex
    # values and escape capitalization.
    return bool(_MLA_HEX_RE.match(word)) and any(ch.isdigit() for ch in word)


def _mla_capitalize_compound(word):
    """Upper-case the first letter of *word* and of each hyphen-separated part."""
    # Slicing keeps empty parts (leading, trailing or doubled hyphens) intact.
    return "-".join(part[:1].upper() + part[1:] for part in word.split("-"))


def mlatitle(words):
    """Return *words* converted to MLA title case.

    The input is split on whitespace and re-joined with single spaces, so the
    result has no leading or trailing whitespace.

    Rules applied to each token, in order:

    * Tokens made only of ``#`` or only of ``=`` (heading markers) are copied
      through and are not counted as words.
    * A leading ``.`` on the first word (AsciiDoc caption) is set aside and
      put back in front of the converted word.
    * URLs, "iPhone"/"iPad", and lowercase hex strings containing at least
      one digit are copied through unchanged.
    * The first and the last word are always capitalized.
    * Articles, prepositions and coordinating conjunctions in any other
      position get a lowercase initial letter.
    * Every other word is capitalized.

    Whenever a word is capitalized, each part of a hyphenated compound is
    capitalized as well.

    Args:
        words: The title as a single string.

    Returns:
        The converted title; "" when the input holds no tokens.
    """
    tokens = words.split()

    # Heading markers are not words, so they must not take part in deciding
    # which word is the last one (e.g. "== what is it for" or "# title #").
    markup_flags = [bool(_MLA_MARKUP_RE.match(token)) for token in tokens]
    wordcounttotal = markup_flags.count(False)

    wordcount = 0
    converted = []
    for token, is_markup in zip(tokens, markup_flags):
        if is_markup:
            converted.append(token)
            continue

        wordcount += 1

        # Handle the case where the first word has a leading dot (Asciidoc
        # caption); remember it so it can be restored after conversion.
        leadingdot = ""
        if wordcount == 1 and token.startswith("."):
            leadingdot = "."
            token = token[1:]

        # URLs, protected product names and hex strings pass through as-is.
        if (_MLA_URL_RE.match(token) or token in _MLA_VERBATIM_WORDS
                or _mla_is_hex(token)):
            converted.append(leadingdot + token)
            continue

        # First and last words are capitalized regardless of part of speech.
        if wordcount == 1 or wordcount == wordcounttotal:
            converted.append(leadingdot + _mla_capitalize_compound(token))
            continue

        if token.lower() in _MLA_LOWERCASE_WORDS:
            # Do not capitalize
            converted.append(token[:1].lower() + token[1:])
            continue

        # Default to uppercase
        converted.append(_mla_capitalize_compound(token))

    return " ".join(converted)


if __name__ == "__main__":
    # Command-line entry point: every line delivered by fileinput is converted
    # with mlatitle() and printed. The script accepts no arguments; when any
    # are supplied it prints the usage text and calls sys.exit(-1).
    # NOTE(review): the usage text advertises "<words>", yet any argument is
    # rejected here - confirm which behaviour is intended.
    if len(sys.argv) != 1:
        progname = os.path.basename(sys.argv[0])
        for message in (
            f"{progname}: Convert string to title case using MLA rules.",
            f"Usage: {progname} <words>",
        ):
            print(message)
        sys.exit(-1)

    for line in fileinput.input():
        converted = mlatitle(line)
        print(converted)
	#!/usr/bin/env python3
	# MLA Style: Capitalization
	#
	# ref: https://libguides.pvcc.edu/citationstyles/mla9-capitalization
	#
	# In a title or a subtitle, capitalize the first word, the last word, and all
	# principal words, including those that follow hyphens in compound terms.
	# Therefore, capitalize the following parts of speech:
	#
	# Nouns (e.g., flowers, as in The Flowers of Europe)
	# Pronouns (e.g., our, as in Save Our Children; that, as in The Mouse That Roared)
	# Verbs (e.g., watches, as in America Watches Television; is, as in What Is Literature?)
	# Adjectives (e.g., ugly, as in The Ugly Duckling; that, as in Who Said That Phrase?)
	# Adverbs (e.g., slightly, as in Only Slightly Corrupt; down, as in Go Down, Moses)
	# Subordinating conjunctions (e.g., after, although, as if, as soon as,
	# because, before, if, that, unless, until, when, where, while, as in One If by
	# Land and Anywhere That Chance Leads)
	#
	# Do not capitalize the following parts of speech when they fall in the middle of a title:
	#
	# Articles (a, an, the, as in Under the Bamboo Tree)
	# Prepositions (e.g., against, as, between, in, of, to, as in The Merchant of
	# Venice and “A Dialogue between the Soul and Body”)
	# Coordinating conjunctions (and, but, for, nor, or, so, yet, as in Romeo and Juliet)
	# The to in infinitives (as in How to Play Chess)

	import sys
	import os
	import re
	import fileinput


	def title(s):
	    """Return *s* with its first character upper-cased.

	    Only the first character is touched; the remainder of the string is
	    returned exactly as given. An empty string is returned unchanged.
	    """
	    # Slicing instead of indexing s[0] keeps this safe for the empty string.
	    return s[:1].upper() + s[1:]


	def lower(s):
	    """Return *s* with its first character lower-cased.

	    Only the first character is touched; the remainder of the string is
	    returned exactly as given. An empty string is returned unchanged.
	    """
	    # Slicing instead of indexing s[0] keeps this safe for the empty string.
	    return s[:1].lower() + s[1:]


	# Words that keep a lowercase initial letter when they fall in the middle of a
	# title. Stored as a set so each membership test is a single hash lookup.
	_MLA_LOWERCASE_WORDS = frozenset([
	    # Articles
	    "a", "an", "the",
	    # Prepositions
	    "abroad", "about", "above", "across", "after",
	    "against", "ago", "along", "amidst", "among", "amongst", "apart",
	    "around", "as", "aside", "at", "away", "barring", "before",
	    "behind", "below", "beneath", "beside", "besides", "between",
	    "beyond", "but", "by", "circa", "concerning", "despite", "down",
	    "during", "in", "inside", "instead", "into", "except",
	    "excluding", "for", "following", "from", "hence", "like",
	    "minus", "near", "next", "past", "per", "round", "of", "off", "on",
	    "onto", "opposite", "out", "outside", "over", "than", "through",
	    "throughout", "till", "times", "to", "toward", "towards",
	    "under", "underneath", "unlike", "until", "unto", "up", "upon",
	    "via", "with", "within", "without", "worth",
	    # Coordinating conjunctions
	    "and", "but", "for", "nor", "or", "so", "yet",
	])

	# Product names whose lowercase initial must be preserved.
	_MLA_VERBATIM_WORDS = frozenset(["iPhone", "iPad"])

	# A token made only of "#" or only of "=" (heading marker in markup languages).
	_MLA_MARKUP_RE = re.compile(r"^#+$|^=+$")

	# regex adapted from Django URL validator, https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
	_MLA_URL_RE = re.compile(
	    r'^(?:http|ftp)s?://'  # http:// or https://
	    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
	    r'localhost|'  # localhost...
	    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
	    r'(?::\d+)?'  # optional port
	    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

	# Lowercase hex digits, optionally separated by ":" or "-". Mixed-case tokens
	# deliberately do not match; all-uppercase ones need no change anyway.
	_MLA_HEX_RE = re.compile(r'^[0-9a-f:-]+$')


	def _mla_is_hex(word):
	    """Return True if *word* looks like a lowercase hex string."""
	    # At least one digit is required: without it, ordinary words spelled only
	    # with the letters a-f ("a", "be", "add", "face") would be mistaken for hex
	    # values and escape capitalization.
	    return bool(_MLA_HEX_RE.match(word)) and any(ch.isdigit() for ch in word)


	def _mla_capitalize_compound(word):
	    """Upper-case the first letter of *word* and of each hyphen-separated part."""
	    # Slicing keeps empty parts (leading, trailing or doubled hyphens) intact.
	    return "-".join(part[:1].upper() + part[1:] for part in word.split("-"))


	def mlatitle(words):
	    """Return *words* converted to MLA title case.

	    The input is split on whitespace and re-joined with single spaces, so the
	    result has no leading or trailing whitespace.

	    Rules applied to each token, in order:

	    * Tokens made only of ``#`` or only of ``=`` (heading markers) are copied
	      through and are not counted as words.
	    * A leading ``.`` on the first word (AsciiDoc caption) is set aside and
	      put back in front of the converted word.
	    * URLs, "iPhone"/"iPad", and lowercase hex strings containing at least
	      one digit are copied through unchanged.
	    * The first and the last word are always capitalized.
	    * Articles, prepositions and coordinating conjunctions in any other
	      position get a lowercase initial letter.
	    * Every other word is capitalized.

	    Whenever a word is capitalized, each part of a hyphenated compound is
	    capitalized as well.

	    Args:
	        words: The title as a single string.

	    Returns:
	        The converted title; "" when the input holds no tokens.
	    """
	    tokens = words.split()

	    # Heading markers are not words, so they must not take part in deciding
	    # which word is the last one (e.g. "== what is it for" or "# title #").
	    markup_flags = [bool(_MLA_MARKUP_RE.match(token)) for token in tokens]
	    wordcounttotal = markup_flags.count(False)

	    wordcount = 0
	    converted = []
	    for token, is_markup in zip(tokens, markup_flags):
	        if is_markup:
	            converted.append(token)
	            continue

	        wordcount += 1

	        # Handle the case where the first word has a leading dot (Asciidoc
	        # caption); remember it so it can be restored after conversion.
	        leadingdot = ""
	        if wordcount == 1 and token.startswith("."):
	            leadingdot = "."
	            token = token[1:]

	        # URLs, protected product names and hex strings pass through as-is.
	        if (_MLA_URL_RE.match(token) or token in _MLA_VERBATIM_WORDS
	                or _mla_is_hex(token)):
	            converted.append(leadingdot + token)
	            continue

	        # First and last words are capitalized regardless of part of speech.
	        if wordcount == 1 or wordcount == wordcounttotal:
	            converted.append(leadingdot + _mla_capitalize_compound(token))
	            continue

	        if token.lower() in _MLA_LOWERCASE_WORDS:
	            # Do not capitalize
	            converted.append(token[:1].lower() + token[1:])
	            continue

	        # Default to uppercase
	        converted.append(_mla_capitalize_compound(token))

	    return " ".join(converted)


	if __name__ == "__main__":
	    # Command-line entry point: every line delivered by fileinput is converted
	    # with mlatitle() and printed. The script accepts no arguments; when any
	    # are supplied it prints the usage text and calls sys.exit(-1).
	    # NOTE(review): the usage text advertises "<words>", yet any argument is
	    # rejected here - confirm which behaviour is intended.
	    if (len(sys.argv) != 1):
	        progname = os.path.basename(sys.argv[0])
	        print(f"{progname}: Convert string to title case using MLA rules.")
	        print(f"Usage: {progname} <words>")
	        sys.exit(-1)

	    for line in fileinput.input():
	        print(mlatitle(line))