Skip to content

Instantly share code, notes, and snippets.

@joswr1ght
Created July 7, 2022 13:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joswr1ght/d195184f115dc9e454e80f19e8dabb6c to your computer and use it in GitHub Desktop.
Save joswr1ght/d195184f115dc9e454e80f19e8dabb6c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# MLA Style: Capitalization
#
# ref: https://libguides.pvcc.edu/citationstyles/mla9-capitalization
#
# In a title or a subtitle, capitalize the first word, the last word, and all
# principal words, including those that follow hyphens in compound terms.
# Therefore, capitalize the following parts of speech:
#
# Nouns (e.g., flowers, as in The Flowers of Europe)
# Pronouns (e.g., our, as in Save Our Children; that, as in The Mouse That Roared)
# Verbs (e.g., watches, as in America Watches Television; is, as in What Is Literature?)
# Adjectives (e.g., ugly, as in The Ugly Duckling; that, as in Who Said That Phrase?)
# Adverbs (e.g., slightly, as in Only Slightly Corrupt; down, as in Go Down, Moses)
# Subordinating conjunctions (e.g., after, although, as if, as soon as,
# because, before, if, that, unless, until, when, where, while, as in One If by
# Land and Anywhere That Chance Leads)
#
# Do not capitalize the following parts of speech when they fall in the middle of a title:
#
# Articles (a, an, the, as in Under the Bamboo Tree)
# Prepositions (e.g., against, as, between, in, of, to, as in The Merchant of
# Venice and “A Dialogue between the Soul and Body”)
# Coordinating conjunctions (and, but, for, nor, or, so, yet, as in Romeo and Juliet)
# The to in infinitives (as in How to Play Chess)
import sys
import os
import re
import fileinput
def title(s):
    """Return *s* with its first character uppercased.

    Only the first character is changed; the rest of the string keeps its
    original case (unlike ``str.title()`` or ``str.capitalize()``). An empty
    string is returned unchanged.
    """
    # Slice instead of indexing so that an empty string does not raise
    # IndexError.
    return s[:1].upper() + s[1:]
def lower(s):
    """Return *s* with its first character lowercased.

    Only the first character is changed; the rest of the string keeps its
    original case. An empty string is returned unchanged.
    """
    # Slice instead of indexing so that an empty string does not raise
    # IndexError.
    return s[:1].lower() + s[1:]
# Words that MLA keeps lowercase when they fall in the middle of a title.
_MLA_ARTICLES = ("a", "an", "the")
_MLA_PREPOSITIONS = (
    "abroad", "about", "above", "across", "after",
    "against", "ago", "along", "amidst", "among", "amongst", "apart",
    "around", "as", "aside", "at", "away", "barring", "before",
    "behind", "below", "beneath", "beside", "besides", "between",
    "beyond", "but", "by", "circa", "concerning", "despite", "down",
    "during", "in", "inside", "instead", "into", "except",
    "excluding", "for", "following", "from", "hence", "like",
    "minus", "near", "next", "past", "per", "round", "of", "off", "on",
    "onto", "opposite", "out", "outside", "over", "than", "through",
    "throughout", "till", "times", "to", "toward", "towards",
    "under", "underneath", "unlike", "until", "unto", "up", "upon",
    "via", "with", "within", "without", "worth",
)
_MLA_COORDINATING_CONJUNCTIONS = ("and", "but", "for", "nor", "or", "so", "yet")
_MLA_LOWERCASE_WORDS = frozenset(
    _MLA_ARTICLES + _MLA_PREPOSITIONS + _MLA_COORDINATING_CONJUNCTIONS
)

# Product names whose leading lowercase "i" must be preserved.
_MLA_VERBATIM_WORDS = frozenset(["iPhone", "iPad"])

# A token made only of "#" or only of "=" is heading markup, not a word.
_MLA_MARKUP_RE = re.compile(r"^#+$|^=+$")

# regex adapted from Django URL validator,
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
_MLA_URL_RE = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# Lowercase hex strings (optionally separated by ":" or "-") are left alone.
# The lookahead requires at least one digit or colon so that ordinary English
# words spelled only with the letters a-f ("a", "be", "bed", "face", ...) are
# still capitalized as words. Mixed-case hex does not match (and gets fixed);
# all-uppercase hex is already in title-case form.
_MLA_HEX_RE = re.compile(r'^(?=.*[0-9:])[0-9a-f:-]+$')


def _mla_capitalize(word):
    """Uppercase the first letter of *word* and of each part after a hyphen."""
    # A token that starts with a hyphen (e.g. a command-line option such as
    # "-sV") is not a compound term; leave it exactly as written.
    if word.startswith("-"):
        return word
    # Slicing keeps empty parts (doubled or trailing hyphens) from raising.
    return "-".join(part[:1].upper() + part[1:] for part in word.split("-"))


def mlatitle(words):
    """Return the string *words* converted to title case using MLA rules.

    The input is split on whitespace and the resulting tokens are joined
    with single spaces, with no leading or trailing whitespace. An empty or
    all-whitespace input yields an empty string.

    Rules applied to each token, in order:

    * Tokens made only of ``#`` or only of ``=`` (heading markup) are copied
      through and are not counted as words.
    * A leading ``.`` on the first word (AsciiDoc caption syntax) is set
      aside and restored in front of the converted word.
    * URLs, "iPhone"/"iPad", and lowercase hex strings are copied unchanged.
    * The first and last words are always capitalized.
    * Articles, prepositions, and coordinating conjunctions in the middle of
      the title are lowercased.
    * Every other word is capitalized, including each part that follows a
      hyphen in a compound term.

    Only the first letter of a capitalized word (or compound part) is
    changed; the remaining letters keep their original case.
    """
    tokens = words.split()
    # Count only real words so that the last word is still recognized when
    # it is followed by closing heading markup (e.g. "== Title ==").
    wordcounttotal = sum(1 for token in tokens if not _MLA_MARKUP_RE.match(token))
    wordcount = 0
    converted = []
    for token in tokens:
        if _MLA_MARKUP_RE.match(token):
            converted.append(token)
            continue
        wordcount += 1

        # Only the first word can carry the AsciiDoc caption dot.
        leadingdot = ""
        word = token
        if wordcount == 1 and word.startswith("."):
            leadingdot = "."
            word = word[1:]

        if (_MLA_URL_RE.match(word) or word in _MLA_VERBATIM_WORDS
                or _MLA_HEX_RE.match(word)):
            # Copied through unchanged, even in first or last position.
            converted.append(leadingdot + word)
        elif wordcount == 1 or wordcount == wordcounttotal:
            converted.append(leadingdot + _mla_capitalize(word))
        elif word.lower() in _MLA_LOWERCASE_WORDS:
            # The whole word is a known minor word, so lowercase all of it
            # ("AND" -> "and"), not just its first letter.
            converted.append(word.lower())
        else:
            converted.append(_mla_capitalize(word))
    return " ".join(converted)
if __name__ == "__main__":
    # The script takes no command-line arguments: titles are read from
    # standard input, one per line. Any argument is answered with the usage
    # text and a failure exit status.
    if len(sys.argv) != 1:
        progname = os.path.basename(sys.argv[0])
        print(f"{progname}: Convert string to title case using MLA rules.")
        print(f"Usage: {progname} < titles.txt")
        print("Reads titles from standard input, one per line.")
        sys.exit(-1)
    # With no file arguments, fileinput.input() iterates over stdin lines.
    for line in fileinput.input():
        print(mlatitle(line))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment