Last active
August 29, 2015 13:55
-
-
Save pydanny/8695423 to your computer and use it in GitHub Desktop.
This is the script used to get the links out of Two Scoops of Django 1.6 LaTeX files. It's been modified to generate HTML instead of a new appendix. It currently has a minor problem with duplicates, but as the rendered content is not going into the book, I'm not worrying about it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools
import glob
import re
# Regex to find all the links defined in LaTeX, e.g. \link{http://example.com}.
# Non-greedy ".*?" stops at the FIRST closing brace, so several \link{} macros
# on one line are captured separately (the old greedy ".*" ran to the last
# brace and merged them, which the cleanup loop in main() had to patch over).
regex = re.compile(r'\\link{.*?}', re.IGNORECASE)
def strip_http_things(link):
    """Used to clean links for better sorting and for 'cleaner'
    final display:

    e.g. http://www.djangopackages.com is djangopackages.com
    """
    # Drop each prefix in turn; same order as the original chained
    # replace() calls (protocol first, then the leading 'www.').
    for noise in ("https://", "http://", "www."):
        link = link.replace(noise, "")
    return link
def cleaned_url_comparison(a, b):
    """This custom sort ensures that the domain is the leading
    sort, not the various protocol prefixes.

    http://docs.python.org/2/library/functions.html#sorted
    """
    # Compare on the protocol/'www.'-stripped forms (same cleanup as
    # strip_http_things()) so "http://b.com" sorts after "a.com".
    left = a.replace("https://", "").replace("http://", "").replace("www.", "")
    right = b.replace("https://", "").replace("http://", "").replace("www.", "")
    # Classic cmp contract: 1 / 0 / -1.
    return (left > right) - (left < right)
def main():
    """Collect every \\link{...} macro from the chapter LaTeX sources.

    Returns:
        list: ``\\link{...}`` strings, de-duplicated and sorted by their
        protocol-stripped form (see cleaned_url_comparison()).
    """
    tmp = []    # raw regex hits, possibly dirty
    links = []  # cleaned results

    # Find all the LaTeX files in the chapters directory.
    for filename in glob.glob("chapters/*.tex"):
        # Open the LaTeX file
        with open(filename) as f:
            text = f.read()
        # Find all the links in the LaTeX file and add them to tmp.
        tmp += regex.findall(text)

    # Light cleanup pass over every captured link.
    for link in tmp:
        if ':8000' in link:  # Ignore links to the local dev server port
            continue
        # Remove the unnecessary 'www.' for the 2scoops.co special case
        # (nearly duplicates strip_http_things()).
        link = link.replace("www.2scoops.co", "2scoops.co")
        # Trim anything a greedy regex may have captured past the first
        # closing brace; a no-op when the capture is already clean.
        links.append(link[:link.index('}') + 1])

    # set() removes the duplicates; functools.cmp_to_key adapts the
    # old-style cmp function so this also runs on Python 3, where
    # sorted() no longer accepts a cmp= argument.
    return sorted(set(links), key=functools.cmp_to_key(cleaned_url_comparison))
if __name__ == "__main__":
    # Context manager! Yeah!
    with open("links.html", "w") as f:
        # Assemble the document as a list of fragments, joined once at
        # the end, instead of repeated string +=.
        pieces = ["<ul>\n"]
        for link in main():
            # strip LaTeX elements: \link{URL} -> URL
            url = link.replace(r"\link{", "").replace("}", "")
            # One <li> per link; display text is the cleaned-up URL.
            pieces.append(
                """ <li><a href="{}">{}</a></li>\n""".format(
                    url,
                    strip_http_things(url),
                )
            )
        pieces.append("</ul>\n")
        f.write("".join(pieces))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment