I do all four of the following often:

- Write lengthy academic prose.
- Write code.
- Analyze data and do other math-y things.
# strip punctuation, digits, and non-printable characters from a text file
# (Python 2: str.translate takes a translation table plus a deletechars string)
import sys
import nltk
import string

dasfile = sys.argv[1]
with open(dasfile) as dastext:
    thetext = dastext.read()

# delete punctuation and digits, then drop anything non-printable
cleanstring = thetext.translate(string.maketrans("", ""), string.punctuation)
cleanstring = cleanstring.translate(string.maketrans("", ""), string.digits)
cleanstring = filter(lambda x: x in string.printable, cleanstring)

from nltk.corpus import stopwords
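# the gist stops here; one plausible next step (an assumption, not the
# original code), since stopwords was just imported: drop English stopwords
words = [w for w in cleanstring.lower().split()
         if w not in stopwords.words('english')]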
# assumes documents are provided in the form of a list of (docid, doctext)
# tuples named thedocslist. docid = int/string/float; doctext = string
import nltk
import string
from collections import Counter

# get rid of punctuation, numbers; make all lowercase. no stemming.
counterslist = []
for onedocument in thedocslist:
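    # the loop body is missing from the gist; a minimal completion under the
    # assumptions stated in the comments above (clean, lowercase, count tokens)
    docid, doctext = onedocument
    clean = doctext.translate(string.maketrans("", ""), string.punctuation)
    clean = clean.translate(string.maketrans("", ""), string.digits)
    counterslist.append((docid, Counter(nltk.word_tokenize(clean.lower()))))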
# The point of this script is that pandoc command-line syntax is painful and hard to remember.
# I really only produce html, pdf, and docx, and I only ever use the defaults. Ergo, a script
# (to be dropped in $PATH with a python shebang at the top so it runs trivially) to
# make it simple.
#
# usage: python pgmd.py INPUTFILE FORMAT[html/pdf/word]
# that's it. easy.
#
# there are a handful of other options (output file, overwrite output file, append scripts and
# css and such to html headers); details are in the command-line help via the -h flag.
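# A minimal sketch of the core of such a wrapper (names and structure are
# assumptions; the real pgmd.py has more options). pandoc infers the output
# format from the output file's extension, so mapping FORMAT to an extension
# is all the logic you need:
import subprocess
import sys

EXTENSIONS = {"html": "html", "pdf": "pdf", "word": "docx"}

infile, fmt = sys.argv[1], sys.argv[2]
outfile = infile.rsplit(".", 1)[0] + "." + EXTENSIONS[fmt]
subprocess.call(["pandoc", infile, "-o", outfile])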
# OBSOLETE.
# GO HERE INSTEAD: https://github.com/paultopia/spideyscrape
# very basic scraper-spider for those html books where there's a table of contents page that links to a
# bunch of sub-pages with actual content. (Like the documentation for a bunch of libraries.)
# WARNING: has no validation, assumes pages contain relative links and are all on the same site.
# (this is an easy tweak but I don't have time today)
# also assumes all content is vanilla html or at least can be accessed through vanilla html.
#
# pass ToC page through raw_input. This script scrapes every unique page linked from the ToC and
# combines everything into a single html file.
# EDIT: this has now been upgraded to a full-fledged repo and is accepting PRs. This gist is no longer updating.
# go here: https://github.com/paultopia/spideyscrape
# This is a very basic scraper-spider for those html books where there's a table of contents page that links to a
# bunch of sub-pages with actual content (like the documentation for a bunch of libraries).
#
# Dependencies: Beautiful Soup 4 on Python 2.7.
#
# It assumes all content is vanilla html or at least can be accessed through vanilla html.
#
import sys
import spideyscrape
import console  # Pythonista's console module: this driver targets iOS
import os

args = sys.argv[1:]  # see if the user gave us a command line argument
start = args[0] if args else raw_input('URL to crawl: ')
html = spideyscrape.scrape(start)
filename = spideyscrape.savePage(html)
console.open_in(filename)  # hand the saved file off to another app
# I think I've discovered a bit of Python code even more dangerous than https://github.com/ajalt/fuckitpy
# shadow the name "string" with a str subclass whose instances, when called,
# exec their own contents and silently swallow any exception (Python 2)
class string(str):
    def __call__(self):
        try:
            exec self
        except Exception:
            pass

evil = string('print "EVIL"')
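# calling the instance runs whatever code the string holds:
evil()  # prints EVIL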
""" see http://stackoverflow.com/questions/35245929/python-ftplib-hangs-on-upload-stor-and-not-network-latency for the ftplib hell that led here | |
Obvs replace [SERVER] [USER] and [PASSWORD] with appropriate values, and add to the list of ascii extensions for whatever you use. | |
""" | |
import sys | |
import os | |
import datetime as dt | |
full = sys.argv[1] | |
path = 'public_html/' + full[:full.rfind('/') + 1] |
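# the gist is cut off here; a sketch of how the upload might continue, under
# assumptions drawn from the docstring above (ASCII_EXTENSIONS and the
# storlines/storbinary split are mine, not the original's):
import ftplib

ASCII_EXTENSIONS = ['html', 'css', 'js', 'txt', 'md']  # add your own

fname = full[full.rfind('/') + 1:]
ftp = ftplib.FTP('[SERVER]', '[USER]', '[PASSWORD]')
with open(full, 'rb') as f:
    if fname.rsplit('.', 1)[-1].lower() in ASCII_EXTENSIONS:
        ftp.storlines('STOR ' + path + fname, f)  # text mode (CRLF translation)
    else:
        ftp.storbinary('STOR ' + path + fname, f)  # binary mode
ftp.quit()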
# convert a directory full of M$word .docx files to PDF
# REQUIREMENTS:
# 1. Only works on Mac.
# 2. Assign the variable USERNAME to your home directory.
# 3. Requires the PDFwriter printer driver, get it here: https://sourceforge.net/projects/pdfwriterformac/
# 3.5 (might require PDFwriter to be your default printer; on my machine it's the only printer, so I haven't tested with any other config)
# 4. Requires the launch and appswitch apps from the wonderful Nicholas Riley, get them here: http://sabi.net/nriley/software/
import glob, os, time

homedir = os.getcwd()
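# the rest of the gist is cut off; a rough sketch of the loop the requirements
# imply (assumptions: launch's -p flag prints a file via the default printer,
# which is why PDFwriter needs to be the default, and PDFwriter then drops the
# resulting PDFs under /Users/Shared/PDFwriter/<user>, hence USERNAME):
for docx in glob.glob(os.path.join(homedir, '*.docx')):
    os.system('launch -p "%s"' % docx)  # print the file, i.e. emit a PDF via PDFwriter
    time.sleep(30)  # crude wait for Word and the print queue to finish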