I do all four of the following often:

- Write lengthy academic prose.
- Write code.
- Analyze data and do other math-y things.
# strip punctuation, digits, and non-printable characters from a text file
# (Python 2: str.translate takes a translation table plus a deletechars string)
import sys
import nltk
import string

dasfile = sys.argv[1]
with open(dasfile) as dastext:
    thetext = dastext.read()

# delete punctuation and digits, then drop anything non-printable
cleanstring = thetext.translate(string.maketrans("", ""), string.punctuation)
cleanstring = cleanstring.translate(string.maketrans("", ""), string.digits)
cleanstring = filter(lambda x: x in string.printable, cleanstring)

from nltk.corpus import stopwords
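# the gist stops here; one plausible next step (an assumption, not the
# original code), since stopwords was just imported: drop English stopwords
words = [w for w in cleanstring.lower().split()
         if w not in stopwords.words('english')]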
# assumes documents are provided in the form of a list of (docid, doctext)
# tuples named thedocslist. docid = int/string/float; doctext = string
import nltk
import string
from collections import Counter

# get rid of punctuation, numbers; make all lowercase. no stemming.
counterslist = []
for onedocument in thedocslist:
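    # the loop body is missing from the gist; a minimal completion under the
    # assumptions stated in the comments above (clean, lowercase, count tokens)
    docid, doctext = onedocument
    clean = doctext.translate(string.maketrans("", ""), string.punctuation)
    clean = clean.translate(string.maketrans("", ""), string.digits)
    counterslist.append((docid, Counter(nltk.word_tokenize(clean.lower()))))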
# The point of this script is that pandoc command-line syntax is painful and hard to remember.
# I really only produce html, pdf, and docx, and I only ever use the defaults. Ergo, a script
# (to be dropped in $PATH with a python shebang at the top so it runs trivially) to
# make it simple.
#
# usage: python pgmd.py INPUTFILE FORMAT[html/pdf/word]
# that's it. easy.
#
# there are a handful of other options (output file, overwrite output file, append scripts and
# css and such to html headers); details are in the command-line help via the -h flag.
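# A minimal sketch of the core of such a wrapper (names and structure are
# assumptions; the real pgmd.py has more options). pandoc infers the output
# format from the output file's extension, so mapping FORMAT to an extension
# is all the logic you need:
import subprocess
import sys

EXTENSIONS = {"html": "html", "pdf": "pdf", "word": "docx"}

infile, fmt = sys.argv[1], sys.argv[2]
outfile = infile.rsplit(".", 1)[0] + "." + EXTENSIONS[fmt]
subprocess.call(["pandoc", infile, "-o", outfile])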
# OBSOLETE.
# GO HERE INSTEAD: https://github.com/paultopia/spideyscrape
# very basic scraper-spider for those html books where there's a table of contents page that links to a
# bunch of sub-pages with actual content. (Like the documentation for a bunch of libraries.)
# WARNING: has no validation, assumes pages contain relative links and are all on the same site.
# (this is an easy tweak but I don't have time today)
# also assumes all content is vanilla html or at least can be accessed through vanilla html.
#
# pass ToC page through raw_input. This script scrapes every unique page linked from the ToC and
# combines everything into a single html file.
# EDIT: this has now been upgraded to a full-fledged repo and is accepting PRs. This gist is no longer updating.
# go here: https://github.com/paultopia/spideyscrape
# This is a very basic scraper-spider for those html books where there's a table of contents page that links to a
# bunch of sub-pages with actual content (like the documentation for a bunch of libraries).
#
# Dependencies: Beautiful Soup 4 on Python 2.7.
#
# It assumes all content is vanilla html or at least can be accessed through vanilla html.
#
import sys
import spideyscrape
import console  # Pythonista's console module: this driver targets iOS
import os

args = sys.argv[1:]  # see if the user gave us a command line argument
start = args[0] if args else raw_input('URL to crawl: ')
html = spideyscrape.scrape(start)
filename = spideyscrape.savePage(html)
console.open_in(filename)  # hand the saved file off to another app
# I think I've discovered a bit of Python code even more dangerous than https://github.com/ajalt/fuckitpy
# shadow the name "string" with a str subclass whose instances, when called,
# exec their own contents and silently swallow any exception (Python 2)
class string(str):
    def __call__(self):
        try:
            exec self
        except Exception:
            pass

evil = string('print "EVIL"')
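# calling the instance runs whatever code the string holds:
evil()  # prints EVIL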
""" see http://stackoverflow.com/questions/35245929/python-ftplib-hangs-on-upload-stor-and-not-network-latency for the ftplib hell that led here | |
Obvs replace [SERVER] [USER] and [PASSWORD] with appropriate values, and add to the list of ascii extensions for whatever you use. | |
""" | |
import sys | |
import os | |
import datetime as dt | |
full = sys.argv[1] | |
path = 'public_html/' + full[:full.rfind('/') + 1] |
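# the gist is cut off here; a sketch of how the upload might continue, under
# assumptions drawn from the docstring above (ASCII_EXTENSIONS and the
# storlines/storbinary split are mine, not the original's):
import ftplib

ASCII_EXTENSIONS = ['html', 'css', 'js', 'txt', 'md']  # add your own

fname = full[full.rfind('/') + 1:]
ftp = ftplib.FTP('[SERVER]', '[USER]', '[PASSWORD]')
with open(full, 'rb') as f:
    if fname.rsplit('.', 1)[-1].lower() in ASCII_EXTENSIONS:
        ftp.storlines('STOR ' + path + fname, f)  # text mode (CRLF translation)
    else:
        ftp.storbinary('STOR ' + path + fname, f)  # binary mode
ftp.quit()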
# convert a directory full of M$word .docx files to PDF
# REQUIREMENTS:
# 1. Only works on Mac.
# 2. Assign the variable USERNAME to your home directory.
# 3. Requires the PDFwriter printer driver, get it here: https://sourceforge.net/projects/pdfwriterformac/
# 3.5 (might require PDFwriter to be your default printer; on my machine it's the only printer, so I haven't tested with any other config)
# 4. Requires the launch and appswitch apps from the wonderful Nicholas Riley, get them here: http://sabi.net/nriley/software/
import glob, os, time

homedir = os.getcwd()
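# the rest of the gist is cut off; a rough sketch of the loop the requirements
# imply (assumptions: launch's -p flag prints a file via the default printer,
# which is why PDFwriter needs to be the default, and PDFwriter then drops the
# resulting PDFs under /Users/Shared/PDFwriter/<user>, hence USERNAME):
for docx in glob.glob(os.path.join(homedir, '*.docx')):
    os.system('launch -p "%s"' % docx)  # print the file, i.e. emit a PDF via PDFwriter
    time.sleep(30)  # crude wait for Word and the print queue to finish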