Daniel interrogator

## gist:89eb901f28923ec847fd
def quicktree(sentence):
    """Parse a sentence and return a visual representation"""
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget
    from stat_parser import Parser
    from IPython.display import display
    from IPython.display import Image
    parser = Parser()
    parsed = parser.parse(sentence)

## gist:a9757ae1b4dcaa3bc84a
extract () {
    if [ -f $1 ] ; then
        case $1 in
            *.tar.bz2)  tar xjf $1      ;;
            *.tar.gz)   tar xzf $1      ;;
            *.bz2)      bunzip2 $1      ;;
            *.rar)      rar x $1        ;;
            *.gz)       gunzip $1       ;;
            *.tar)      tar xf $1       ;;
            *.tbz2)     tar xjf $1      ;;

## gist:2b3f37cc14712c5964c5
def parse_sfl(n = 3):
    from bs4 import BeautifulSoup
    import os
    from collections import defaultdict

    # path to xml files
    xmlpath = 'XML'

    # list of sfl categories
    sfl_list = [

## gist:1466b785567b1affd9e2
!sudo yum -y install java
!git clone https://www.github.com/interrogator/risk

import corpkit
from corpkit import interrogator, plotter, quickview
import pandas as pd
corpus = 'data/nyt/years'
#immediate sister to left of risk word
query = r'__ $. /(?i).?\brisk.?/'
# interrogate, output words only

## gist:2466b3f1ee2c94302a24
1995
[-5, [('the', 351), ('to', 226), ('of', 159), ('and', 157), ('a', 157), ('that', 156), ('in', 93), ('is', 80), ('for', 72), ('not', 68)]]
[-4, [('the', 333), ('to', 199), ('a', 164), ('that', 161), ('of', 140), ('and', 133), ('it', 90), ('are', 89), ('is', 83), ('in', 79)]]
[-3, [('to', 419), ('the', 263), ('a', 164), ('and', 150), ('that', 135), ('of', 124), ('are', 116), ('is', 113), ('there', 108), ('it', 98)]]
[-2, [('the', 396), ('a', 292), ('of', 232), ('to', 204), ('is', 180), ('and', 180), ('at', 178), ('take', 128), ('that', 126), ('are', 97)]]
[-1, [('the', 1733), ('at', 549), ('a', 432), ('and', 212), ('to', 191), ('of', 188), ('high', 127), ('health', 91), ('increased', 84), ('their', 77)]]
[0, [('risk', 4749), ('risks', 1611), ('risky', 684), ('risking', 157), ('risked', 141), ('riskier', 82), ('riskiest', 32), ('riske', 31), ('riskin', 13), ('riskiness', 12)]]
[1, [('of', 1591), ('to', 297), ('and', 240), ('for', 218), ('that', 196), ('in', 160), ('is', 116), ('the', 114), ('factors', 110),

## gist:9feebf3bb571db23265e
def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
    import os
    import nltk
    import re
    from collections import Counter
    import pandas as pd
    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    # define risk word
    regex = r'(?i)\brisk'

## gist:cf42d9e3faf44a2be55b
#eugene script

def eugener(path = 'data/nyt/earlylate',
            regex = r'(?i)\brisk',
            depth = 5,
            top = 10,
            remove_stopwords = False):
    """
    get most frequent words in corpus path to left and right

## gist:545fd5eef1a3687626c6
2005
      -5  -4  -3  -2  -1    0   1   2   3   4   5  Total
NN    52  37  43  32  29   88  47  57  40  36  34    495
JJ    23  27  25  11  36   12  30  20  24  30  24    262
NNS   20  23  19  18  15   50  19  26  25  20  26    261
NNP   25  30  35  27   9    2  16  22  17  31  36    250
risk   6   1   1   0   0  177   0   0   1   1   7    194

## gist:c9fa4f43e6f16064190d
 Weeding-Out Urged In Foreign Service: U.S. URGED TO SIFT FOREIGN ...
 By E.W. KENWORTHY Special to The New York Times
 New York Times (1923-Current file); Dec 29, 1963;
 ProQuest Historical Newspapers: The New York Times (1851-2010)
 pg.1


Weeding-Out Urged

## gist:bc1e1d2320f8a63392a1
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
	<key>fileTypes</key>
	<array>
		<string>txt</string>
	</array>
	<key>name</key>
	<string>Plain Text</string>
	def quicktree(sentence):
	"""Parse a sentence and return a visual representation"""
	from nltk import Tree
	from nltk.draw.util import CanvasFrame
	from nltk.draw import TreeWidget
	from stat_parser import Parser
	from IPython.display import display
	from IPython.display import Image
	parser = Parser()
	parsed = parser.parse(sentence)
	extract () {
	if [ -f $1 ] ; then
	case $1 in
	*.tar.bz2) tar xjf $1 ;;
	*.tar.gz) tar xzf $1 ;;
	*.bz2) bunzip2 $1 ;;
	*.rar) rar x $1 ;;
	*.gz) gunzip $1 ;;
	*.tar) tar xf $1 ;;
	*.tbz2) tar xjf $1 ;;
	def parse_sfl(n = 3):
	from bs4 import BeautifulSoup
	import os
	from collections import defaultdict

	# path to xml files
	xmlpath = 'XML'

	# list of sfl categories
	sfl_list = [
	!sudo yum -y install java
	!git clone https://www.github.com/interrogator/risk

	import corpkit
	from corpkit import interrogator, plotter, quickview
	import pandas as pd
	corpus = 'data/nyt/years'
	#immediate sister to left of risk word
	query = r'__ $. /(?i).?\brisk.?/'
	# interrogate, output words only
	1995
	[-5, [('the', 351), ('to', 226), ('of', 159), ('and', 157), ('a', 157), ('that', 156), ('in', 93), ('is', 80), ('for', 72), ('not', 68)]]
	[-4, [('the', 333), ('to', 199), ('a', 164), ('that', 161), ('of', 140), ('and', 133), ('it', 90), ('are', 89), ('is', 83), ('in', 79)]]
	[-3, [('to', 419), ('the', 263), ('a', 164), ('and', 150), ('that', 135), ('of', 124), ('are', 116), ('is', 113), ('there', 108), ('it', 98)]]
	[-2, [('the', 396), ('a', 292), ('of', 232), ('to', 204), ('is', 180), ('and', 180), ('at', 178), ('take', 128), ('that', 126), ('are', 97)]]
	[-1, [('the', 1733), ('at', 549), ('a', 432), ('and', 212), ('to', 191), ('of', 188), ('high', 127), ('health', 91), ('increased', 84), ('their', 77)]]
	[0, [('risk', 4749), ('risks', 1611), ('risky', 684), ('risking', 157), ('risked', 141), ('riskier', 82), ('riskiest', 32), ('riske', 31), ('riskin', 13), ('riskiness', 12)]]
	[1, [('of', 1591), ('to', 297), ('and', 240), ('for', 218), ('that', 196), ('in', 160), ('is', 116), ('the', 114), ('factors', 110),
	def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
	import os
	import nltk
	import re
	from collections import Counter
	import pandas as pd
	# get list of subcorpora
	dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
	# define risk word
	regex = r'(?i)\brisk'
	#eugene script

	def eugener(path = 'data/nyt/earlylate',
	regex = r'(?i)\brisk',
	depth = 5,
	top = 10,
	remove_stopwords = False):
	"""
	get most frequent words in corpus path to left and right
	2005
	-5 -4 -3 -2 -1 0 1 2 3 4 5 Total
	NN 52 37 43 32 29 88 47 57 40 36 34 495
	JJ 23 27 25 11 36 12 30 20 24 30 24 262
	NNS 20 23 19 18 15 50 19 26 25 20 26 261
	NNP 25 30 35 27 9 2 16 22 17 31 36 250
	risk 6 1 1 0 0 177 0 0 1 1 7 194
	Weeding-Out Urged In Foreign Service: U.S. URGED TO SIFT FOREIGN ...
	By E.W. KENWORTHY Special to The New York Times
	New York Times (1923-Current file); Dec 29, 1963;
	ProQuest Historical Newspapers: The New York Times (1851-2010)
	pg.1



	Weeding-Out Urged
	<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
	<plist version="1.0">
	<dict>
	<key>fileTypes</key>
	<array>
	<string>txt</string>
	</array>
	<key>name</key>
	<string>Plain Text</string>