Skip to content

Instantly share code, notes, and snippets.

View interrogator's full-sized avatar

Daniel interrogator

  • UZH
  • Zurich, Switzerland
View GitHub Profile
@interrogator
interrogator / gist:89eb901f28923ec847fd
Last active February 11, 2024 14:46
visualising a parse tree
def quicktree(sentence):
"""Parse a sentence and return a visual representation"""
from nltk import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget
from stat_parser import Parser
from IPython.display import display
from IPython.display import Image
parser = Parser()
parsed = parser.parse(sentence)
extract () {
if [ -f $1 ] ; then
case $1 in
*.tar.bz2) tar xjf $1 ;;
*.tar.gz) tar xzf $1 ;;
*.bz2) bunzip2 $1 ;;
*.rar) rar x $1 ;;
*.gz) gunzip $1 ;;
*.tar) tar xf $1 ;;
*.tbz2) tar xjf $1 ;;
@interrogator
interrogator / gist:2b3f37cc14712c5964c5
Created May 18, 2015 03:31
treebank sfl conversion
def parse_sfl(n = 3):
from bs4 import BeautifulSoup
import os
from collections import defaultdict
# path to xml files
xmlpath = 'XML'
# list of sfl categories
sfl_list = [
!sudo yum -y install java
!git clone https://www.github.com/interrogator/risk
import corpkit
from corpkit import interrogator, plotter, quickview
import pandas as pd
corpus = 'data/nyt/years'
# immediate sister to the left of the risk word
query = r'__ $. /(?i).?\brisk.?/'
# interrogate, output words only
1995
[-5, [('the', 351), ('to', 226), ('of', 159), ('and', 157), ('a', 157), ('that', 156), ('in', 93), ('is', 80), ('for', 72), ('not', 68)]]
[-4, [('the', 333), ('to', 199), ('a', 164), ('that', 161), ('of', 140), ('and', 133), ('it', 90), ('are', 89), ('is', 83), ('in', 79)]]
[-3, [('to', 419), ('the', 263), ('a', 164), ('and', 150), ('that', 135), ('of', 124), ('are', 116), ('is', 113), ('there', 108), ('it', 98)]]
[-2, [('the', 396), ('a', 292), ('of', 232), ('to', 204), ('is', 180), ('and', 180), ('at', 178), ('take', 128), ('that', 126), ('are', 97)]]
[-1, [('the', 1733), ('at', 549), ('a', 432), ('and', 212), ('to', 191), ('of', 188), ('high', 127), ('health', 91), ('increased', 84), ('their', 77)]]
[0, [('risk', 4749), ('risks', 1611), ('risky', 684), ('risking', 157), ('risked', 141), ('riskier', 82), ('riskiest', 32), ('riske', 31), ('riskin', 13), ('riskiness', 12)]]
[1, [('of', 1591), ('to', 297), ('and', 240), ('for', 218), ('that', 196), ('in', 160), ('is', 116), ('the', 114), ('factors', 110),
def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
import os
import nltk
import re
from collections import Counter
import pandas as pd
# get list of subcorpora
dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
# define risk word
regex = r'(?i)\brisk'
@interrogator
interrogator / gist:cf42d9e3faf44a2be55b
Created May 19, 2015 13:20
eugener-code-and-output
#eugene script
def eugener(path = 'data/nyt/earlylate',
regex = r'(?i)\brisk',
depth = 5,
top = 10,
remove_stopwords = False):
"""
get most frequent words in corpus path to left and right
2005
-5 -4 -3 -2 -1 0 1 2 3 4 5 Total
NN 52 37 43 32 29 88 47 57 40 36 34 495
JJ 23 27 25 11 36 12 30 20 24 30 24 262
NNS 20 23 19 18 15 50 19 26 25 20 26 261
NNP 25 30 35 27 9 2 16 22 17 31 36 250
risk 6 1 1 0 0 177 0 0 1 1 7 194
Weeding-Out Urged In Foreign Service: U.S. URGED TO SIFT FOREIGN ...
By E.W. KENWORTHY Special to The New York Times
New York Times (1923-Current file); Dec 29, 1963;
ProQuest Historical Newspapers: The New York Times (1851-2010)
pg.1
Weeding-Out Urged
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>fileTypes</key>
<array>
<string>txt</string>
</array>
<key>name</key>
<string>Plain Text</string>