Skip to content

Instantly share code, notes, and snippets.

View interrogator's full-sized avatar

Daniel interrogator

  • UZH
  • Zurich, Switzerland
View GitHub Profile
@interrogator
interrogator / gist:cf42d9e3faf44a2be55b
Created May 19, 2015 13:20
eugener-code-and-output
# eugener script
def eugener(path = 'data/nyt/earlylate',
regex = r'(?i)\brisk',
depth = 5,
top = 10,
remove_stopwords = False):
"""
get most frequent words in corpus path to left and right
def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10):
import os
import nltk
import re
from collections import Counter
import pandas as pd
# get list of subcorpora
dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
# define risk word
regex = r'(?i)\brisk'
1995
[-5, [('the', 351), ('to', 226), ('of', 159), ('and', 157), ('a', 157), ('that', 156), ('in', 93), ('is', 80), ('for', 72), ('not', 68)]]
[-4, [('the', 333), ('to', 199), ('a', 164), ('that', 161), ('of', 140), ('and', 133), ('it', 90), ('are', 89), ('is', 83), ('in', 79)]]
[-3, [('to', 419), ('the', 263), ('a', 164), ('and', 150), ('that', 135), ('of', 124), ('are', 116), ('is', 113), ('there', 108), ('it', 98)]]
[-2, [('the', 396), ('a', 292), ('of', 232), ('to', 204), ('is', 180), ('and', 180), ('at', 178), ('take', 128), ('that', 126), ('are', 97)]]
[-1, [('the', 1733), ('at', 549), ('a', 432), ('and', 212), ('to', 191), ('of', 188), ('high', 127), ('health', 91), ('increased', 84), ('their', 77)]]
[0, [('risk', 4749), ('risks', 1611), ('risky', 684), ('risking', 157), ('risked', 141), ('riskier', 82), ('riskiest', 32), ('riske', 31), ('riskin', 13), ('riskiness', 12)]]
[1, [('of', 1591), ('to', 297), ('and', 240), ('for', 218), ('that', 196), ('in', 160), ('is', 116), ('the', 114), ('factors', 110),
!sudo yum -y install java
!git clone https://www.github.com/interrogator/risk
import corpkit
from corpkit import interrogator, plotter, quickview
import pandas as pd
corpus = 'data/nyt/years'
#immediate sister to left of risk word
query = r'__ $. /(?i).?\brisk.?/'
# interrogate, output words only
@interrogator
interrogator / gist:2b3f37cc14712c5964c5
Created May 18, 2015 03:31
treebank sfl conversion
def parse_sfl(n = 3):
from bs4 import BeautifulSoup
import os
from collections import defaultdict
# path to xml files
xmlpath = 'XML'
# list of sfl categories
sfl_list = [
extract () {
if [ -f $1 ] ; then
case $1 in
*.tar.bz2) tar xjf $1 ;;
*.tar.gz) tar xzf $1 ;;
*.bz2) bunzip2 $1 ;;
*.rar) rar x $1 ;;
*.gz) gunzip $1 ;;
*.tar) tar xf $1 ;;
*.tbz2) tar xjf $1 ;;
@interrogator
interrogator / gist:89eb901f28923ec847fd
Last active February 11, 2024 14:46
visualising a parse tree
def quicktree(sentence):
"""Parse a sentence and return a visual representation"""
from nltk import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget
from stat_parser import Parser
from IPython.display import display
from IPython.display import Image
parser = Parser()
parsed = parser.parse(sentence)