Last active
December 9, 2022 22:42
-
-
Save psychemedia/925e190e2afd15b050f32334ceff9ef6 to your computer and use it in GitHub Desktop.
Example of scraping markdown and code cells from Jupyter notebooks into a SQLite db, then running a text concordance over the result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
def nbpathwalk(path):
    '''Walk down a directory path, yielding the full path of every
    .ipynb notebook file found, skipping Jupyter checkpoint folders.

    :param path: root directory to search from.
    :yields: path to each notebook file (str).
    '''
    # Use a distinct loop variable (dirpath) rather than shadowing the
    # `path` parameter, which the original code did.
    for dirpath, _, files in os.walk(path):
        # .ipynb_checkpoints folders hold Jupyter's autosave copies — skip them.
        if '.ipynb_checkpoints' in dirpath:
            continue
        for fname in files:
            if fname.endswith('.ipynb'):
                yield os.path.join(dirpath, fname)
import nbformat | |
def get_cell_contents(nb_fn, c_md=None, cell_typ=None):
    '''Extract the content of Jupyter notebook cells.

    Reads the notebook at `nb_fn`, selects cells whose type is in
    `cell_typ` (default: markdown only), tags each selected cell in
    place with a running index ("ix", as a string, continuing on from
    len(c_md)) and the notebook filename ("title"), and returns a new
    list of any previously collected cells plus the new ones.
    '''
    cell_typ = ['markdown'] if cell_typ is None else cell_typ
    c_md = [] if c_md is None else c_md
    # NO_CONVERT keeps the notebook at its on-disk format version.
    nb = nbformat.read(nb_fn, nbformat.NO_CONVERT)
    selected = [cell for cell in nb.cells if cell['cell_type'] in cell_typ]
    # Number the new cells starting after any already-collected ones.
    for pos, cell in enumerate(selected, start=len(c_md)):
        cell.update({"ix": str(pos)})
        cell.update({"title": nb_fn})
    return c_md + selected
import sqlite3 | |
def index_notebooks_sqlite(nbpath='.', outfile='notebooks.sqlite', jsonp=None):
    ''' Get content from each notebook down a path and index it.

    Walks `nbpath` for notebooks, extracts markdown and code cells, and
    writes them into an FTS4 full-text-search table `nbindex` in the
    SQLite database `outfile` (any existing table is dropped first).

    :param nbpath: root directory to search for notebooks.
    :param outfile: SQLite database file to (re)create the index in.
    :param jsonp: unused; kept for backward compatibility.
    '''
    conn = sqlite3.connect(outfile)
    # try/finally guarantees the connection is closed even if reading a
    # notebook or executing a statement raises (the original leaked it).
    try:
        # Create table
        c = conn.cursor()
        c.execute('''DROP TABLE IF EXISTS nbindex''')
        #Enable full text search
        c.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS nbindex USING fts4(title text, source text, ix text PRIMARY KEY, cell_type text)''')
        c_md=[]
        for fn in nbpathwalk(nbpath):
            cells = get_cell_contents(fn,c_md, cell_typ=['markdown','code'])
            for cell in cells:
                # Insert a row of data (parameterized to avoid injection/quoting issues)
                c.execute("INSERT INTO nbindex VALUES (?,?,?,?)",(cell['title'],cell['source'],
                                                                  cell['ix'], cell['cell_type']))
        # Save (commit) the changes
        conn.commit()
    finally:
        conn.close()
#https://blog.ouseful.info/2015/12/13/n-gram-phrase-based-concordances-in-nltk/ | |
import nltk | |
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    '''Token concordance for multiple contiguous tokens.

    Finds every (case-insensitive) occurrence of the space-separated
    `phrase` in an nltk.Text, and returns a list of strings, each being
    the phrase with up to `left_margin` tokens of context before it and
    `right_margin` tokens after, joined by single spaces (with a
    trailing space, matching the original output format).
    '''
    #concordance replication via https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/
    phrase_tokens = phrase.split(' ')
    index = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    #Find the offsets for each token in the phrase, rebased so that each
    #token's offsets point at where the phrase would START
    offsets_norm = [[off - i for off in index.offsets(tok)]
                    for i, tok in enumerate(phrase_tokens)]
    #We have found the offset of a phrase if the rebased values intersect
    # http://stackoverflow.com/a/3852792/454773
    #(set.intersection takes an arbitrary number of arguments)
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])
    #Slice out the context window around each match; max(..., 0) clamps
    #the left edge at the start of the text (replaces the original's
    #convoluted single-element map/lambda, which computed the same value)
    windows = [text.tokens[max(offset - left_margin, 0):offset + len(phrase_tokens) + right_margin]
               for offset in intersects]
    return [''.join(tok + ' ' for tok in window) for window in windows]
def n_concordance(txt, phrase, left_margin=5, right_margin=5):
    '''Find text concordance for a phrase in a raw text string.

    Tokenises `txt`, wraps it as an nltk.Text, and delegates to
    n_concordance_tokenised for the actual phrase matching.
    '''
    nltk_text = nltk.Text(nltk.word_tokenize(txt))
    return n_concordance_tokenised(nltk_text, phrase,
                                   left_margin=left_margin,
                                   right_margin=right_margin)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Generate sqlite db of notebook(s) cell contents, indexing from the current directory
index_notebooks_sqlite('.')
import pandas as pd
# Run an FTS MATCH query and pull the results into a pandas dataframe
with sqlite3.connect('notebooks.sqlite') as conn:
    df = pd.read_sql_query("SELECT * from nbindex WHERE source MATCH 'this notebook' LIMIT 10", conn)
#Apply concordance to source column in each row in dataframe
#(left/right margin of 1 token; in a notebook the resulting Series is displayed)
df['source'].apply(n_concordance,args=('this notebook',1,1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment