Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Example of scraping Markdown and code cells from Jupyter notebooks into a SQLite database, then running a text concordance on the results
import os
def nbpathwalk(path):
    """Walk down a directory tree looking for ipynb notebook files.

    Args:
        path: Root directory to search.

    Yields:
        The joined directory and file name of every file whose name ends in
        ``.ipynb``. Any directory whose path contains ``.ipynb_checkpoints``
        is skipped.
    """
    for root, dirs, files in os.walk(path):
        # Prune checkpoint directories in place so os.walk never descends
        # into them, instead of visiting every one only to discard it.
        dirs[:] = [d for d in dirs if '.ipynb_checkpoints' not in d]
        # Still needed when the starting path itself is inside a checkpoint
        # directory.
        if '.ipynb_checkpoints' in root:
            continue
        for name in files:
            if name.endswith('.ipynb'):
                yield os.path.join(root, name)
import nbformat
def get_cell_contents(nb_fn, c_md=None, cell_typ=None):
    """Extract the content of Jupyter notebook cells.

    Args:
        nb_fn: Path of the notebook file to read.
        c_md: Optional list of previously collected cells. It is not
            modified; the result is a new list that starts with its items.
            Defaults to an empty list.
        cell_typ: Cell types to keep. Defaults to ``['markdown']``.

    Returns:
        A new list holding the items of ``c_md`` followed by the matching
        cells of ``nb_fn``. Each new cell gains an ``ix`` key (index as a
        string) and a ``title`` key (set to ``nb_fn``).
    """
    if cell_typ is None:
        cell_typ = ['markdown']
    if c_md is None:
        c_md = []

    # Read the notebook without converting it to another format version.
    nb = nbformat.read(nb_fn, nbformat.NO_CONVERT)
    selected = [cell for cell in nb.cells if cell['cell_type'] in cell_typ]

    # NOTE(review): the starting index is assumed to continue from the cells
    # already collected in c_md (0 for an empty list) - confirm against the
    # intended numbering.
    for ix, cell in enumerate(selected, start=len(c_md)):
        cell.update({"ix": str(ix), "title": nb_fn})

    return c_md + selected
import sqlite3
def index_notebooks_sqlite(nbpath='.', outfile='notebooks.sqlite', jsonp=None):
    """Get content from each notebook down a path and index it.

    Drops and recreates the full-text-search table ``nbindex`` in the sqlite
    database ``outfile``, then inserts one row per markdown or code cell of
    every notebook found by ``nbpathwalk(nbpath)``.

    Args:
        nbpath: Root directory searched for notebooks.
        outfile: Path of the sqlite database file to write.
        jsonp: Unused; kept so existing callers keep working.
    """
    conn = sqlite3.connect(outfile)
    try:
        cur = conn.cursor()
        # Start from a clean table on every run.
        cur.execute('''DROP TABLE IF EXISTS nbindex''')
        # Enable full text search
        cur.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS nbindex USING fts4(title text, source text, ix text PRIMARY KEY, cell_type text)''')

        for fn in nbpathwalk(nbpath):
            # Each notebook is read on its own; no cells are carried over
            # from previously processed notebooks.
            cells = get_cell_contents(fn, cell_typ=['markdown', 'code'])
            cur.executemany(
                "INSERT INTO nbindex VALUES (?,?,?,?)",
                (
                    (cell['title'], cell['source'], cell['ix'], cell['cell_type'])
                    for cell in cells
                ),
            )

        # Save (commit) the changes.
        conn.commit()
    finally:
        # Always release the db connection, even if indexing failed part-way.
        conn.close()
import nltk
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    """Token concordance for multiple contiguous tokens.

    Args:
        text: Object exposing a ``tokens`` sequence (e.g. an ``nltk.Text``).
        phrase: Phrase to look for; it is split on single spaces into tokens.
        left_margin: Number of tokens of context to keep before each match.
        right_margin: Number of tokens of context to keep after each match.

    Returns:
        A list of strings, one per match in order of position in the text.
        Each string is the matched phrase with its surrounding context, every
        token followed by a single space.
    """
    phrase_tokens = phrase.split(' ')
    # Tokens are indexed by their lower-cased form.
    index = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    # For each token in the phrase, find its offsets and rebase them to the
    # position where the phrase would have to start.
    rebased = [
        {offset - position for offset in index.offsets(token)}
        for position, token in enumerate(phrase_tokens)
    ]
    # The phrase occurs wherever the rebased offsets of all its tokens agree.
    # Sorted so that results come out in document order.
    starts = sorted(set.intersection(*rebased))

    outputs = []
    for start in starts:
        # Clamp the left edge so a match near the start does not wrap around.
        first = max(start - left_margin, 0)
        last = start + len(phrase_tokens) + right_margin
        outputs.append(''.join(token + ' ' for token in text.tokens[first:last]))
    return outputs
def n_concordance(txt, phrase, left_margin=5, right_margin=5):
    """Find text concordance for a phrase.

    Tokenises ``txt`` with ``nltk.word_tokenize``, wraps the tokens in an
    ``nltk.Text`` and passes it to ``n_concordance_tokenised`` together with
    the phrase and margins.

    Args:
        txt: Raw text to search.
        phrase: Phrase to look for.
        left_margin: Forwarded to ``n_concordance_tokenised``.
        right_margin: Forwarded to ``n_concordance_tokenised``.

    Returns:
        Whatever ``n_concordance_tokenised`` returns for the tokenised text.
    """
    wrapped = nltk.Text(nltk.word_tokenize(txt))
    return n_concordance_tokenised(
        wrapped,
        phrase,
        left_margin=left_margin,
        right_margin=right_margin,
    )
# Generate the sqlite db of notebook cell contents before running the query below, e.g. index_notebooks_sqlite('.')
import pandas as pd
# Run query and pull results into a pandas dataframe.
# The connection is closed explicitly: using a sqlite3 connection as a
# context manager only commits or rolls back, it does not close it.
conn = sqlite3.connect('notebooks.sqlite')
try:
    df = pd.read_sql_query("SELECT * from nbindex WHERE source MATCH 'this notebook' LIMIT 10", conn)
finally:
    conn.close()
# Apply concordance to the source column in each row of the dataframe,
# keeping one token of context on either side of the phrase.
df['source'].apply(n_concordance, args=('this notebook', 1, 1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment