Last active
December 9, 2022 22:42
-
-
Save psychemedia/925e190e2afd15b050f32334ceff9ef6 to your computer and use it in GitHub Desktop.
Example of scraping markdown and code cells from Jupyter notebooks into a SQLite db, then running a text concordance over the result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
def nbpathwalk(path):
    '''Walk down a directory path, yielding the full path of every
    .ipynb notebook file found, skipping Jupyter checkpoint folders.

    :param path: root directory to search from.
    :yields: path to each notebook file (str).
    '''
    # Use a distinct loop variable (dirpath) rather than shadowing the
    # `path` parameter, which the original code did.
    for dirpath, _, files in os.walk(path):
        # .ipynb_checkpoints folders hold Jupyter's autosave copies — skip them.
        if '.ipynb_checkpoints' in dirpath:
            continue
        for fname in files:
            if fname.endswith('.ipynb'):
                yield os.path.join(dirpath, fname)
import nbformat | |
def get_cell_contents(nb_fn, c_md=None, cell_typ=None):
    '''Extract the content of Jupyter notebook cells.

    Reads the notebook at `nb_fn`, selects cells whose type is in
    `cell_typ` (default: markdown only), tags each selected cell in
    place with a running index ("ix", as a string, continuing on from
    len(c_md)) and the notebook filename ("title"), and returns a new
    list of any previously collected cells plus the new ones.
    '''
    cell_typ = ['markdown'] if cell_typ is None else cell_typ
    c_md = [] if c_md is None else c_md
    # NO_CONVERT keeps the notebook at its on-disk format version.
    nb = nbformat.read(nb_fn, nbformat.NO_CONVERT)
    selected = [cell for cell in nb.cells if cell['cell_type'] in cell_typ]
    # Number the new cells starting after any already-collected ones.
    for pos, cell in enumerate(selected, start=len(c_md)):
        cell.update({"ix": str(pos)})
        cell.update({"title": nb_fn})
    return c_md + selected
import sqlite3 | |
def index_notebooks_sqlite(nbpath='.', outfile='notebooks.sqlite', jsonp=None):
    ''' Get content from each notebook down a path and index it.

    Walks `nbpath` for notebooks, extracts markdown and code cells, and
    writes them into an FTS4 full-text-search table `nbindex` in the
    SQLite database `outfile` (any existing table is dropped first).

    :param nbpath: root directory to search for notebooks.
    :param outfile: SQLite database file to (re)create the index in.
    :param jsonp: unused; kept for backward compatibility.
    '''
    conn = sqlite3.connect(outfile)
    # try/finally guarantees the connection is closed even if reading a
    # notebook or executing a statement raises (the original leaked it).
    try:
        # Create table
        c = conn.cursor()
        c.execute('''DROP TABLE IF EXISTS nbindex''')
        #Enable full text search
        c.execute('''CREATE VIRTUAL TABLE IF NOT EXISTS nbindex USING fts4(title text, source text, ix text PRIMARY KEY, cell_type text)''')
        c_md=[]
        for fn in nbpathwalk(nbpath):
            cells = get_cell_contents(fn,c_md, cell_typ=['markdown','code'])
            for cell in cells:
                # Insert a row of data (parameterized to avoid injection/quoting issues)
                c.execute("INSERT INTO nbindex VALUES (?,?,?,?)",(cell['title'],cell['source'],
                                                                  cell['ix'], cell['cell_type']))
        # Save (commit) the changes
        conn.commit()
    finally:
        conn.close()
#https://blog.ouseful.info/2015/12/13/n-gram-phrase-based-concordances-in-nltk/ | |
import nltk | |
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    '''Token concordance for multiple contiguous tokens.

    Finds every (case-insensitive) occurrence of the space-separated
    `phrase` in an nltk.Text, and returns a list of strings, each being
    the phrase with up to `left_margin` tokens of context before it and
    `right_margin` tokens after, joined by single spaces (with a
    trailing space, matching the original output format).
    '''
    #concordance replication via https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/
    phrase_tokens = phrase.split(' ')
    index = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    #Find the offsets for each token in the phrase, rebased so that each
    #token's offsets point at where the phrase would START
    offsets_norm = [[off - i for off in index.offsets(tok)]
                    for i, tok in enumerate(phrase_tokens)]
    #We have found the offset of a phrase if the rebased values intersect
    # http://stackoverflow.com/a/3852792/454773
    #(set.intersection takes an arbitrary number of arguments)
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])
    #Slice out the context window around each match; max(..., 0) clamps
    #the left edge at the start of the text (replaces the original's
    #convoluted single-element map/lambda, which computed the same value)
    windows = [text.tokens[max(offset - left_margin, 0):offset + len(phrase_tokens) + right_margin]
               for offset in intersects]
    return [''.join(tok + ' ' for tok in window) for window in windows]
def n_concordance(txt, phrase, left_margin=5, right_margin=5):
    '''Find text concordance for a phrase in a raw text string.

    Tokenises `txt`, wraps it as an nltk.Text, and delegates to
    n_concordance_tokenised for the actual phrase matching.
    '''
    nltk_text = nltk.Text(nltk.word_tokenize(txt))
    return n_concordance_tokenised(nltk_text, phrase,
                                   left_margin=left_margin,
                                   right_margin=right_margin)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Generate sqlite db of notebook(s) cell contents, indexing from the current directory
index_notebooks_sqlite('.')
import pandas as pd
# Run an FTS MATCH query and pull the results into a pandas dataframe
with sqlite3.connect('notebooks.sqlite') as conn:
    df = pd.read_sql_query("SELECT * from nbindex WHERE source MATCH 'this notebook' LIMIT 10", conn)
#Apply concordance to source column in each row in dataframe
#(left/right margin of 1 token; in a notebook the resulting Series is displayed)
df['source'].apply(n_concordance,args=('this notebook',1,1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment