Skip to content

Instantly share code, notes, and snippets.

@luerhard
Last active January 20, 2018 11:17
Show Gist options
  • Save luerhard/a5e2ade44e75e9a0a06216d682f723c0 to your computer and use it in GitHub Desktop.
Save luerhard/a5e2ade44e75e9a0a06216d682f723c0 to your computer and use it in GitHub Desktop.
A SqliteCorpusReader for nltk. To use it, instantiate a SqliteCorpusReader instance with the keyword arguments dbpath, table and field. To directly access other columns, add methods like SqliteCorpusReader.timestamps, .articles or .folder. It recognizes the DB structure automatically.
import sqlite3 as sq
from nltk.data import LazyLoader
from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation
from nltk.tokenize import WordPunctTokenizer, sent_tokenize
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.inspection import inspect
from sqlalchemy import func
class SqliteAbstractLazySequence(AbstractLazySequence):
    """Base class for lazy sequences backed by one table of a SQLite database.

    The schema is reflected through SQLAlchemy's automap extension, so the
    caller does not declare any model classes.  This class only provides
    construction and ``__len__``; subclasses supply ``iterate_from``.
    """

    def __init__(self, dbpath='path/to/sqlitedb.db', table='name_of_table',
                 field='name_of_content_column'):
        """Open the database and resolve the mapped class for ``table``.

        :param dbpath: path of the SQLite database file; appended to
            ``sqlite:///`` to form the engine URL.
        :param table: name of the reflected table to read from.
        :param field: name of the column that holds the text content.
        :raises AttributeError: if the reflected classes contain no entry
            named ``table``.
        """
        self.Base = automap_base()
        self.engine = create_engine('sqlite:///' + dbpath)
        self.Base.prepare(self.engine, reflect=True)
        self.session = Session(self.engine)
        # Plain attribute lookup instead of eval(): the table name comes from
        # the caller and must never be executed as Python source.
        self.table = getattr(self.Base.classes, table)
        self.field = field
        # Name of the first primary-key column of the mapped table.
        self.key = inspect(self.table).primary_key[0].name

    def __len__(self):
        """Return the number of rows, counted over the primary-key column."""
        key_column = getattr(self.table, self.key)
        return self.session.query(func.count(key_column)).scalar()
class TextSequence(SqliteAbstractLazySequence):
    """Sequence of the values stored in the configured content column."""

    def iterate_from(self, start=0):
        """Return an iterator over the content column, starting at row ``start``.

        ``start`` is a zero-based position in primary-key order, so that
        indexing and slicing stay consistent with ``__len__`` (a row count)
        even when key values have gaps or do not begin at 1.

        :param start: zero-based index of the first row to yield.
        :return: iterator over the ``field`` attribute of each row.
        """
        key_column = getattr(self.table, self.key)
        # Explicit ordering makes the offset deterministic; .all() consumes
        # the result set before the iterator is handed out.
        rows = (
            self.session.query(self.table)
            .order_by(key_column)
            .offset(start)
            .all()
        )
        return (getattr(row, self.field) for row in rows)
class PropertySequence(SqliteAbstractLazySequence):
    """Sequence over one chosen column, or over whole mapped row objects."""

    def __init__(self, *args, **kwargs):
        """Accept the base-class arguments plus an optional ``column`` keyword.

        :param column: name of the column to return for each row.  If it is
            omitted or falsy, the mapped SQLAlchemy object with all columns
            is returned instead.
        """
        # Remove our own keyword before delegating; the base __init__ does
        # not accept it.
        self.column = kwargs.pop('column', None)
        super().__init__(*args, **kwargs)

    def iterate_from(self, start=0):
        """Return an iterator over the selected column, starting at row ``start``.

        ``start`` is a zero-based position in primary-key order, so that
        indexing and slicing stay consistent with ``__len__`` (a row count).

        :param start: zero-based index of the first row to yield.
        :return: iterator over column values, or over the row objects
            themselves when no column was configured.
        """
        key_column = getattr(self.table, self.key)
        rows = (
            self.session.query(self.table)
            .order_by(key_column)
            .offset(start)
            .all()
        )
        if self.column:
            column_name = str(self.column)
            return (getattr(row, column_name) for row in rows)
        return iter(rows)
class SqliteCorpusReader(object):
    """Corpus reader that serves text and metadata from a SQLite table."""

    def __init__(self,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=sent_tokenize, **kwargs):
        """Create the reader.

        :param word_tokenizer: object whose ``tokenize`` method splits a
            text into words.
        :param sent_tokenizer: callable that splits a text into sentences.
        :param kwargs: forwarded to the sequence classes (``dbpath``,
            ``table``, ``field``).
        """
        self._seq = TextSequence(**kwargs)
        # Private copy, reused unchanged for every property sequence.
        self._kwargs = dict(kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer

    def text(self):
        """Return the sequence of raw texts from the content column."""
        return self._seq

    def _property(self, column=None):
        """Build a new ``PropertySequence`` for ``column``.

        The column is passed explicitly rather than written into the stored
        keyword arguments, so one call cannot affect a later one.  Each call
        constructs a fresh sequence object.
        """
        return PropertySequence(column=column, **self._kwargs)

    def words(self):
        """Return all word tokens of all texts as one lazy sequence."""
        return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))

    def sents(self):
        """Return all sentences of all texts as one lazy sequence."""
        return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))

    def articles(self):
        """Return the values of the ``content`` column."""
        return self._property(column='content')

    def timestamps(self):
        """Return the values of the ``published`` column."""
        return self._property(column='published')

    def folder(self):
        """Return the values of the ``folder`` column."""
        return self._property(column='folder')

    def objects(self):
        """Return the mapped row objects with all columns."""
        return self._property(column=None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment