Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active February 18, 2021 10:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/2b14a083da41850881ba2c0631b4ee75 to your computer and use it in GitHub Desktop.
Save andreasvc/2b14a083da41850881ba2c0631b4ee75 to your computer and use it in GitHub Desktop.
Find the longest sequence of tokens in a text without any taboo n-grams
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Given a set of taboo n-grams and a text, find the longest sequence of tokens\n",
"without any of the taboo n-grams. An n-gram is represented as a string of\n",
"space-separated tokens.\n",
"\n",
"Usage: longestnontaboo.py [--len] <taboofile> <filenames...>\n",
"\n",
"Specify taboo n-grams with one n-gram per line in taboofile;\n",
"multiple filenames can be specified.\n",
"By default the output is the longest non-taboo sequence.\n",
"With --len, only the length of this sequence is reported.\n"
]
}
],
"source": [
"import longestnontaboo\n",
"print(longestnontaboo.__doc__)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"austen-emma.txt: jane fairfax , standing with her back to them , intent on her pianoforte . busy as he was , however , the young man was yet able to shew a most happy countenance on seeing emma again . `` this is a pleasure , '' said he , in rather a low voice , `` coming at least ten minutes earlier than i had calculated . you find me trying to be useful ; tell me if you think i shall succeed . '' `` what ! '' said mrs. weston , `` have not you finished it yet ? you would not earn a very good livelihood as a working silversmith at this rate . '' `` i have not been working uninterruptedly , '' he replied , `` i have been assisting miss fairfax in trying to make her instrument stand steadily , it was not quite firm ; an unevenness in the floor , i believe . you see we have been wedging one leg with paper . this was very kind of you to be persuaded to come . i was almost afraid you would be hurrying home . '' he contrived that she should be seated by him ;\n"
]
}
],
"source": [
"longestnontaboo.main('taboowords.txt', 'austen-emma.txt')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"taboo = {'and', 'but', 'or', 'yes', 'no'}\n",
"tokens = longestnontaboo.preprocess('austen-emma.txt')\n",
"ngrams = longestnontaboo.getngrams(tokens, taboo)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPnklEQVR4nO3df4wc5X3H8fc3NiEVacNPnZBt9dzGamWECugEVImqLahgoKqJRJAjFOzIlf8xUiJZak1TiTYByfmjoUQqSG6wMCgKoPwQKCBRF1hF/YOfgQDGohxghC2DldiQHFFoj3z7xz5HV+bOO2vv7Z7veb+k1c0888zMM1+NPzc7N7uOzESSVIdPjHoAkqThMfQlqSKGviRVxNCXpIoY+pJUkaWjHsDRnHnmmTk+Pt73eu+//z6nnHLK4Ae0yFin5qxVM9apmfmu07PPPvuLzDxrtmULOvTHx8d55pln+l6v3W7TarUGP6BFxjo1Z62asU7NzHedIuLNuZZ5e0eSKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkiqyoD+Re7zGtz40kv3u3XbVSPYrSb14pS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIo1DPyKWRMRzEfGTMr8yIp6MiMmIuC8iPlnaTy7zk2X5eNc2biztr0TE5QM/GknSUfVzpf9VYE/X/LeAWzPzs8BhYGNp3wgcLu23ln5ExGpgHXAOsAa4PSKWHN/wJUn9aBT6EbEcuAr4bpkP4BLgB6XLTuDqMr22zFOWX1r6rwXuzcwPMvMNYBK4cADHIElqqOmV/r8Cfwf8rsyfAbybmdNlfh+wrEwvA94CKMvfK/0/ap9lHUnSEPT8auWI+GvgYGY+GxGt+R5QRGwCNgGMjY3Rbrf73sbU1BTtdpst50737jwPjmXMozBTJ/VmrZqxTs2Msk5Nvk//c8DfRMSVwKeAPwBuA06NiKXlan45sL/03w+sAPZFxFLgM8Avu9pndK/zkczcDmwHmJiYyFar1fdBtdttWq0WG0b1ffrXtUay337N1Em9WatmrFMzo6xTz9s7mXljZi7PzHE6f4h9LDOvAx4Hrind1gMPlOkHyzxl+WOZmaV9XXm6ZyWwCnhqYEciSerpeP7nrL8H7o2Im4HngDtL+53APRExCRyi84uCzNwdEfcDLwPTwObM/PA49i9J6lNfoZ+ZbaBdpl9nlqdvMvO3wBfnWP8W4JZ+BylJGgw/kStJFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFeoZ+RHwqIp6KiJ9HxO6I+OfSvjIinoyIyYi4LyI+WdpPLvOTZfl417ZuLO2vRMTl83ZUkqRZNbnS/wC4JDP/DDgPWBMRFwPfAm7NzM8Ch4GNpf9G4HBpv7X0IyJWA+uAc4A1wO0RsWSAxyJJ6qFn6GfHVJk9qbwSuAT4QWnfCVxdpteWecrySyMiSvu9mflBZr4BTAIXDuIgJEnNNLqnHxFLIuJ54CCwC3gNeDczp0uXfcCyMr0MeAugLH8POKO7fZZ1JElDsLRJp8z8EDgvIk4Ffgz86XwNKCI2AZsAxsbGaLfbfW9jamqKdrvNlnOne3eeB8cy5lGYqZN6s1bNWKdmRlmnRqE/IzPfjYjHgT8HTo2IpeVqfjmwv3TbD6wA9kXEUuAzwC+72md0r9O9j+3AdoCJiYlstVp9HRB0QrfVarFh60N9rzsIe69rjWS//Zqpk3qzVs1Yp2ZGWacmT++cVa7wiYjfA/4K2AM8DlxTuq0HHijTD5Z5yvLHMjNL+7rydM9KYBXw1ICOQ5LUQJMr/bOBneVJm08A92fmTyLiZeDeiLgZeA64s/S/E7gnIiaBQ3Se2CEzd0fE/cDLwDSwudw2kiQNSc/Qz8wXgPNnaX+dWZ6+yczfAl+cY1u3ALf0P0xJ0iD4iVxJqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkWW9uoQESuAu4ExIIHtmXlbRJwO3AeMA3uBazPzcEQEcBtwJfAbYENm/qxsaz3wj2XTN2fmzsEezsIwvvWhkex377arRrJfSSeOJlf608CWzFwNXAxsjojVwFbg0cxcBTxa5gGuAFaV1ybgDoDyS+Im4CLgQuCmiDhtgMciSeqhZ+hn5oGZK/XM/DWwB1gGrAVmrtR3AleX6bXA3dnxBHBqRJwNXA7sysxDmXkY2AWsGeTBSJKOruftnW4RMQ6cDzwJjGXmgbLobTq3f6DzC+GtrtX2lba52o/cxyY67xAYGxuj3W73M0QApqamaLfbbDl3uu91T2T91mqmTurNWjVjnZoZZZ0ah35EfBr4IfC1zPxV59Z9R2ZmROQgBpSZ24HtABMTE9lqtfreRrvdptVqsWFE99ZHZe91rb76z9RJvVmrZqxTM6OsU6OndyLiJDqB/73M/FFpfqfctqH8PFja9wMrulZfXtrmapckDUnP0C9P49wJ7MnMb3ctehBYX6bXAw90tV8fHRcD75XbQI8Al0XEaeUPuJeVNknSkDS5vfM54MvAixHxfGn7B2AbcH9EbATeBK4tyx6m87jmJJ1HNr8CkJmHIuKbwNOl3zcy89AgDkKS1EzP0M/M/wJijsWXztI/gc1zbGsHsKOfAUqSBsdP5EpSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SaqIoS9JFTH0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klSRnqEfETsi4mBEvNTVdnpE7IqIV8vP00p7RMR3ImIyIl6IiAu61llf+r8aEevn53AkSUfT5Er/LmDNEW1bgUczcxXwaJkHuAJYVV6bgDug80sCuAm4CLgQuGnmF4UkaXh6hn5m/hQ4dETzWmBnmd4JXN3Vfnd2PAGcGhFnA5cDuzLzUGYeBnbx8V8kkqR5tvQY1xvLzANl+m1grEwvA97q6revtM3V/jERsYnOuwTGxsZot9t9D25qaop2u82Wc6f7XvdE1m+tZuqk3qxVM9apmVHW6VhD/yOZmRGRgxhM2d52YDvAxMREtlqtvrfRbrdptVps2PrQoIZ1Qth7Xauv/jN1Um/Wqhnr1Mwo63SsT++8U27bUH4eLO37gRVd/ZaXtrnaJUlDdKyh/yAw8wTOeuCBrvbry1M8FwPvldtAjwCXRcRp5Q+4l5U2SdIQ9by9ExHfB1rAmRGxj85TONuA+yNiI/AmcG3p/jBwJTAJ/Ab4CkBmHoqIbwJPl37fyMwj/zgsSZpnPUM/M780x6JLZ+mbwOY5trMD2NHX6CRJA+UnciWpIoa+JFXE0Jekihz3c/paOMb7/FzClnOnB/ZZhr3brhrIdiTNL6/0Jakihr4kVcTQl6SKGPqSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0Jekihj6klQRQ1+SKmLoS1JFDH1JqoihL0kVMfQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRQx9SarI0lEPQIvD+NaHRrLfvduuGsl+pROVV/qSVBFDX5IqYuhLUkUMfUmqiKEvSRUx9CWpIoa+JFXE0JekivjhLJ3QhvWhsC3nTrPhiH35wTCdiLzSl6SKGPqSVJGh396JiDXAbcAS4LuZuW3YY5AGwe8b0oloqFf6EbEE+DfgCmA18KWIWD3MMUhSzYZ9pX8hMJmZrwNExL3AWuDlIY9DOmGN6h1GE7P9wXsQfHczOJGZw9tZxDXAmsz82zL/ZeCizLyhq88mYFOZ/RPglWPY1ZnAL45zuDWwTs1Zq2asUzPzXac/zMyzZluw4B7ZzMztwPbj2UZEPJOZEwMa0qJlnZqzVs1Yp2ZGWadhP72zH1jRNb+8tEmShmDYof80sCoiVkbEJ4F1wINDHoMkVWuot3cyczoibgAeofPI5o7M3D0Puzqu20MVsU7NWatmrFMzI6vTUP+QK0kaLT+RK0kVMfQlqSKLLvQjYk1EvBIRkxGxddTjWUgiYm9EvBgRz0fEM6Xt9IjYFRGvlp+njXqcwxYROyLiYES81NU2a12i4zvl/HohIi4Y3ciHa446/VNE7C/n1PMRcWXXshtLnV6JiMtHM+rhi4gVEfF4RLwcEbsj4qulfUGcU4sq9P2ah0b+MjPP63pGeCvwaGauAh4t87W5C1hzRNtcdbkCWFVem4A7hjTGheAuPl4ngFvLOXVeZj4MUP7drQPOKevcXv591mAa2JKZq4GLgc2lHgvinFpUoU/X1zxk5v8AM1/zoLmtBXaW6Z3A1aMbymhk5k+BQ0c0z1WXtcDd2fEEcGpEnD2UgY7YHHWay1rg3sz8IDPfACbp/Ptc9DLzQGb+rEz/GtgDLGOBnFOLLfSXAW91ze8rbepI4D8i4tnydRcAY5l5oEy/DYyNZmgLzlx18Rz7uBvKbYkdXbcHrRMQEePA+cCTLJBzarGFvo7u85l5AZ23k5sj4i+6F2bn+V2f4T2CdTmqO4A/Bs4DDgD/MtLRLCAR8Wngh8DXMvNX3ctGeU4tttD3ax6OIjP3l58HgR/Tebv9zsxbyfLz4OhGuKDMVRfPsS6Z+U5mfpiZvwP+nf+/hVN1nSLiJDqB/73M/FFpXhDn1GILfb/mYQ4RcUpE/P7MNHAZ8BKd+qwv3dYDD4xmhAvOXHV5ELi+PHFxMfBe11v26hxx7/kLdM4p6NRpXUScHBEr6fyR8qlhj28UIiKAO4E9mfntrkUL45zKzEX1Aq4E/ht4Dfj6qMezUF7AHwE/L6/dM7UBzqDzJMGrwH8Cp496rCOozffp3Jr4Xzr3UzfOVRcg6Dwh9hrwIjAx6vGPuE73lDq8QCe8zu7q//VSp1eAK0Y9/iHW6fN0bt28ADxfXlculHPKr2GQpIostts7kqSjMPQlqSKGviRVxNCXpIoY+pJUEUNfkipi6EtSRf4PEn2CCWX0GoMAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Where are the taboo matches?\n",
"indices = longestnontaboo.taboo_matches(ngrams, taboo)\n",
"# What is the distance between consecutive matches? (x-axis)\n",
"# how common is each distance? (y-axis)\n",
"(indices[1:] - indices[:-1].values).hist();"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jane fairfax , standing with her back to them , intent on her pianoforte . busy as he was , however , the young man was yet able to shew a most happy countenance on seeing emma again . `` this is a pleasure , '' said he , in rather a low voice , `` coming at least ten minutes earlier than i had calculated . you find me trying to be useful ; tell me if you think i shall succeed . '' `` what ! '' said mrs. weston , `` have not you finished it yet ? you would not earn a very good livelihood as a working silversmith at this rate . '' `` i have not been working uninterruptedly , '' he replied , `` i have been assisting miss fairfax in trying to make her instrument stand steadily , it was not quite firm ; an unevenness in the floor , i believe . you see we have been wedging one leg with paper . this was very kind of you to be persuaded to come . i was almost afraid you would be hurrying home . '' he contrived that she should be seated by him ;\n"
]
}
],
"source": [
"# Show the longest non-taboo sequence\n",
"result = longestnontaboo.longest_non_taboo_sequence(ngrams, taboo)\n",
"print(' '.join(result))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
"""Given a set of taboo n-grams and a text, find the longest sequence of tokens
without any of the taboo n-grams. An n-gram is represented as a string of
space-separated tokens.
Usage: longestnontaboo.py [--len] <taboofile> <filenames...>
Specify taboo n-grams with one n-gram per line in taboofile;
multiple filenames can be specified.
By default the output is the longest non-taboo sequence.
With --len, only the length of this sequence is reported."""
import sys
import nltk
import pandas
def preprocess(fname, lowercase=True):
"""Tokenize and lowercase text file."""
with open(fname, encoding='utf8') as inp:
text = inp.read()
if lowercase:
text = text.lower()
tokens = nltk.word_tokenize(text)
return pandas.Series(tokens, name=fname)
def getngrams(tokens, taboo):
"""return list `ngrams` s.t. ngrams[n - 1] contains n-grams of tokens,
up to a maximum n determined by n-gram strings in `taboo`."""
result = [tokens]
if not taboo:
return result
maxn = max(len(a.split()) for a in taboo)
# padding for the last n-grams of the text
padding = pandas.Series([''] * maxn)
padded = tokens.append(padding, ignore_index=True)
for n in range(2, maxn + 1):
subngrams = [padded[m:len(tokens) + m].values for m in range(1, n)]
ngrams = tokens.str.cat(subngrams, sep=' ')
result.append(ngrams)
return result
def taboo_matches(ngrams, taboo):
"""return all indices of tokens with taboo n-grams.
:param ngrams: a list of Series objects, must include n-grams up to
the longest n-gram in taboo.
:params taboo: a set of strings."""
tokens = ngrams[0]
istaboo = tokens.isin(taboo)
for x in ngrams[1:]:
istaboo |= x.isin(taboo)
if not istaboo.any():
return tokens
indices = tokens[istaboo].index
return pandas.Series(indices)
def longest_non_taboo_sequence(ngrams, taboo):
"""return longest sequence of tokens without any taboo n-grams.
:param ngrams: a list of Series objects, must include n-grams up to
the longest n-gram in taboo.
:params taboo: a set of strings."""
indices = taboo_matches(ngrams, taboo)
tokens = ngrams[0]
if len(indices) == 0:
return tokens
# pad the begin and end such since the first or last tokens may
# be the longest non-taboo sequence
indices = pandas.Series([-1] + list(indices) + [len(tokens)])
lengths = indices[1:].values - indices[:-1].values
longest = lengths.argmax()
start = indices[longest]
end = indices[longest + 1]
if start == -1:
start = 0
else:
start += max(n for a in taboo
for n, x in enumerate(ngrams, 1) if x[start] == a)
return tokens[start:end]
def test_getngrams():
tokens = pandas.Series(['a', 'b', 'c'])
assert getngrams(tokens, set()) == [tokens]
assert getngrams(tokens, {'a'}) == [tokens]
assert len(getngrams(tokens, {'a', 'a b'})) == 2
assert len(getngrams(tokens, {'a', 'a b', 'a b c'})) == 3
assert (getngrams(tokens, {'a', 'a b'})[1] == pandas.Series(
['a b', 'b c', 'c '])).all()
assert (getngrams(tokens, {'a', 'a b', 'a b c'})[2] == pandas.Series(
['a b c', 'b c ', 'c '])).all()
def test_longest_non_taboo_sequence():
text = 'the cat is on the mat . the cat is sleeping .'
tokens = pandas.Series(text.split())
ngrams = getngrams(tokens, {'the cat'})
assert len(longest_non_taboo_sequence(ngrams, {'the cat'})) == 5
assert (longest_non_taboo_sequence(ngrams, {'the cat'})
== 'is on the mat .'.split()).all()
assert (longest_non_taboo_sequence(ngrams, {'the cat', 'the'})
== 'is sleeping .'.split()).all()
assert (longest_non_taboo_sequence(ngrams, {'mat'})
== '. the cat is sleeping .'.split()).all()
assert (longest_non_taboo_sequence(ngrams, {'sleeping'})
== 'the cat is on the mat . the cat is'.split()).all()
def main(*argv):
"""CLI."""
if len(argv) < 2:
print(__doc__)
return
reportlen = '--len' in sys.argv
if reportlen:
argv.remove('--len')
taboofile = argv[0]
textfnames = argv[1:]
with open(taboofile, encoding='utf8') as inp:
taboo = set(inp.read().splitlines())
for fname in textfnames:
tokens = preprocess(fname)
ngrams = getngrams(tokens, taboo)
result = longest_non_taboo_sequence(ngrams, taboo)
if reportlen:
print('%s: %d' % (fname, len(result)))
else:
print('%s: %s' % (fname, ' '.join(result)))
if __name__ == '__main__':
main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment