Skip to content

Instantly share code, notes, and snippets.

@Fohlen
Created December 4, 2019 09:22
Show Gist options
  • Save Fohlen/8ad97cd0be73d8eb251912eed9acb24d to your computer and use it in GitHub Desktop.
Save Fohlen/8ad97cd0be73d8eb251912eed9acb24d to your computer and use it in GitHub Desktop.
British national corpus
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import collections\n",
"from nltk.corpus.reader.bnc import BNCCorpusReader\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"bnc_root = pathlib.Path(pathlib.Path.home(), 'Downloads/download/Texts')\n",
"bnc_reader = BNCCorpusReader(root=str(bnc_root), fileids=r'[A-K]/\\w*/\\w*\\.xml')\n",
"words = bnc_reader.words(fileids=[f for f in bnc_reader.fileids() if f.startswith('A/')][:10]) # the words of the first 10 documents included in the general corpus"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"frequencies = collections.Counter(words)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x120167dd8>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAbFUlEQVR4nO3de5hU9Z3n8feHbkBAubdIAAdURkfNzgZ7kEyyWVcTwMsT3Bnjo7M7koQn7JMxM8lsshnNZc0mcaObTTROErJEMWhcL2MSZeOFENQxXkAaRQQRaUGkO1xamptc+/LdP+rXbdHd1XR3NV1N1+f1PP30Od/zO6d+53R1fepc6pQiAjMzK279Ct0BMzMrPIeBmZk5DMzMzGFgZmY4DMzMDCgtdAe6avTo0TFx4sRCd8PM7ISycuXKdyOirGX9hA2DiRMnUlFRUehumJmdUCRtbqvuw0RmZuYwMDMzh4GZmdGBMJC0QNIOSWvamPZlSSFpdBqXpDskVUpaLWlKVtvZkjakn9lZ9QskvZbmuUOSumvlzMysYzqyZ/ALYGbLoqQJwHTgnazypcDk9DMXmJfajgRuAi4EpgI3SRqR5pkHfC5rvlaPZWZmx9cxwyAingVq25h0G/BVIPtOd7OAeyJjGTBc0lhgBrAkImojYhewBJiZpg2NiGWRuWPePcCV+a2SmZl1VpfOGUiaBVRHxKstJo0DtmSNV6Vae/WqNuq5HneupApJFTU1NV3pupmZtaHTYSBpMPA14L93f3faFxHzI6I8IsrLylp9ZqJD9h2q4+7nN7Fu695u7p2Z2YmrKx86OxOYBLyazvWOB16WNBWoBiZktR2fatXARS3qz6T6+DbaHzdz71nJixt3AvD2LZcfz4cyMzthdHrPICJei4hTI2JiREwkc2hnSkRsAxYB16WriqYBeyJiK7AYmC5pRDpxPB1YnKbtlTQtXUV0HfBoN61bm1Zt2X08F29mdkLqyKWl9wMvAmdLqpI0p53mjwMbgUrg58DfAURELfAdYEX6+XaqkdrcmeZ5C3iia6tiZmZddczDRBFx7TGmT8waDuD6HO0WAAvaqFcA5x+rH2Zmdvz4E8hmZuYwMDMzh4GZmeEwMDMzHAZmZobDwMzMcBiYmRkOAzMzw2FgZmYUYRjEUV+/YGZmUIRhYGZmrTkMzMzMYWBmZg4DMzPDYWBmZhRhGAgVugtmZr1O0YWBmZm15jAwMzOHgZmZdSAMJC2QtEPSmqza9yW9IWm1pN9IGp417UZJlZLWS5qRVZ+ZapWSbsiqT5K0PNUflDSgO1fQzMyOrSN7Br8AZraoLQHOj4h/A7wJ3Agg6VzgGuC8NM9PJZVIKgF+AlwKnAtcm9oC3ArcFhFnAbuAOXmtkZmZddoxwyAingVqW9R+FxH1aXQZMD4NzwIeiIjDEbEJqASmpp/KiNgYEUeAB4BZkgRcDDyc5l8IXJnnOpmZWSd1xzmDzwJPpOFxwJasaVWplqs+CtidFSxN9TZJmiupQlJFTU1NlzrrG9WZmbWWVxhI+jpQD9zXPd1pX0TMj4jyiCgvKyvriYc0MysKpV2dUdKngSuASyKi6e12NTAhq9n4VCNHfScwXFJp2jvIbm9mZj2kS3sGkmYCXwU+GREHsiYtAq6RNFDSJGAy8BKwApicrhwaQOYk86IUIk8DV6X5ZwOPdm1VzMysqzpyaen9wIvA2ZKqJM0BfgycAiyRtErSzwAiYi3wEPA68CRwfUQ0pHf9XwAWA+uAh1JbgH8C/qukSjLnEO7q1jU0M7NjOuZhooi4to1yzhfsiLgZuLmN+uPA423UN5K52sjMzArEn0A2MzOHgZmZOQzMzAyHgZmZ4TAwMzOKMAz8TWdmZq0VXRiYmVlrRRcGvlGdmVlrRRcGZmbWmsPAzMwcBmZm5jAwMzMcBmZmhsPAzMxwGJiZGQ4DMzPDYWBmZjgMzMwMh4GZme
EwMDMzOhAGkhZI2iFpTVZtpKQlkjak3yNSXZLukFQpabWkKVnzzE7tN0ianVW/QNJraZ47JPke02ZmPawjewa/AGa2qN0ALI2IycDSNA5wKTA5/cwF5kEmPICbgAuBqcBNTQGS2nwua76Wj9WtwjctNTNr5ZhhEBHPArUtyrOAhWl4IXBlVv2eyFgGDJc0FpgBLImI2ojYBSwBZqZpQyNiWUQEcE/WsszMrId09ZzBmIjYmoa3AWPS8DhgS1a7qlRrr17VRr1NkuZKqpBUUVNT06WO+yCUmVlreZ9ATu/oe+TgS0TMj4jyiCgvKyvriYc0MysKXQ2D7ekQD+n3jlSvBiZktRufau3Vx7dRNzOzHtTVMFgENF0RNBt4NKt+XbqqaBqwJx1OWgxMlzQinTieDixO0/ZKmpauIroua1lmZtZDSo/VQNL9wEXAaElVZK4KugV4SNIcYDNwdWr+OHAZUAkcAD4DEBG1kr4DrEjtvh0RTSel/47MFUuDgCfSj5mZ9aBjhkFEXJtj0iVttA3g+hzLWQAsaKNeAZx/rH6Ymdnx408gm5mZw8DMzBwGZmaGw8DMzHAYmJkZRRgGvlGdmVlrRRcGZmbWmsPAzMwcBmZm5jAwMzMcBmZmhsPAzMwowjDwN52ZmbVWdGFgZmatOQzMzMxhYGZmDgMzM8NhYGZmFGEY+EZ1Zmat5RUGkv5R0lpJayTdL+kkSZMkLZdUKelBSQNS24FpvDJNn5i1nBtTfb2kGfmtkpmZdVaXw0DSOOAfgPKIOB8oAa4BbgVui4izgF3AnDTLHGBXqt+W2iHp3DTfecBM4KeSSrraLzMz67x8DxOVAoMklQKDga3AxcDDafpC4Mo0PCuNk6ZfIkmp/kBEHI6ITUAlMDXPfpmZWSd0OQwiohr438A7ZEJgD7AS2B0R9alZFTAuDY8DtqR561P7Udn1NuYxM7MekM9hohFk3tVPAj4ADCFzmOe4kTRXUoWkipqamuP5UGZmRSWfw0QfBzZFRE1E1AG/Bj4CDE+HjQDGA9VpuBqYAJCmDwN2ZtfbmOcoETE/IsojorysrCyPrpuZWbZ8wuAdYJqkwenY/yXA68DTwFWpzWzg0TS8KI2Tpj8VEZHq16SrjSYBk4GX8uiXmZl1Uumxm7QtIpZLehh4GagHXgHmA48BD0j6bqrdlWa5C7hXUiVQS+YKIiJiraSHyARJPXB9RDR0tV9mZtZ5XQ4DgIi4CbipRXkjbVwNFBGHgE/lWM7NwM359MXMzLqu6D6BbGZmrTkMzMys+MLA33RmZtZa0YWBb1RnZtZa0YWBmZm15jAwMzOHgZmZOQzMzAyHgZmZ4TAwMzMcBmZmhsPAzMwowjA4XN9Y6C6YmfU6RRcGZmbWmsPAzMwcBmZm5jAwMzMcBmZmhsPAzMxwGJiZGXmGgaThkh6W9IakdZI+LGmkpCWSNqTfI1JbSbpDUqWk1ZKmZC1ndmq/QdLsfFfKzMw6J989gx8BT0bEOcCfA+uAG4ClETEZWJrGAS4FJqefucA8AEkjgZuAC4GpwE1NAWJmZj2jy2EgaRjwMeAugIg4EhG7gVnAwtRsIXBlGp4F3BMZy4DhksYCM4AlEVEbEbuAJcDMrvbLzMw6L589g0lADXC3pFck3SlpCDAmIramNtuAMWl4HLAla/6qVMtVb0XSXEkVkipqamry6LqZmWXLJwxKgSnAvIj4ELCf9w8JARARAXTbV9BHxPyIKI+I8rKysu5arJlZ0csnDKqAqohYnsYfJhMO29PhH9LvHWl6NTAha/7xqZarbmZmPaTLYRAR24Atks5OpUuA14FFQNMVQbOBR9PwIuC6dFXRNGBPOpy0GJguaUQ6cTw91czMrIeU5jn/3wP3SRoAbAQ+QyZgHpI0B9gMXJ3aPg5cBlQCB1JbIqJW0neAFandtyOiNs9+mZlZJ+QVBhGxCihvY9IlbbQN4Pocy1kALMinL2Zm1nX+BLKZmTkMzMzMYWBmZj
gMzMwMh4GZmVHkYbD7wJFCd8HMrFco6jCY9r2lhe6CmVmvUNRhcKiusdBdMDPrFYo6DMzMLMNhYGZmDgMzM3MYmJkZDgMzM8NhYGZmOAzMzAyHgZmZ4TAwMzMcBmZmhsPAzMxwGJiZGd0QBpJKJL0i6bdpfJKk5ZIqJT0oaUCqD0zjlWn6xKxl3Jjq6yXNyLdPZmbWOd2xZ/BFYF3W+K3AbRFxFrALmJPqc4BdqX5baoekc4FrgPOAmcBPJZV0Q7/MzKyD8goDSeOBy4E707iAi4GHU5OFwJVpeFYaJ02/JLWfBTwQEYcjYhNQCUzNp19mZtY5+e4Z3A58FWj6YoBRwO6IqE/jVcC4NDwO2AKQpu9J7ZvrbcxzFElzJVVIqqipqcmz62Zm1qTLYSDpCmBHRKzsxv60KyLmR0R5RJSXlZX11MOamfV5pXnM+xHgk5IuA04ChgI/AoZLKk3v/scD1al9NTABqJJUCgwDdmbVm2TPY2ZmPaDLewYRcWNEjI+IiWROAD8VEf8JeBq4KjWbDTyahhelcdL0pyIiUv2adLXRJGAy8FJX+2VmZp2Xz55BLv8EPCDpu8ArwF2pfhdwr6RKoJZMgBARayU9BLwO1APXR0TDceiXmZnl0C1hEBHPAM+k4Y20cTVQRBwCPpVj/puBm7ujL2Zm1nn+BLKZmTkMzMzMYWBmZjgMzMwMh4GZmeEwMDMzHAZmZobDwMzMcBiYmRkOAzMzw2FgZmY4DMzMDIeBmZnhMDAzMxwGZmaGw8DMzHAYmJkZDgMzM8NhYGZm5BEGkiZIelrS65LWSvpiqo+UtETShvR7RKpL0h2SKiWtljQla1mzU/sNkmbnv1pmZtYZ+ewZ1ANfjohzgWnA9ZLOBW4AlkbEZGBpGge4FJicfuYC8yATHsBNwIXAVOCmpgAxM7Oe0eUwiIitEfFyGt4HrAPGAbOAhanZQuDKNDwLuCcylgHDJY0FZgBLIqI2InYBS4CZXe2XmZl1XrecM5A0EfgQsBwYExFb06RtwJg0PA7YkjVbVarlqrf1OHMlVUiqqKmp6Y6um5kZ3RAGkk4GfgV8KSL2Zk+LiAAi38fIWt78iCiPiPKysrLuWqyZWdHLKwwk9ScTBPdFxK9TeXs6/EP6vSPVq4EJWbOPT7Vc9R6x71Adh+sbeurhzMx6pXyuJhJwF7AuIn6YNWkR0HRF0Gzg0az6demqomnAnnQ4aTEwXdKIdOJ4eqr1iA9+63dc/bMXe+rhzMx6pdI85v0I8LfAa5JWpdrXgFuAhyTNATYDV6dpjwOXAZXAAeAzABFRK+k7wIrU7tsRUZtHvzrt1ao97U7f+d5hlm+q5bIPju2hHpmZ9awuh0FEPAcox+RL2mgfwPU5lrUAWNDVvhxvn11YwatbdvPKNz/BiCEDCt0dM7Nu508gd0D1roMA1DU2FrgnZmbHh8OgA9S0/9Nt10WZmfUuDoNOcBaYWV/lMOiA5h0Dp4GZ9VEOgw5oOkwU3jcwsz7KYdDCngN1VO5476ia0r6B9wzMrK9yGLRwxY//wMd/+K9H1ZTrAlozsz7CYdDCltqDrWq+mMjM+jqHQQdITYeJ+l4cPLN+B5t37i90N8yswBwGOVw7f1mrWh/MAj599wr+/fefKXQ3zKzAHAY5vLhxZ6G7YGbWYxwGyVs17+Wc1nxpaR/cMzAzA4dBsz0H67j1yTeOqv1wyZtU7TpAXUPmnkT+nIGZ9VX53MK6T/mrn77QqnbH0g3csXRDAXpjZtazvGfQCT5MZGZ9lcOgE45XFqyp3sO8Z946Tks3Mzs2h0EnZH/O4Lu/fZ07/7CxW5Z7xT8/1+p8hZlZT3IYdMLFP3j/NhV3PreJ7z62roC9MTPrPg6DTtpzsO6o8bdq3mPF221/ZfPSdduZefuz1Dc0snJzLZve9Sd9rX0ba97joRVbCt2NXu3Td7/Er1ZWFbobfY6vJuqk7y9+g5FDBjaPXzXvBXYdqON3//gxvvHIGr
4y/Wz2HarjvuXv8NQbOwCoee8wfz3vRQDevuXyo5a3Y++h5uGIaL71RXv+uPsgHxg+qN02dz+/idr9R/jy9LM7vG5WeDN/9AeO1Ddy9V9MKHRXeoVfLtvMX545ijPKTm6uPbO+hmfW1/DXF4zvkT688Na7jBoykLNPO6VHHq9Qes2egaSZktZLqpR0Q6H7k8svl71z1OWmuw5k9hSm3/YsL22q5er/8yJzFlY0BwHAh7/3VPNw5Y73+C/3VjDvmbfYvHM/U//n0uZpt/9+A9f/35eZeMNj7Nh3iHtffJvNO/dTvfv9m+f9+uUq/vKWp/jDhpp2+/k//t/r/PNTlc3jC1/ILKtq1wHeO1wPwPOV7zZPf+SV6naXV9/QyK9WVtHY+P55kz0H6nhpU9t7Rdne2XmAN7btPWa7zvr8L1cy6yfPd/tyC+HRVdXsPVTHkfre+T3bjY1BfUPP9S0iqN59kG88sobL73iuub73UF07cx0ff/Pz5cy4/dkef9ye1iv2DCSVAD8BPgFUASskLYqI1wvbs+7XdHvsxWu3tzpp/KOskJl681La87d3vQTAv5s8mjFDT+LhtNvcv0TUNbz/gj3xhsfanP/Ln/hTfrDkzebxLz24iqfX7+AvJo7kG4+sAeCvpozjUxdMYPeBI3z+vpcz8/3Lq5xyUin7DtW3WuZHzhrFN684lxWbajl91BAigvtfeofFa7cf1e4L/+EsVm7exYEj9bxatQeAT/75B3hy7TaO1Dfy8+vK2VJ7gHEjBvGzf32Lb15xbibIag/yuY+dwcvv7GLoSf15Ys02IPNCOmHkYM469WQaGoIBpf1oiKC0nxBi0IASIPOC1q+fiAj2Ha6nsTEYNqg/ew7WMWxQfyRR39BIST+xc/8RhgwoZdCAElZX7eb0kYM50tDItj2HeL5yJ1NOH84Hxw+jf0k/SiQO1TfQT6Ixgnf3HQHg9FGDiQje3P4ej63+I5+/6CzqGzMvqE+u2cbitduZ95+n8MJbO/niA6u45JxT2/2b1zU00r+kH/sP1zOof2ad9h2q570j9YzLsaf4QuW7/MnoIQw9qZSv/WYN37z8zxg+eAC3//5NZpx3Go+squZLH/9Thg3q3zxPyz3Uxsbg7+9/hcde28pFZ5exsWY/T3/lItb+cQ9nnXoyg/qXUN+Y2d77Dtcz9KT+rfpR39BIXUM0/y2aHqcxoKSfqN598Kh1eHhlFf/t4dUAHKxraK5XZB2SXbm5limnj+CZN2vYe7COi885lcageV1y7Wk31Q8eaWBz7X7OOW0o9Q2NlJZk3htvenc/Jw8speyUga3mbXoOAWzdc5Cxwwa1+1gd0XLeiGDvoXqO1DcyfHB/+pf0zHt29YY7cUr6MPCtiJiRxm8EiIjv5ZqnvLw8KioqOv1YuV4czXqbwQNKOHCk4dgNu8GZZUPYUnuQI8fh3f+k0UOOOl/W9GLd8vwbkPONRr5GnzyQEYP7syF9cdXIIQOo3X/kmH3tjDPLhmQuPw/YmJZx1qknH3UVYrQYaGo3cdTg5hf9DS2+XGvyqSfT0m//4aMMLC1pVe8ISSsjorxlvVfsGQDjgOyzZlXAhS0bSZoLzAU4/fTTe6ZnZgVSPnEkz77Z/uHAbKX9RH1j197cnTbsJMYOG8Rz6dDh8MH92Z0OgZ7Uvx+H6roeEud9YOhRL7Blpwxk4qjB/H5d5lDqpeef1ryX98Fxw3hj2742X6jzcf64oQweUEI/ifXb9/GhCcNZmg7lTjl9OC+/sxuAM1qEgdT+h00njBzU/B0o55w2FJT5/pODdQ1s3XOIs8ek8wxZOw1Ng5LYsusAdQ3Bn40d2nwPtKGD+rNy867m9pPHtA4D0f3fuNVbwqBDImI+MB8yewZdWUbLE7hmdvz9+G8K3YPe6kOF7kCz3nICuRrIvnxifKqZmVkP6C1hsAKYLGmSpAHANcCiAvfJzKxo9IrDRBFRL+kLwGKgBFgQEWsL3C0zs6LRK8IAIC
IeBx4vdD/MzIpRbzlMZGZmBeQwMDMzh4GZmTkMzMyMXnI7iq6QVANs7uLso4F3j9mqOHnb5OZtk5u3TW69bdv8SUSUtSyesGGQD0kVbd2bw7xt2uNtk5u3TW4nyrbxYSIzM3MYmJlZ8YbB/EJ3oBfztsnN2yY3b5vcTohtU5TnDMzM7GjFumdgZmZZHAZmZlZcYSBppqT1kiol3VDo/vQUSW9Lek3SKkkVqTZS0hJJG9LvEakuSXekbbRa0pSs5cxO7TdIml2o9cmXpAWSdkhak1Xrtu0h6YK0vSvTvN3/tVTHSY5t8y1J1en5s0rSZVnTbkzruV7SjKx6m/9r6Tb1y1P9wXTL+l5P0gRJT0t6XdJaSV9M9b7zvImIovghc2vst4AzgAHAq8C5he5XD63728DoFrX/BdyQhm8Abk3DlwFPkPl2vmnA8lQfCWxMv0ek4RGFXrcubo+PAVOANcdjewAvpbZK815a6HXOc9t8C/hKG23PTf9HA4FJ6f+rpL3/NeAh4Jo0/DPg84Ve5w5ul7HAlDR8CvBmWv8+87wppj2DqUBlRGyMiCPAA8CsAvepkGYBC9PwQuDKrPo9kbEMGC5pLDADWBIRtRGxC1gCzOzpTneHiHgWqG1R7pbtkaYNjYhlkfkPvydrWb1ejm2TyyzggYg4HBGbgEoy/2dt/q+ld7oXAw+n+bO3c68WEVsj4uU0vA9YR+a72/vM86aYwmAcsCVrvCrVikEAv5O0UtLcVBsTEVvT8DZgTBrOtZ36+vbrru0xLg23rJ/ovpAOdyxoOhRC57fNKGB3RNS3qJ9QJE0k8+XFy+lDz5tiCoNi9tGImAJcClwv6WPZE9M7EV9jnHh7tDIPOBP4t8BW4AeF7U7hSDoZ+BXwpYjYmz3tRH/eFFMYVAMTssbHp1qfFxHV6fcO4DdkduO3p11T0u8dqXmu7dTXt193bY/qNNyyfsKKiO0R0RARjcDPyTx/oPPbZieZwyWlLeonBEn9yQTBfRHx61TuM8+bYgqDFcDkdDXDAOAaYFGB+3TcSRoi6ZSmYWA6sIbMujddyTAbeDQNLwKuS1dDTAP2pN3gxcB0SSPSYYLpqdZXdMv2SNP2SpqWjpFfl7WsE1LTi13yH8k8fyCzba6RNFDSJGAymZOgbf6vpXfOTwNXpfmzt3Ovlv6WdwHrIuKHWZP6zvOm0Gfpe/KHzBn+N8lc6fD1Qvenh9b5DDJXc7wKrG1abzLHb5cCG4DfAyNTXcBP0jZ6DSjPWtZnyZwkrAQ+U+h1y2Ob3E/mcEcdmWOzc7pzewDlZF4w3wJ+TPqk/4nwk2Pb3JvWfTWZF7mxWe2/ntZzPVlXv+T6X0vPx5fSNvsXYGCh17mD2+WjZA4BrQZWpZ/L+tLzxrejMDOzojpMZGZmOTgMzMzMYWBmZg4DMzPDYWBmZjgMzMwMh4GZmQH/HwrWTZ6TzjWpAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.plot(list(frequencies.values()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment