Skip to content

Instantly share code, notes, and snippets.

@numpde
Last active May 29, 2017 13:08
Show Gist options
  • Save numpde/832167bf22709305ef05f2bc5d80ba54 to your computer and use it in GitHub Desktop.
Save numpde/832167bf22709305ef05f2bc5d80ba54 to your computer and use it in GitHub Desktop.
Histograms for the number of types V(N) within the first N tokens
#!/usr/bin/python3
# Histograms for the number of types V(N) within the first N tokens.
# Comparison of natives vs learners.
# R. Andreev, 2017-05-11 (first version), CC BY 4.0
# Designed for the ANGLISH corpus [Tortel 2008, via N. Ballier & P. Lisson]
# The texts are expected to be located in ./ANGLISH/*.txt
import os, sys, argparse
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
from scipy import stats
from glob import glob
from random import shuffle, randrange, choice
from itertools import accumulate
# This function analyses a text file "filename",
# returning a pair (S, R), where
# S[n] is the n-th token,
# R[n] is 1 if the token appears for the first time.
# Thus
# n -> (n, accumulate(R)[n])
# is the vocabulary growth curve.
def vgc(filename):
    """Tokenise a text file and flag the first occurrence of every token.

    The lines of the file are joined with spaces, every character that is
    neither alphanumeric nor a space is dropped, and the result is split on
    whitespace.  Tokens are compared case-sensitively.

    Parameters:
        filename: path of the text file to analyse.

    Returns:
        A pair (S, R) of equally long lists, where S[n] is the n-th token
        and R[n] is 1 if S[n] does not occur among S[0], ..., S[n-1] and
        0 otherwise.  list(accumulate(R)) is the vocabulary growth curve.
    """
    # Collate lines, separating them by a space
    with open(filename, 'r') as f:
        S = ' '.join(f.readlines())

    # Remove non-text (this also drops the newline kept by readlines)
    S = "".join(c for c in S if (c.isalnum() or (c == ' ')))

    # Split into a list of words
    S = S.split()

    # Track the tokens seen so far in a set.  All earlier tokens take part
    # in the lookup, including the one directly before the current token,
    # so an immediate repetition is recognised as a repeated word.
    seen = set()
    R = []
    for w in S:
        R.append(int(w not in seen))
        seen.add(w)

    return (S, R)
# Compute the mean and the std dev of a list L
def mean_std(L):
    """Return the pair (mean, standard deviation) of the values in L.

    The standard deviation is computed with ddof=1, i.e. the sum of squared
    deviations is divided by len(L) - 1.
    """
    values = np.asarray(L)
    average = np.mean(values)
    spread = np.std(values, ddof=1)
    return (average, spread)
def _growth_curves(*patterns):
    """Return one vocabulary growth curve per file matching the glob patterns.

    Each curve is the running sum of the first-occurrence flags returned by
    vgc, so curve[N-1] is the number of types within the first N tokens.
    Patterns are processed in the order given.
    """
    curves = []
    for pattern in patterns:
        for path in glob(pattern):
            (_, R) = vgc(path)
            curves.append(list(accumulate(R)))
    return curves


def main():
    """Compare the distribution of V(N) between the FR and the GB texts.

    For each cut-off length N the values V(N) of all sufficiently long
    texts are printed, a two-sample Kolmogorov-Smirnov test is printed,
    and a normalised histogram with a fitted normal density per group is
    saved to "hist_N=<N>.png" and shown.
    """
    # FIRST SET OF TEXTS (plotted as "Learner")
    FR = _growth_curves("./ANGLISH/F*FR*.txt")

    # SECOND SET OF TEXTS (plotted as "Native")
    GB = _growth_curves("./ANGLISH/F*GB*.txt", "./ANGLISH/H*GB*.txt")

    # Histogram bins and evaluation grid for the fitted densities;
    # they do not depend on N
    bins = np.linspace(30, 170, 20)
    xx = np.linspace(min(bins), max(bins), 1000)

    # TEXT CUT-OFF LENGTH (TOKENS)
    for N in [50, 100, 150, 200, 250, 300]:
        # Compute V(N), using only texts with at least N tokens
        frN = [a[N-1] for a in FR if (len(a) >= N)]
        gbN = [a[N-1] for a in GB if (len(a) >= N)]

        print("N = {}".format(N))
        print("V(N) for FR:", frN)
        print("V(N) for GB:", gbN)

        # Neither the KS test nor the plot is meaningful without data
        if (not frN) or (not gbN):
            print("Skipped: a group has no text with at least {} tokens".format(N))
            print(" ")
            continue

        # https://stats.stackexchange.com/questions/13326/
        print(stats.ks_2samp(frN, gbN))

        # Fresh figure per N, so that histograms never pile up
        plt.figure()

        for (sample, colour, label) in [(frN, "red", "Learner"), (gbN, "blue", "Native")]:
            # density=True replaces the "normed" keyword removed from matplotlib
            plt.hist(sample, bins, alpha=0.3, color=colour, label=label, density=True)
            (m, s) = mean_std(sample)
            # stats.norm.pdf replaces mlab.normpdf, which was removed from matplotlib
            plt.plot(xx, stats.norm.pdf(xx, m, s), color=colour)

        plt.legend(loc='upper right')
        plt.title("V(N = {}); # FR texts = {}, # GB texts = {}".format(N, len(frN), len(gbN)))
        plt.xlim([min(bins), max(bins)])
        #plt.ylim([0, 1])

        f = "hist_N={}.png".format(N)
        plt.savefig(f, bbox_inches='tight')
        plt.show()
        plt.close()

        print(" ")
# Run the analysis only when this file is executed as a script
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment