# numpde/histo.py

## histo.py
#!/usr/bin/python3

# Histograms for the number of types V(N) within the first N tokens.
# Comparison of natives vs learners.

# R. Andreev, 2017-05-11 (first version), CC BY 4.0

# Designed for the ANGLISH corpus [Tortel 2008, via N. Ballier & P. Lisson]
# The texts are expected to be located in ./ANGLISH/*.txt

import os, sys, argparse
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np

from scipy import stats
from glob import glob
from random import shuffle, randrange, choice
from itertools import accumulate

def vgc(filename):
    """Tokenise a text file and flag the first occurrence of every token.

    The lines of the file are joined with single spaces, every character
    that is neither alphanumeric nor a space is dropped (so "don't" becomes
    "dont"), and the result is split on whitespace.  Token comparison is
    case-sensitive.

    Args:
        filename: Path of the text file to analyse.

    Returns:
        A pair (S, R) of equal-length lists, where S[n] is the n-th token
        and R[n] is 1 if S[n] does not occur among S[0], ..., S[n-1] and 0
        otherwise.  Thus n -> list(accumulate(R))[n] is the vocabulary
        growth curve.
    """
    # Collate lines, separating them by a space; the context manager closes
    # the file even if reading fails.
    with open(filename, 'r') as f:
        text = ' '.join(f.readlines())

    # Remove non-text, then split into a list of words
    S = "".join(c for c in text if (c.isalnum() or (c == ' '))).split()

    # Track the tokens met so far in a set: O(1) look-ups, and -- unlike a
    # slice that stops at n-1 -- it also covers the token right before the
    # current one, so an immediate repetition is not counted as new.
    seen = set()
    R = []
    for w in S:
        R.append(int(w not in seen))
        seen.add(w)

    return (S, R)

# Sample statistics of a list L: returns the pair (mean, std dev),
# where the std dev is computed with ddof=1.
def mean_std(L):
    values = np.asarray(L)
    centre = np.mean(values)
    spread = np.std(values, ddof=1)
    return (centre, spread)

def _growth_curves(*patterns):
    """Return the vocabulary growth curve of every file matching the glob patterns."""
    curves = []
    for pattern in patterns:
        for path in glob(pattern):
            (_, R) = vgc(path)
            curves.append(list(accumulate(R)))
    return curves


def main():
    """Compare V(N), the number of types among the first N tokens, for FR vs GB texts.

    For each cut-off N the function prints the V(N) samples of both groups
    and the two-sample Kolmogorov-Smirnov result, draws the two normalised
    histograms together with a normal density fitted to each sample, saves
    the figure as "hist_N=<N>.png" in the current directory and shows it.
    Texts shorter than N tokens are left out of the sample for that N.
    """
    # FIRST SET OF TEXTS (plotted as "Learner")
    # NOTE(review): only F* files are read for FR, whereas both F* and H*
    # files are read for GB -- confirm that this asymmetry is intended.
    FR = _growth_curves("./ANGLISH/F*FR*.txt")

    # SECOND SET OF TEXTS (plotted as "Native")
    GB = _growth_curves("./ANGLISH/F*GB*.txt", "./ANGLISH/H*GB*.txt")

    # Histogram bins and the abscissae of the fitted curves do not depend on N
    bins = np.linspace(30, 170, 20)
    xx = np.linspace(min(bins), max(bins), 1000)

    # TEXT CUT-OFF LENGTH (TOKENS)
    for N in [50, 100, 150, 200, 250, 300]:
        # Compute V(N)
        frN = [a[N-1] for a in FR if (len(a) >= N)]
        gbN = [a[N-1] for a in GB if (len(a) >= N)]
        print("N = {}".format(N))
        print("V(N) for FR:", frN)
        print("V(N) for GB:", gbN)

        # ks_2samp rejects empty samples, and there is nothing to plot
        if not frN or not gbN:
            print("Skipping N = {}: a group has no text with at least N tokens".format(N))
            print(" ")
            continue

        # https://stats.stackexchange.com/questions/13326/
        print(stats.ks_2samp(frN, gbN))

        # Start from a fresh figure so that the histograms of different N
        # never pile up on the same axes (plt.show() does not clear the
        # figure under non-interactive backends).
        plt.figure()

        for (sample, colour, label) in [(frN, "red", "Learner"), (gbN, "blue", "Native")]:
            # density=True replaces the "normed" keyword removed from Matplotlib
            plt.hist(sample, bins, alpha=0.3, color=colour, label=label, density=True)
            (m, s) = mean_std(sample)
            # stats.norm.pdf replaces mlab.normpdf, which was removed from Matplotlib
            plt.plot(xx, stats.norm.pdf(xx, m, s), color=colour)

        plt.legend(loc='upper right')
        plt.title("V(N = {}); # FR texts = {}, # GB texts = {}".format(N, len(frN), len(gbN)))

        plt.xlim([min(bins), max(bins)])
        #plt.ylim([0, 1])

        f = "hist_N={}.png".format(N)
        plt.savefig(f, bbox_inches='tight')

        plt.show()
        # Release the figure if it is still open after show() returns
        plt.close()

        print(" ")


# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()