allisonmorgan/acronym_count.txt

## acronym_count.txt
1959	2	80
1960	0	9
1961	0	26
1962	7	147
1963	1	13
1964	4	42
1965	4	75
1966	20	157
1967	37	279
1968	65	549
1969	58	456
1970	43	306
1971	73	631
1972	71	631
1973	106	1007
1974	177	1359
1975	171	1311
1976	272	1639
1977	234	1674
1978	257	1932
1979	319	2156
1980	417	2496
1981	527	2870
1982	580	3223
1983	645	3595
1984	865	4531
1985	806	4490
1986	1176	6139
1987	1055	6523
1988	1402	8250
1989	1653	9820
1990	1860	11719
1991	2272	12637
1992	2404	14395
1993	3075	18354
1994	3843	21959
1995	3853	21587
1996	4009	23786
1997	4899	26848
1998	5535	31607
1999	6668	35135
2000	7815	39933
2001	8589	42756
2002	10529	49407
2003	12651	60270
2004	16443	76917
2005	18539	88419
2006	21628	97463
2007	22791	106399
2008	24231	110326
2009	24679	116249
2010	25779	121596
2011	28523	132934
2012	30868	139126
2013	30765	142647
2014	31439	144993
2015	32187	147075
2016	32574	147512
2017	32049	145898
2018	16562	79072

## colon_count.txt
1959	0	80
1960	0	9
1961	0	26
1962	2	147
1963	1	13
1964	3	42
1965	3	75
1966	15	157
1967	35	279
1968	61	549
1969	64	456
1970	47	306
1971	81	631
1972	103	631
1973	95	1007
1974	125	1359
1975	128	1311
1976	154	1639
1977	185	1674
1978	212	1932
1979	183	2156
1980	274	2496
1981	349	2870
1982	380	3223
1983	443	3595
1984	527	4531
1985	619	4490
1986	870	6139
1987	909	6523
1988	1223	8250
1989	1356	9820
1990	1650	11719
1991	1769	12637
1992	2119	14395
1993	2733	18354
1994	3147	21959
1995	3190	21587
1996	3473	23786
1997	4233	26848
1998	4764	31607
1999	5271	35135
2000	5914	39933
2001	6400	42756
2002	7546	49407
2003	9149	60270
2004	11205	76917
2005	12707	88419
2006	13530	97463
2007	15103	106399
2008	15954	110326
2009	17106	116249
2010	18368	121596
2011	20879	132934
2012	22924	139126
2013	24604	142647
2014	26098	144993
2015	27197	147075
2016	28741	147512
2017	29275	145898
2018	18444	79072

## getplot.py

import plot_utils
import csv
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

def get_wilson_score_interval(num_successes, num_failures, alpha=0.05):
    """Return the Wilson score confidence interval for a success probability.

    Parameters
    ----------
    num_successes : number of trials counted as successes.
    num_failures : number of trials counted as failures.
    alpha : significance level; the default 0.05 yields a 95% interval.

    Returns
    -------
    (lower, upper) : bounds on p, the success probability. When there are
        no trials at all the formula reduces to the interval (0, 1).
    """
    # Two-sided critical value of the standard normal distribution.
    z = stats.norm.ppf(q=1-alpha/2.)
    n = num_successes + num_failures

    # ns*nf/n cannot be evaluated for n == 0; both counts are zero in that
    # case, so the term is treated as zero. float() keeps the division
    # exact even where int / int would truncate (Python 2).
    if n:
        spread = (num_successes * num_failures) / float(n)
    else:
        spread = 0.0

    # Both bounds share the same centre and half-width; compute them once.
    center = (num_successes + .5*z**2) / (n + z**2)
    margin = z / (n + z**2) * np.sqrt(spread + .25*z**2)

    return (center - margin, center + margin)

# For every punctuation feature, plot the yearly fraction of titles that
# contain it (1975 onwards) together with a shaded Wilson score interval,
# then save the figure as frequency.pdf.
fig, ax = plt.subplots(figsize=(12, 6))
files = ['colon_count.txt', 'hyphen_count.txt', 'question_count.txt', 'acronym_count.txt', "quote_count.txt"] #'acronym_colon_count.txt']
labels = ['colon', 'hyphen', 'question', 'acronym', 'quote']
colors = ['#539CAF', '#49586E', '#8A696E', '#B06B5D', '#BAA657'] #'#748BD1',
for path, label, color in zip(files, labels, colors):

    # Each non-empty line is "year<TAB>matches<TAB>total"; blank lines
    # (e.g. a trailing one) are skipped instead of crashing int().
    with open(path, 'r') as tsv:
        data = [line.strip().split('\t') for line in tsv if line.strip()]

    x = []
    y = []
    lower_CI = []
    upper_CI = []
    for row in data:
        year = int(row[0])
        if year >= 1975:
            matches = int(row[1])
            total = int(row[2])
            x.append(year)
            y.append(float(matches)/float(total))

            # Titles without the feature are the "failures".
            (low, high) = get_wilson_score_interval(matches, total - matches)
            lower_CI.append(low)
            upper_CI.append(high)

    ax.scatter(x, y, color=color, label=label)
    ax.fill_between(x, lower_CI, upper_CI, color=color, alpha=0.4)

ax.set_xticks(range(1975, 2020, 5))
ax.legend(fontsize=plot_utils.LEGEND_SIZE, frameon=False, ncol=2)
ax.set_ylabel('Frequency')
ax.set_xlabel('Year')
ax.set_title('Relative Counts of Punctuation (Source: DBLP 1975--Present)')
plot_utils.finalize(ax)

plt.tight_layout()
plt.savefig('frequency.pdf', bbox_inches='tight', format='pdf', dpi=1000)
plt.clf()

## hyphen_count.txt
1959	1	80
1960	0	9
1961	0	26
1962	7	147
1963	0	13
1964	0	42
1965	1	75
1966	1	157
1967	3	279
1968	12	549
1969	2	456
1970	6	306
1971	10	631
1972	18	631
1973	28	1007
1974	56	1359
1975	45	1311
1976	60	1639
1977	68	1674
1978	78	1932
1979	73	2156
1980	108	2496
1981	104	2870
1982	108	3223
1983	153	3595
1984	183	4531
1985	138	4490
1986	258	6139
1987	211	6523
1988	256	8250
1989	268	9820
1990	287	11719
1991	325	12637
1992	367	14395
1993	502	18354
1994	546	21959
1995	423	21587
1996	486	23786
1997	547	26848
1998	736	31607
1999	883	35135
2000	842	39933
2001	1003	42756
2002	1074	49407
2003	1273	60270
2004	1693	76917
2005	1821	88419
2006	2152	97463
2007	2242	106399
2008	2560	110326
2009	2676	116249
2010	2942	121596
2011	3514	132934
2012	3375	139126
2013	3462	142647
2014	3565	144993
2015	3450	147075
2016	2950	147512
2017	2846	145898
2018	1639	79072

## parse.go
package main

import (
    "compress/gzip"
    "encoding/xml"
    "fmt"
    "golang.org/x/net/html/charset"
    "log"
    "os"
    "strings"
    //"regexp"
    "sort"
)

// Targets that the DBLP XML data is unmarshalled into.
type (
	// InProceedings gathers every "inproceedings" child element of the
	// decoded element; no other publication kind is unpacked.
	InProceedings struct {
		Papers []Paper `xml:"inproceedings"`
	}

	// Paper carries the common features of a publication record. It is
	// not a complete set of attributes.
	Paper struct {
		Key       string   `xml:"key,attr"`
		Authors   []string `xml:"author"`
		Title     string   `xml:"title"`
		Year      int      `xml:"year"`
		Booktitle string   `xml:"booktitle"`
		EE        string   `xml:"ee"`
		Crossref  string   `xml:"crossref"`
		Url       string   `xml:"url"`
	}
)

// decoder.Decode() errors on special characters, so the tables below are
// used to convert foreign language characters: every base letter is paired
// with every suffix to form the name of a character to replace.
var (
	characters = []string{"a", "e", "o", "u", "i", "c", "b", "s", "y", "n", "ae"}
	suffixes   = []string{"acute", "uml", "ring", "zlig", "slash", "cedil", "grave", "circ", "tilde", "lig"}
)

// build_special_character_list returns a freshly built map from special
// character name to its plain replacement. Both the lower-case and the
// upper-case spelling of a base letter map to the lower-case letter.
func build_special_character_list() map[string]string {
	// Start from the one-off names that follow no letter+suffix pattern.
	table := map[string]string{
		"times": "*",
		"reg":   "reg",
		"eth":   "d",
		"ETH":   "d",
		"micro": "u",
		"thorn": "p",
		"THORN": "p",
	}

	for _, base := range characters {
		upper := strings.ToUpper(base)
		for _, suffix := range suffixes {
			table[base+suffix] = base
			table[upper+suffix] = base
		}
	}

	return table
}

// main decodes the gzipped DBLP dump, counts per year how many
// inproceedings titles contain a double quote, and writes one
// "year<TAB>matches<TAB>total" line per year, in ascending year order,
// to quote_count.txt.
func main() {
	const outPath = "quote_count.txt"

	// Download data from http://dblp.uni-trier.de/xml/,
	// change the file path below to point to it
	log.Println("Reading in DBLP gzipped file")
	in, err := os.Open("dblp.xml.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	// Golang can read straight from gzipped files!
	gzf, err := gzip.NewReader(in)
	if err != nil {
		log.Fatal(err)
	}
	defer gzf.Close()

	// Start a new XML decoder instance and ask it to
	// find the encoding specified in the file's header.
	// Specify the special character replacements.
	decoder := xml.NewDecoder(gzf)
	decoder.CharsetReader = charset.NewReaderLabel
	decoder.Entity = build_special_character_list()

	// This step takes a while (~2 minutes)
	log.Println("Decoding all proceedings from file")
	var papers InProceedings
	if err := decoder.Decode(&papers); err != nil {
		log.Fatal(err)
	}
	log.Println("Done decoding")
	// log.Printf("Example paper: %+v\n", papers.Papers[0])

	// counter holds matching titles per year, total holds all titles.
	counter := make(map[int]int)
	total := make(map[int]int)
	for _, paper := range papers.Papers {
		// Alternative predicates used for the other *_count.txt files:
		// matched, _ := regexp.MatchString(".*[A-Z]{3}.*", paper.Title)
		// matched := strings.Contains(paper.Title, ":")
		// matched := strings.Contains(paper.Title, " - ")
		// matched := strings.Contains(paper.Title, "?")
		if strings.Contains(paper.Title, "\"") {
			counter[paper.Year]++
		}
		total[paper.Year]++
	}
	log.Println(counter)
	log.Println(total)
	// log.Printf("Number of %s titles: %v\tAverage title length: %v\n", strings.ToUpper(conference), len(titles), float64(avg_length)/float64(len(titles)))

	// Write the per-year counts to a new text file
	out, err := os.Create(outPath)
	if err != nil {
		log.Fatal(err)
	}

	// Map iteration order is random, so sort the years for stable output.
	years := make([]int, 0, len(total))
	for year := range total {
		years = append(years, year)
	}
	sort.Ints(years)
	log.Println(years)

	for _, year := range years {
		if _, err := fmt.Fprintf(out, "%d\t%d\t%d\n", year, counter[year], total[year]); err != nil {
			log.Fatal(err)
		}
	}

	// Close explicitly rather than via defer: a failed Close can mean the
	// data never reached disk, and a deferred Close would hide that error.
	if err := out.Close(); err != nil {
		log.Fatal(err)
	}
	log.Printf("Output data to %s", outPath)
}

## question_count.txt
1959	1	80
1960	0	9
1961	0	26
1962	0	147
1963	0	13
1964	0	42
1965	0	75
1966	0	157
1967	3	279
1968	2	549
1969	0	456
1970	1	306
1971	6	631
1972	4	631
1973	7	1007
1974	9	1359
1975	15	1311
1976	11	1639
1977	16	1674
1978	25	1932
1979	17	2156
1980	36	2496
1981	30	2870
1982	29	3223
1983	41	3595
1984	56	4531
1985	63	4490
1986	85	6139
1987	82	6523
1988	107	8250
1989	140	9820
1990	130	11719
1991	150	12637
1992	184	14395
1993	207	18354
1994	296	21959
1995	263	21587
1996	286	23786
1997	331	26848
1998	377	31607
1999	431	35135
2000	472	39933
2001	494	42756
2002	645	49407
2003	709	60270
2004	839	76917
2005	910	88419
2006	936	97463
2007	1050	106399
2008	1223	110326
2009	1327	116249
2010	1330	121596
2011	1587	132934
2012	1745	139126
2013	1974	142647
2014	2032	144993
2015	2166	147075
2016	2223	147512
2017	2340	145898
2018	1363	79072

## quote_count.txt
1959	0	80
1960	0	9
1961	0	26
1962	1	147
1963	0	13
1964	1	42
1965	2	75
1966	1	157
1967	5	279
1968	2	549
1969	2	456
1970	2	306
1971	7	631
1972	12	631
1973	7	1007
1974	11	1359
1975	17	1311
1976	6	1639
1977	14	1674
1978	29	1932
1979	14	2156
1980	32	2496
1981	26	2870
1982	36	3223
1983	22	3595
1984	25	4531
1985	43	4490
1986	46	6139
1987	47	6523
1988	46	8250
1989	66	9820
1990	58	11719
1991	87	12637
1992	91	14395
1993	95	18354
1994	132	21959
1995	113	21587
1996	115	23786
1997	150	26848
1998	162	31607
1999	228	35135
2000	164	39933
2001	209	42756
2002	237	49407
2003	266	60270
2004	351	76917
2005	357	88419
2006	362	97463
2007	418	106399
2008	482	110326
2009	430	116249
2010	471	121596
2011	546	132934
2012	552	139126
2013	607	142647
2014	602	144993
2015	642	147075
2016	672	147512
2017	593	145898
2018	343	79072
	1959 2 80
	1960 0 9
	1961 0 26
	1962 7 147
	1963 1 13
	1964 4 42
	1965 4 75
	1966 20 157
	1967 37 279
	1968 65 549
	1969 58 456
	1970 43 306
	1971 73 631
	1972 71 631
	1973 106 1007
	1974 177 1359
	1975 171 1311
	1976 272 1639
	1977 234 1674
	1978 257 1932
	1979 319 2156
	1980 417 2496
	1981 527 2870
	1982 580 3223
	1983 645 3595
	1984 865 4531
	1985 806 4490
	1986 1176 6139
	1987 1055 6523
	1988 1402 8250
	1989 1653 9820
	1990 1860 11719
	1991 2272 12637
	1992 2404 14395
	1993 3075 18354
	1994 3843 21959
	1995 3853 21587
	1996 4009 23786
	1997 4899 26848
	1998 5535 31607
	1999 6668 35135
	2000 7815 39933
	2001 8589 42756
	2002 10529 49407
	2003 12651 60270
	2004 16443 76917
	2005 18539 88419
	2006 21628 97463
	2007 22791 106399
	2008 24231 110326
	2009 24679 116249
	2010 25779 121596
	2011 28523 132934
	2012 30868 139126
	2013 30765 142647
	2014 31439 144993
	2015 32187 147075
	2016 32574 147512
	2017 32049 145898
	2018 16562 79072
	1959 0 80
	1960 0 9
	1961 0 26
	1962 2 147
	1963 1 13
	1964 3 42
	1965 3 75
	1966 15 157
	1967 35 279
	1968 61 549
	1969 64 456
	1970 47 306
	1971 81 631
	1972 103 631
	1973 95 1007
	1974 125 1359
	1975 128 1311
	1976 154 1639
	1977 185 1674
	1978 212 1932
	1979 183 2156
	1980 274 2496
	1981 349 2870
	1982 380 3223
	1983 443 3595
	1984 527 4531
	1985 619 4490
	1986 870 6139
	1987 909 6523
	1988 1223 8250
	1989 1356 9820
	1990 1650 11719
	1991 1769 12637
	1992 2119 14395
	1993 2733 18354
	1994 3147 21959
	1995 3190 21587
	1996 3473 23786
	1997 4233 26848
	1998 4764 31607
	1999 5271 35135
	2000 5914 39933
	2001 6400 42756
	2002 7546 49407
	2003 9149 60270
	2004 11205 76917
	2005 12707 88419
	2006 13530 97463
	2007 15103 106399
	2008 15954 110326
	2009 17106 116249
	2010 18368 121596
	2011 20879 132934
	2012 22924 139126
	2013 24604 142647
	2014 26098 144993
	2015 27197 147075
	2016 28741 147512
	2017 29275 145898
	2018 18444 79072

	import plot_utils
	import csv
	import matplotlib.pyplot as plt
	import numpy as np
	from scipy import stats

	def get_wilson_score_interval(num_successes, num_failures, alpha=0.05):
	    """Return the Wilson score confidence interval for a success probability.

	    Parameters
	    ----------
	    num_successes : number of trials counted as successes.
	    num_failures : number of trials counted as failures.
	    alpha : significance level; the default 0.05 yields a 95% interval.

	    Returns
	    -------
	    (lower, upper) : bounds on p, the success probability. When there are
	        no trials at all the formula reduces to the interval (0, 1).
	    """
	    # Two-sided critical value of the standard normal distribution.
	    z = stats.norm.ppf(q=1-alpha/2.)
	    n = num_successes + num_failures

	    # ns*nf/n cannot be evaluated for n == 0; both counts are zero in that
	    # case, so the term is treated as zero. float() keeps the division
	    # exact even where int / int would truncate (Python 2).
	    if n:
	        spread = (num_successes * num_failures) / float(n)
	    else:
	        spread = 0.0

	    # Both bounds share the same centre and half-width; compute them once.
	    center = (num_successes + .5*z**2) / (n + z**2)
	    margin = z / (n + z**2) * np.sqrt(spread + .25*z**2)

	    return (center - margin, center + margin)

	# For every punctuation feature, plot the yearly fraction of titles that
	# contain it (1975 onwards) together with a shaded Wilson score interval,
	# then save the figure as frequency.pdf.
	fig, ax = plt.subplots(figsize=(12, 6))
	files = ['colon_count.txt', 'hyphen_count.txt', 'question_count.txt', 'acronym_count.txt', "quote_count.txt"] #'acronym_colon_count.txt']
	labels = ['colon', 'hyphen', 'question', 'acronym', 'quote']
	colors = ['#539CAF', '#49586E', '#8A696E', '#B06B5D', '#BAA657'] #'#748BD1',
	for path, label, color in zip(files, labels, colors):

	    # Each non-empty line is "year<TAB>matches<TAB>total"; blank lines
	    # (e.g. a trailing one) are skipped instead of crashing int().
	    with open(path, 'r') as tsv:
	        data = [line.strip().split('\t') for line in tsv if line.strip()]

	    x = []
	    y = []
	    lower_CI = []
	    upper_CI = []
	    for row in data:
	        year = int(row[0])
	        if year >= 1975:
	            matches = int(row[1])
	            total = int(row[2])
	            x.append(year)
	            y.append(float(matches)/float(total))

	            # Titles without the feature are the "failures".
	            (low, high) = get_wilson_score_interval(matches, total - matches)
	            lower_CI.append(low)
	            upper_CI.append(high)

	    ax.scatter(x, y, color=color, label=label)
	    ax.fill_between(x, lower_CI, upper_CI, color=color, alpha=0.4)

	ax.set_xticks(range(1975, 2020, 5))
	ax.legend(fontsize=plot_utils.LEGEND_SIZE, frameon=False, ncol=2)
	ax.set_ylabel('Frequency')
	ax.set_xlabel('Year')
	ax.set_title('Relative Counts of Punctuation (Source: DBLP 1975--Present)')
	plot_utils.finalize(ax)

	plt.tight_layout()
	plt.savefig('frequency.pdf', bbox_inches='tight', format='pdf', dpi=1000)
	plt.clf()
	1959 1 80
	1960 0 9
	1961 0 26
	1962 7 147
	1963 0 13
	1964 0 42
	1965 1 75
	1966 1 157
	1967 3 279
	1968 12 549
	1969 2 456
	1970 6 306
	1971 10 631
	1972 18 631
	1973 28 1007
	1974 56 1359
	1975 45 1311
	1976 60 1639
	1977 68 1674
	1978 78 1932
	1979 73 2156
	1980 108 2496
	1981 104 2870
	1982 108 3223
	1983 153 3595
	1984 183 4531
	1985 138 4490
	1986 258 6139
	1987 211 6523
	1988 256 8250
	1989 268 9820
	1990 287 11719
	1991 325 12637
	1992 367 14395
	1993 502 18354
	1994 546 21959
	1995 423 21587
	1996 486 23786
	1997 547 26848
	1998 736 31607
	1999 883 35135
	2000 842 39933
	2001 1003 42756
	2002 1074 49407
	2003 1273 60270
	2004 1693 76917
	2005 1821 88419
	2006 2152 97463
	2007 2242 106399
	2008 2560 110326
	2009 2676 116249
	2010 2942 121596
	2011 3514 132934
	2012 3375 139126
	2013 3462 142647
	2014 3565 144993
	2015 3450 147075
	2016 2950 147512
	2017 2846 145898
	2018 1639 79072
	package main

	import (
	"compress/gzip"
	"encoding/xml"
	"fmt"
	"golang.org/x/net/html/charset"
	"log"
	"os"
	"strings"
	//"regexp"
	"sort"
	)

	// Targets that the DBLP XML data is unmarshalled into.
	type (
		// InProceedings gathers every "inproceedings" child element of the
		// decoded element; no other publication kind is unpacked.
		InProceedings struct {
			Papers []Paper `xml:"inproceedings"`
		}

		// Paper carries the common features of a publication record. It is
		// not a complete set of attributes.
		Paper struct {
			Key       string   `xml:"key,attr"`
			Authors   []string `xml:"author"`
			Title     string   `xml:"title"`
			Year      int      `xml:"year"`
			Booktitle string   `xml:"booktitle"`
			EE        string   `xml:"ee"`
			Crossref  string   `xml:"crossref"`
			Url       string   `xml:"url"`
		}
	)

	// decoder.Decode() errors on special characters, so the tables below are
	// used to convert foreign language characters: every base letter is paired
	// with every suffix to form the name of a character to replace.
	var (
		characters = []string{"a", "e", "o", "u", "i", "c", "b", "s", "y", "n", "ae"}
		suffixes   = []string{"acute", "uml", "ring", "zlig", "slash", "cedil", "grave", "circ", "tilde", "lig"}
	)

	// build_special_character_list returns a freshly built map from special
	// character name to its plain replacement. Both the lower-case and the
	// upper-case spelling of a base letter map to the lower-case letter.
	func build_special_character_list() map[string]string {
		// Start from the one-off names that follow no letter+suffix pattern.
		table := map[string]string{
			"times": "*",
			"reg":   "reg",
			"eth":   "d",
			"ETH":   "d",
			"micro": "u",
			"thorn": "p",
			"THORN": "p",
		}

		for _, base := range characters {
			upper := strings.ToUpper(base)
			for _, suffix := range suffixes {
				table[base+suffix] = base
				table[upper+suffix] = base
			}
		}

		return table
	}

	// main decodes the gzipped DBLP dump, counts per year how many
	// inproceedings titles contain a double quote, and writes one
	// "year<TAB>matches<TAB>total" line per year, in ascending year order,
	// to quote_count.txt.
	func main() {
		const outPath = "quote_count.txt"

		// Download data from http://dblp.uni-trier.de/xml/,
		// change the file path below to point to it
		log.Println("Reading in DBLP gzipped file")
		in, err := os.Open("dblp.xml.gz")
		if err != nil {
			log.Fatal(err)
		}
		defer in.Close()

		// Golang can read straight from gzipped files!
		gzf, err := gzip.NewReader(in)
		if err != nil {
			log.Fatal(err)
		}
		defer gzf.Close()

		// Start a new XML decoder instance and ask it to
		// find the encoding specified in the file's header.
		// Specify the special character replacements.
		decoder := xml.NewDecoder(gzf)
		decoder.CharsetReader = charset.NewReaderLabel
		decoder.Entity = build_special_character_list()

		// This step takes a while (~2 minutes)
		log.Println("Decoding all proceedings from file")
		var papers InProceedings
		if err := decoder.Decode(&papers); err != nil {
			log.Fatal(err)
		}
		log.Println("Done decoding")
		// log.Printf("Example paper: %+v\n", papers.Papers[0])

		// counter holds matching titles per year, total holds all titles.
		counter := make(map[int]int)
		total := make(map[int]int)
		for _, paper := range papers.Papers {
			// Alternative predicates used for the other *_count.txt files:
			// matched, _ := regexp.MatchString(".*[A-Z]{3}.*", paper.Title)
			// matched := strings.Contains(paper.Title, ":")
			// matched := strings.Contains(paper.Title, " - ")
			// matched := strings.Contains(paper.Title, "?")
			if strings.Contains(paper.Title, "\"") {
				counter[paper.Year]++
			}
			total[paper.Year]++
		}
		log.Println(counter)
		log.Println(total)
		// log.Printf("Number of %s titles: %v\tAverage title length: %v\n", strings.ToUpper(conference), len(titles), float64(avg_length)/float64(len(titles)))

		// Write the per-year counts to a new text file
		out, err := os.Create(outPath)
		if err != nil {
			log.Fatal(err)
		}

		// Map iteration order is random, so sort the years for stable output.
		years := make([]int, 0, len(total))
		for year := range total {
			years = append(years, year)
		}
		sort.Ints(years)
		log.Println(years)

		for _, year := range years {
			if _, err := fmt.Fprintf(out, "%d\t%d\t%d\n", year, counter[year], total[year]); err != nil {
				log.Fatal(err)
			}
		}

		// Close explicitly rather than via defer: a failed Close can mean the
		// data never reached disk, and a deferred Close would hide that error.
		if err := out.Close(); err != nil {
			log.Fatal(err)
		}
		log.Printf("Output data to %s", outPath)
	}